[PATCH] aarch64: Check the ldp/stp policy model correctly when mem ops are reversed.

2024-01-17 Thread Manos Anagnostakis
The current ldp/stp policy framework implementation was missing cases, where
the memory operands were reversed. Therefore the call to the framework function
is moved after the lower mem check with the suitable parameters. Also removes
the mode of aarch64_operands_ok_for_ldpstp, which becomes unused and triggers
a warning on bootstrap.

gcc/ChangeLog:

* config/aarch64/aarch64-ldpstp.md: Remove unused mode.
* config/aarch64/aarch64-protos.h (aarch64_operands_ok_for_ldpstp):
Likewise.
* config/aarch64/aarch64.cc (aarch64_operands_ok_for_ldpstp):
Call on framework moved later.

Signed-off-by: Manos Anagnostakis 
Co-Authored-By: Manolis Tsamis 
---
 gcc/config/aarch64/aarch64-ldpstp.md | 22 +++---
 gcc/config/aarch64/aarch64-protos.h  |  2 +-
 gcc/config/aarch64/aarch64.cc| 18 +-
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldpstp.md 
b/gcc/config/aarch64/aarch64-ldpstp.md
index b668fa8e2a6..b7c0bf05cd1 100644
--- a/gcc/config/aarch64/aarch64-ldpstp.md
+++ b/gcc/config/aarch64/aarch64-ldpstp.md
@@ -23,7 +23,7 @@
(match_operand:GPI 1 "memory_operand" ""))
(set (match_operand:GPI 2 "register_operand" "")
(match_operand:GPI 3 "memory_operand" ""))]
-  "aarch64_operands_ok_for_ldpstp (operands, true, mode)"
+  "aarch64_operands_ok_for_ldpstp (operands, true)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, true);
@@ -35,7 +35,7 @@
(match_operand:GPI 1 "aarch64_reg_or_zero" ""))
(set (match_operand:GPI 2 "memory_operand" "")
(match_operand:GPI 3 "aarch64_reg_or_zero" ""))]
-  "aarch64_operands_ok_for_ldpstp (operands, false, mode)"
+  "aarch64_operands_ok_for_ldpstp (operands, false)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, false);
@@ -47,7 +47,7 @@
(match_operand:GPF 1 "memory_operand" ""))
(set (match_operand:GPF 2 "register_operand" "")
(match_operand:GPF 3 "memory_operand" ""))]
-  "aarch64_operands_ok_for_ldpstp (operands, true, mode)"
+  "aarch64_operands_ok_for_ldpstp (operands, true)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, true);
@@ -59,7 +59,7 @@
(match_operand:GPF 1 "aarch64_reg_or_fp_zero" ""))
(set (match_operand:GPF 2 "memory_operand" "")
(match_operand:GPF 3 "aarch64_reg_or_fp_zero" ""))]
-  "aarch64_operands_ok_for_ldpstp (operands, false, mode)"
+  "aarch64_operands_ok_for_ldpstp (operands, false)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, false);
@@ -71,7 +71,7 @@
(match_operand:DREG 1 "memory_operand" ""))
(set (match_operand:DREG2 2 "register_operand" "")
(match_operand:DREG2 3 "memory_operand" ""))]
-  "aarch64_operands_ok_for_ldpstp (operands, true, mode)"
+  "aarch64_operands_ok_for_ldpstp (operands, true)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, true);
@@ -83,7 +83,7 @@
(match_operand:DREG 1 "register_operand" ""))
(set (match_operand:DREG2 2 "memory_operand" "")
(match_operand:DREG2 3 "register_operand" ""))]
-  "aarch64_operands_ok_for_ldpstp (operands, false, mode)"
+  "aarch64_operands_ok_for_ldpstp (operands, false)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, false);
@@ -96,7 +96,7 @@
(set (match_operand:VQ2 2 "register_operand" "")
(match_operand:VQ2 3 "memory_operand" ""))]
   "TARGET_FLOAT
-   && aarch64_operands_ok_for_ldpstp (operands, true, mode)
+   && aarch64_operands_ok_for_ldpstp (operands, true)
&& (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
   [(const_int 0)]
@@ -111,7 +111,7 @@
(set (match_operand:VQ2 2 "memory_operand" "")
(match_operand:VQ2 3 "register_operand" ""))]
   "TARGET_FLOAT
-   && aarch64_operands_ok_for_ldpstp (operands, false, mode)
+   && aarch64_operands_ok_for_ldpstp (operands, false)
&& (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
   [(const_int 0)]
@@ -128,7 +128,7 @@
(sign_extend:DI (match_operand:SI 1 "memory_operand" "")))
(set (match_operand:DI 2 "register_operand" "")
(sign_extend:DI (match_operand:SI 3 "memory_operand" "&

Re: [PATCH v6] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-08 Thread Manos Anagnostakis
So is it OK for trunk as is in v6 with the generic changes added in GCC-15?

Manos.

Στις Πέμ 7 Δεκ 2023, 16:10 ο χρήστης Richard Biener <
richard.guent...@gmail.com> έγραψε:

> On Thu, Dec 7, 2023 at 1:20 PM Richard Sandiford
>  wrote:
> >
> > Richard Biener  writes:
> > > On Wed, Dec 6, 2023 at 7:44 PM Philipp Tomsich <
> philipp.toms...@vrull.eu> wrote:
> > >>
> > >> On Wed, 6 Dec 2023 at 23:32, Richard Biener <
> richard.guent...@gmail.com> wrote:
> > >> >
> > >> > On Wed, Dec 6, 2023 at 2:48 PM Manos Anagnostakis
> > >> >  wrote:
> > >> > >
> > >> > > This is an RTL pass that detects store forwarding from stores to
> larger loads (load pairs).
> > >> > >
> > >> > > This optimization is SPEC2017-driven and was found to be
> beneficial for some benchmarks,
> > >> > > through testing on ampere1/ampere1a machines.
> > >> > >
> > >> > > For example, it can transform cases like
> > >> > >
> > >> > > str  d5, [sp, #320]
> > >> > > fmul d5, d31, d29
> > >> > > ldp  d31, d17, [sp, #312] # Large load from small store
> > >> > >
> > >> > > to
> > >> > >
> > >> > > str  d5, [sp, #320]
> > >> > > fmul d5, d31, d29
> > >> > > ldr  d31, [sp, #312]
> > >> > > ldr  d17, [sp, #320]
> > >> > >
> > >> > > Currently, the pass is disabled by default on all architectures
> and enabled by a target-specific option.
> > >> > >
> > >> > > If deemed beneficial enough for a default, it will be enabled on
> ampere1/ampere1a,
> > >> > > or other architectures as well, without needing to be turned on
> by this option.
> > >> >
> > >> > What is aarch64-specific about the pass?
> > >> >
> > >> > I see an increasingly large number of target specific passes pop up
> (probably
> > >> > for the excuse we can generalize them if necessary).  But GCC isn't
> LLVM
> > >> > and this feels like getting out of hand?
> > >>
> > >> We had an OK from Richard Sandiford on the earlier (v5) version with
> > >> v6 just fixing an obvious bug... so I was about to merge this earlier
> > >> just when you commented.
> > >>
> > >> Given that this had months of test exposure on our end, I would prefer
> > >> to move this forward for GCC14 in its current form.
> > >> The project of replacing architecture-specific store-forwarding passes
> > >> with a generalized infrastructure could then be addressed in the GCC15
> > >> timeframe (or beyond)?
> > >
> > > It's up to target maintainers, I just picked this pass (randomly) to
> make this
> > > comment (of course also knowing that STLF fails are a common issue on
> > > pipelined uarchs).
> >
> > I agree there's scope for making some of this target-independent.
> >
> > One vague thing I've been wondering about is whether, for some passes
> > like these, we should use inheritance rather than target hooks.  So in
> > this case, the target-independent code would provide a framework for
> > iterating over the function and testing for forwarding, but the target
> > would ultimately decide what to do with that information.  This would
> > also make it easier for targets to add genuinely target-specific
> > information to the bookkeeping structures.
> >
> > In case it sounds otherwise, that's supposed to be more than
> > just a structural C++-vs-C thing.  The idea is that we'd have
> > a pass for "resolving store forwarding-related problems",
> > but the specific goals would be mostly (or at least partially)
> > target-specific rather than target-independent.
>
> In some cases we've used target hooks for this, in this case it might
> work as well.
>
> > I'd wondered the same thing about the early-ra pass that we're
> > adding for SME.  Some of the framework could be generalised and
> > made target-independent, but the main purpose of the pass (using
> > strided registers with certain patterns and constraints) is highly
> > target-specific.
>
> .. not sure about this one though.
>
> Richard.
>
> > Thanks,
> > Richard
>


Re: [PATCH v6] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-07 Thread Manos Anagnostakis
Στις Πέμ 7 Δεκ 2023, 09:39 ο χρήστης Richard Biener <
richard.guent...@gmail.com> έγραψε:

> On Wed, Dec 6, 2023 at 6:15 PM Manos Anagnostakis
>  wrote:
> >
> > Hi again,
> >
> > I went and tested the requested changes and found out the following:
> >
> > 1. The pass is currently increasing insn_cnt on a NONJUMP_INSN_P, which
> is a subset of NONDEBUG_INSN_P. I think there is no problem with depending
> on -g with the current version. Do you see something I don't or did you
> mean something else?
>
> It just occurred to me - thanks for double-checking (it wasn't obvious
> to me NONJUMP_INSN_P doesn't include DEBUG_INSNs ...).
>
> > 2. Not processing all instructions is not letting cselib record all the
> effects they have, thus it does not have updated information to find true
> forwardings at any given time. I can confirm this since I am witnessing
> many unexpected changes on the number of handled cases if I do this only
> for loads/stores.
>
> Ah, OK.  I guess I don't fully get it, it seems you use cselib to
> compare addresses and while possibly not
> processing part of the address computation might break this other
> stmts inbetween should be uninteresting
> (at least I don't see you handling intermediate may-aliasing [small]
> stores to disable the splitting).
>
> So in the end it's a compile-time trade-off between relying solely on
> cselib or trying to optimize
> cselib use with DF for address computes?
>
> Richard.
>
I am not really familiar with the DF technique, but what we did is the
following:

At first we were using a combination of alias analysis with cselib, which
while trying to avoid false positives that had their memory-register
address changed inbetween, was rejecting part of stores from being
considered as candidates for a forwarding to an ldp.

Thus in the end using only the cselib with the correct lookups was the way
to address both the register difference and/or the intermediate change on
the address value.

>
> > Thanks in advance and please let me know your thoughts on the above.
> > Manos.
> >
> > On Wed, Dec 6, 2023 at 5:10 PM Manos Anagnostakis <
> manos.anagnosta...@vrull.eu> wrote:
> >>
> >> Hi Richard,
> >>
> >> thanks for the useful comments.
> >>
> >> On Wed, Dec 6, 2023 at 4:32 PM Richard Biener <
> richard.guent...@gmail.com> wrote:
> >>>
> >>> On Wed, Dec 6, 2023 at 2:48 PM Manos Anagnostakis
> >>>  wrote:
> >>> >
> >>> > This is an RTL pass that detects store forwarding from stores to
> larger loads (load pairs).
> >>> >
> >>> > This optimization is SPEC2017-driven and was found to be beneficial
> for some benchmarks,
> >>> > through testing on ampere1/ampere1a machines.
> >>> >
> >>> > For example, it can transform cases like
> >>> >
> >>> > str  d5, [sp, #320]
> >>> > fmul d5, d31, d29
> >>> > ldp  d31, d17, [sp, #312] # Large load from small store
> >>> >
> >>> > to
> >>> >
> >>> > str  d5, [sp, #320]
> >>> > fmul d5, d31, d29
> >>> > ldr  d31, [sp, #312]
> >>> > ldr  d17, [sp, #320]
> >>> >
> >>> > Currently, the pass is disabled by default on all architectures and
> enabled by a target-specific option.
> >>> >
> >>> > If deemed beneficial enough for a default, it will be enabled on
> ampere1/ampere1a,
> >>> > or other architectures as well, without needing to be turned on by
> this option.
> >>>
> >>> What is aarch64-specific about the pass?
> >>
> >> The pass was designed to target load pairs, which are aarch64 specific,
> thus it cannot handle generic loads.
> >>>
> >>>
> >>> I see an increasingly large number of target specific passes pop up
> (probably
> >>> for the excuse we can generalize them if necessary).  But GCC isn't
> LLVM
> >>> and this feels like getting out of hand?
> >>>
> >>> The x86 backend also has its store-forwarding "pass" as part of mdreorg
> >>> in ix86_split_stlf_stall_load.
> >>>
> >>> Richard.
> >>>
> >>> > Bootstrapped and regtested on aarch64-linux.
> >>> >
> >>> > gcc/ChangeLog:
> >>> >
> >>> > * config.gcc: Add aarch64-store-forwarding.o to extra_objs.
> >>> > * config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New
&g

Re: [PATCH v6] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-06 Thread Manos Anagnostakis
Hi again,

I went and tested the requested changes and found out the following:

1. The pass is currently increasing insn_cnt on a NONJUMP_INSN_P, which is
a subset of NONDEBUG_INSN_P. I think there is no problem with depending on
-g with the current version. Do you see something I don't or did you mean
something else?
2. Not processing all instructions is not letting cselib record all the
effects they have, thus it does not have updated information to find true
forwardings at any given time. I can confirm this since I am witnessing
many unexpected changes on the number of handled cases if I do this only
for loads/stores.

Thanks in advance and please let me know your thoughts on the above.
Manos.

On Wed, Dec 6, 2023 at 5:10 PM Manos Anagnostakis <
manos.anagnosta...@vrull.eu> wrote:

> Hi Richard,
>
> thanks for the useful comments.
>
> On Wed, Dec 6, 2023 at 4:32 PM Richard Biener 
> wrote:
>
>> On Wed, Dec 6, 2023 at 2:48 PM Manos Anagnostakis
>>  wrote:
>> >
>> > This is an RTL pass that detects store forwarding from stores to larger
>> loads (load pairs).
>> >
>> > This optimization is SPEC2017-driven and was found to be beneficial for
>> some benchmarks,
>> > through testing on ampere1/ampere1a machines.
>> >
>> > For example, it can transform cases like
>> >
>> > str  d5, [sp, #320]
>> > fmul d5, d31, d29
>> > ldp  d31, d17, [sp, #312] # Large load from small store
>> >
>> > to
>> >
>> > str  d5, [sp, #320]
>> > fmul d5, d31, d29
>> > ldr  d31, [sp, #312]
>> > ldr  d17, [sp, #320]
>> >
>> > Currently, the pass is disabled by default on all architectures and
>> enabled by a target-specific option.
>> >
>> > If deemed beneficial enough for a default, it will be enabled on
>> ampere1/ampere1a,
>> > or other architectures as well, without needing to be turned on by this
>> option.
>>
>> What is aarch64-specific about the pass?
>>
> The pass was designed to target load pairs, which are aarch64 specific,
> thus it cannot handle generic loads.
>
>>
>> I see an increasingly large number of target specific passes pop up
>> (probably
>> for the excuse we can generalize them if necessary).  But GCC isn't LLVM
>> and this feels like getting out of hand?
>>
>> The x86 backend also has its store-forwarding "pass" as part of mdreorg
>> in ix86_split_stlf_stall_load.
>>
>> Richard.
>>
>> > Bootstrapped and regtested on aarch64-linux.
>> >
>> > gcc/ChangeLog:
>> >
>> > * config.gcc: Add aarch64-store-forwarding.o to extra_objs.
>> > * config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New
>> pass.
>> > * config/aarch64/aarch64-protos.h
>> (make_pass_avoid_store_forwarding): Declare.
>> > * config/aarch64/aarch64.opt (mavoid-store-forwarding): New
>> option.
>> > (aarch64-store-forwarding-threshold): New param.
>> > * config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
>> > * doc/invoke.texi: Document new option and new param.
>> > * config/aarch64/aarch64-store-forwarding.cc: New file.
>> >
>> > gcc/testsuite/ChangeLog:
>> >
>> > * gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
>> > * gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
>> > * gcc.target/aarch64/ldp_ssll_overlap.c: New test.
>> >
>> > Signed-off-by: Manos Anagnostakis 
>> > Co-Authored-By: Manolis Tsamis 
>> > Co-Authored-By: Philipp Tomsich 
>> > ---
>> > Changes in v6:
>> > - An obvious change. insn_cnt was incremented only on
>> >   stores and not for every insn in the bb. Now restored.
>> >
>> >  gcc/config.gcc|   1 +
>> >  gcc/config/aarch64/aarch64-passes.def |   1 +
>> >  gcc/config/aarch64/aarch64-protos.h   |   1 +
>> >  .../aarch64/aarch64-store-forwarding.cc   | 318 ++
>> >  gcc/config/aarch64/aarch64.opt|   9 +
>> >  gcc/config/aarch64/t-aarch64  |  10 +
>> >  gcc/doc/invoke.texi   |  11 +-
>> >  .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
>> >  .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
>> >  .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
>> >  10 files changed, 449 insertions(+), 1 deletion(-)
>> >  create mode 100

Re: [PATCH v6] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-06 Thread Manos Anagnostakis
Hi Richard,

thanks for the useful comments.

On Wed, Dec 6, 2023 at 4:32 PM Richard Biener 
wrote:

> On Wed, Dec 6, 2023 at 2:48 PM Manos Anagnostakis
>  wrote:
> >
> > This is an RTL pass that detects store forwarding from stores to larger
> loads (load pairs).
> >
> > This optimization is SPEC2017-driven and was found to be beneficial for
> some benchmarks,
> > through testing on ampere1/ampere1a machines.
> >
> > For example, it can transform cases like
> >
> > str  d5, [sp, #320]
> > fmul d5, d31, d29
> > ldp  d31, d17, [sp, #312] # Large load from small store
> >
> > to
> >
> > str  d5, [sp, #320]
> > fmul d5, d31, d29
> > ldr  d31, [sp, #312]
> > ldr  d17, [sp, #320]
> >
> > Currently, the pass is disabled by default on all architectures and
> enabled by a target-specific option.
> >
> > If deemed beneficial enough for a default, it will be enabled on
> ampere1/ampere1a,
> > or other architectures as well, without needing to be turned on by this
> option.
>
> What is aarch64-specific about the pass?
>
The pass was designed to target load pairs, which are aarch64 specific,
thus it cannot handle generic loads.

>
> I see an increasingly large number of target specific passes pop up
> (probably
> for the excuse we can generalize them if necessary).  But GCC isn't LLVM
> and this feels like getting out of hand?
>
> The x86 backend also has its store-forwarding "pass" as part of mdreorg
> in ix86_split_stlf_stall_load.
>
> Richard.
>
> > Bootstrapped and regtested on aarch64-linux.
> >
> > gcc/ChangeLog:
> >
> > * config.gcc: Add aarch64-store-forwarding.o to extra_objs.
> > * config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New
> pass.
> > * config/aarch64/aarch64-protos.h
> (make_pass_avoid_store_forwarding): Declare.
> > * config/aarch64/aarch64.opt (mavoid-store-forwarding): New
> option.
> > (aarch64-store-forwarding-threshold): New param.
> > * config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
> > * doc/invoke.texi: Document new option and new param.
> > * config/aarch64/aarch64-store-forwarding.cc: New file.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
> > * gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
> > * gcc.target/aarch64/ldp_ssll_overlap.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > Co-Authored-By: Manolis Tsamis 
> > Co-Authored-By: Philipp Tomsich 
> > ---
> > Changes in v6:
> > - An obvious change. insn_cnt was incremented only on
> >   stores and not for every insn in the bb. Now restored.
> >
> >  gcc/config.gcc|   1 +
> >  gcc/config/aarch64/aarch64-passes.def |   1 +
> >  gcc/config/aarch64/aarch64-protos.h   |   1 +
> >  .../aarch64/aarch64-store-forwarding.cc   | 318 ++
> >  gcc/config/aarch64/aarch64.opt|   9 +
> >  gcc/config/aarch64/t-aarch64  |  10 +
> >  gcc/doc/invoke.texi   |  11 +-
> >  .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
> >  .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
> >  .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
> >  10 files changed, 449 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
> >  create mode 100644
> gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
> >  create mode 100644
> gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c
> >
> > diff --git a/gcc/config.gcc b/gcc/config.gcc
> > index 6450448f2f0..7c48429eb82 100644
> > --- a/gcc/config.gcc
> > +++ b/gcc/config.gcc
> > @@ -350,6 +350,7 @@ aarch64*-*-*)
> > cxx_target_objs="aarch64-c.o"
> > d_target_objs="aarch64-d.o"
> > extra_objs="aarch64-builtins.o aarch-common.o
> aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o
> aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o
> aarch64-sve-builtins-sme.o cortex-a57-fma-steering.o aarch64-speculation.o
> falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
> > +   extra_objs="${extra_objs} aarch64-store-forwarding.o"
> > target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc
&

[PATCH v6] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-06 Thread Manos Anagnostakis
This is an RTL pass that detects store forwarding from stores to larger loads 
(load pairs).

This optimization is SPEC2017-driven and was found to be beneficial for some 
benchmarks,
through testing on ampere1/ampere1a machines.

For example, it can transform cases like

str  d5, [sp, #320]
fmul d5, d31, d29
ldp  d31, d17, [sp, #312] # Large load from small store

to

str  d5, [sp, #320]
fmul d5, d31, d29
ldr  d31, [sp, #312]
ldr  d17, [sp, #320]

Currently, the pass is disabled by default on all architectures and enabled by 
a target-specific option.

If deemed beneficial enough for a default, it will be enabled on 
ampere1/ampere1a,
or other architectures as well, without needing to be turned on by this option.

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:

* config.gcc: Add aarch64-store-forwarding.o to extra_objs.
* config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New pass.
* config/aarch64/aarch64-protos.h (make_pass_avoid_store_forwarding): 
Declare.
* config/aarch64/aarch64.opt (mavoid-store-forwarding): New option.
(aarch64-store-forwarding-threshold): New param.
* config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
* doc/invoke.texi: Document new option and new param.
* config/aarch64/aarch64-store-forwarding.cc: New file.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
* gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
* gcc.target/aarch64/ldp_ssll_overlap.c: New test.

Signed-off-by: Manos Anagnostakis 
Co-Authored-By: Manolis Tsamis 
Co-Authored-By: Philipp Tomsich 
---
Changes in v6:
- An obvious change. insn_cnt was incremented only on
  stores and not for every insn in the bb. Now restored.

 gcc/config.gcc|   1 +
 gcc/config/aarch64/aarch64-passes.def |   1 +
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 .../aarch64/aarch64-store-forwarding.cc   | 318 ++
 gcc/config/aarch64/aarch64.opt|   9 +
 gcc/config/aarch64/t-aarch64  |  10 +
 gcc/doc/invoke.texi   |  11 +-
 .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
 .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
 .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
 10 files changed, 449 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 6450448f2f0..7c48429eb82 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -350,6 +350,7 @@ aarch64*-*-*)
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
aarch64-sve-builtins-sve2.o aarch64-sve-builtins-sme.o 
cortex-a57-fma-steering.o aarch64-speculation.o 
falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
+   extra_objs="${extra_objs} aarch64-store-forwarding.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-passes.def 
b/gcc/config/aarch64/aarch64-passes.def
index 662a13fd5e6..94ced0aebf6 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -24,3 +24,4 @@ INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 
1, pass_switch_pstat
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_AFTER (pass_peephole2, 1, pass_avoid_store_forwarding);
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 60ff61f6d54..8f5f2ca4710 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1069,6 +1069,7 @@ rtl_opt_pass *make_pass_tag_collision_avoidance 
(gcc::context *);
 rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
 rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
 rtl_opt_pass *make_pass_switch_pstate_sm (gcc::context *ctxt);
+rtl_opt_pass *make_pass_avoid_store_forwarding (gcc::context *ctxt);

 poly_uint64 aarch64_regmode_natural_size (machine_mode);

diff --git a/gcc/config/aarch64/aarch64-store-forwarding.cc 
b/gcc/config/aarch64/aarch64-store-forwarding.cc
new file mode 100644
index 000..8a6faefd8c0
--

Re: [PATCH v4] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-05 Thread Manos Anagnostakis
Hi Richard,

the patch is working correctly with the four lines deleted and "get_addr"
on "load_mem" inlined on "rtx_equal_for_cselib_1" call.
Also changed store_info to str_info to avoid a warning of duplicate names
on bootstrap (one definition rule).

Rebased on top of your sme2 changes and submitted as v5.

Thanks!
Manos.

On Mon, Dec 4, 2023 at 10:35 PM Richard Sandiford 
wrote:

> Manos Anagnostakis  writes:
> > Στις Δευ 4 Δεκ 2023, 21:22 ο χρήστης Richard Sandiford <
> > richard.sandif...@arm.com> έγραψε:
> >
> >> Manos Anagnostakis  writes:
> >> > This is an RTL pass that detects store forwarding from stores to
> larger
> >> loads (load pairs).
> >> >
> >> > This optimization is SPEC2017-driven and was found to be beneficial
> for
> >> some benchmarks,
> >> > through testing on ampere1/ampere1a machines.
> >> >
> >> > For example, it can transform cases like
> >> >
> >> > str  d5, [sp, #320]
> >> > fmul d5, d31, d29
> >> > ldp  d31, d17, [sp, #312] # Large load from small store
> >> >
> >> > to
> >> >
> >> > str  d5, [sp, #320]
> >> > fmul d5, d31, d29
> >> > ldr  d31, [sp, #312]
> >> > ldr  d17, [sp, #320]
> >> >
> >> > Currently, the pass is disabled by default on all architectures and
> >> enabled by a target-specific option.
> >> >
> >> > If deemed beneficial enough for a default, it will be enabled on
> >> ampere1/ampere1a,
> >> > or other architectures as well, without needing to be turned on by
> this
> >> option.
> >> >
> >> > Bootstrapped and regtested on aarch64-linux.
> >> >
> >> > gcc/ChangeLog:
> >> >
> >> > * config.gcc: Add aarch64-store-forwarding.o to extra_objs.
> >> > * config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New
> >> pass.
> >> > * config/aarch64/aarch64-protos.h
> >> (make_pass_avoid_store_forwarding): Declare.
> >> > * config/aarch64/aarch64.opt (mavoid-store-forwarding): New
> >> option.
> >> >   (aarch64-store-forwarding-threshold): New param.
> >> > * config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
> >> > * doc/invoke.texi: Document new option and new param.
> >> > * config/aarch64/aarch64-store-forwarding.cc: New file.
> >> >
> >> > gcc/testsuite/ChangeLog:
> >> >
> >> > * gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
> >> > * gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
> >> > * gcc.target/aarch64/ldp_ssll_overlap.c: New test.
> >> >
> >> > Signed-off-by: Manos Anagnostakis 
> >> > Co-Authored-By: Manolis Tsamis 
> >> > Co-Authored-By: Philipp Tomsich 
> >> > ---
> >> > Changes in v4:
> >> >   - I had problems to make cselib_subst_to_values work correctly
> >> > so I used cselib_lookup to implement the exact same behaviour
> and
> >> > record the store value at the time we iterate over it.
> >> >   - Removed the store/load_mem_addr check from is_forwarding as
> >> > unnecessary.
> >> >   - The pass is called on all optimization levels right now.
> >> >   - The threshold check should remain as it is as we only care for
> >> > the front element of the list. The comment above the check
> >> explains
> >> > why a single if is enough.
> >>
> >> I still think this is structurally better as a while.  There's no reason
> >> in principle we why wouldn't want to record the stores in:
> >>
> >> stp x0, x1, [x4, #8]
> >> ldp x0, x1, [x4, #0]
> >> ldp x2, x3, [x4, #16]
> >>
> >> and then the two stores should have the same distance value.
> >> I realise we don't do that yet, but still.
> >>
> > Ah, you mean forwarding from stp. I was a bit confused with what you
> meant
> > the previous time. This was not initially meant for this patch, but I
> think
> > it wouldn't take long to implement that before pushing this. It is your
> > call of course if I should include it.
>
> No strong opinion either way, really.  It's definitely fine to check
> only for STRs initially.  I'd just rather not b

[PATCH v5] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-05 Thread Manos Anagnostakis
This is an RTL pass that detects store forwarding from stores to larger loads 
(load pairs).

This optimization is SPEC2017-driven and was found to be beneficial for some 
benchmarks,
through testing on ampere1/ampere1a machines.

For example, it can transform cases like

str  d5, [sp, #320]
fmul d5, d31, d29
ldp  d31, d17, [sp, #312] # Large load from small store

to

str  d5, [sp, #320]
fmul d5, d31, d29
ldr  d31, [sp, #312]
ldr  d17, [sp, #320]

Currently, the pass is disabled by default on all architectures and enabled by 
a target-specific option.

If deemed beneficial enough for a default, it will be enabled on 
ampere1/ampere1a,
or other architectures as well, without needing to be turned on by this option.

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:

* config.gcc: Add aarch64-store-forwarding.o to extra_objs.
* config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New pass.
* config/aarch64/aarch64-protos.h (make_pass_avoid_store_forwarding): 
Declare.
* config/aarch64/aarch64.opt (mavoid-store-forwarding): New option.
(aarch64-store-forwarding-threshold): New param.
* config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
* doc/invoke.texi: Document new option and new param.
* config/aarch64/aarch64-store-forwarding.cc: New file.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
* gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
* gcc.target/aarch64/ldp_ssll_overlap.c: New test.

Signed-off-by: Manos Anagnostakis 
Co-Authored-By: Manolis Tsamis 
Co-Authored-By: Philipp Tomsich 
---
Changes in v5:
- Remove unnecessary cselib_lookup on load_mem_addr.
- Fix warning with store_info by renaming to str_info.

 gcc/config.gcc|   1 +
 gcc/config/aarch64/aarch64-passes.def |   1 +
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 .../aarch64/aarch64-store-forwarding.cc   | 319 ++
 gcc/config/aarch64/aarch64.opt|   9 +
 gcc/config/aarch64/t-aarch64  |  10 +
 gcc/doc/invoke.texi   |  11 +-
 .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
 .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
 .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
 10 files changed, 450 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 6450448f2f0..7c48429eb82 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -350,6 +350,7 @@ aarch64*-*-*)
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
aarch64-sve-builtins-sve2.o aarch64-sve-builtins-sme.o 
cortex-a57-fma-steering.o aarch64-speculation.o 
falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
+   extra_objs="${extra_objs} aarch64-store-forwarding.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-passes.def 
b/gcc/config/aarch64/aarch64-passes.def
index 662a13fd5e6..94ced0aebf6 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -24,3 +24,4 @@ INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 
1, pass_switch_pstat
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_AFTER (pass_peephole2, 1, pass_avoid_store_forwarding);
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 25a9103f0e7..bd4b34d9af1 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1063,6 +1063,7 @@ rtl_opt_pass *make_pass_tag_collision_avoidance 
(gcc::context *);
 rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
 rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
 rtl_opt_pass *make_pass_switch_pstate_sm (gcc::context *ctxt);
+rtl_opt_pass *make_pass_avoid_store_forwarding (gcc::context *ctxt);

 poly_uint64 aarch64_regmode_natural_size (machine_mode);

diff --git a/gcc/config/aarch64/aarch64-store-forwarding.cc 
b/gcc/config/aarch64/aarch64-store-forwarding.cc
new file mode 100644
index 000..33bdc4bc9ca
--

Re: [PATCH v4] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-04 Thread Manos Anagnostakis
Στις Δευ 4 Δεκ 2023, 21:22 ο χρήστης Richard Sandiford <
richard.sandif...@arm.com> έγραψε:

> Manos Anagnostakis  writes:
> > This is an RTL pass that detects store forwarding from stores to larger
> loads (load pairs).
> >
> > This optimization is SPEC2017-driven and was found to be beneficial for
> some benchmarks,
> > through testing on ampere1/ampere1a machines.
> >
> > For example, it can transform cases like
> >
> > str  d5, [sp, #320]
> > fmul d5, d31, d29
> > ldp  d31, d17, [sp, #312] # Large load from small store
> >
> > to
> >
> > str  d5, [sp, #320]
> > fmul d5, d31, d29
> > ldr  d31, [sp, #312]
> > ldr  d17, [sp, #320]
> >
> > Currently, the pass is disabled by default on all architectures and
> enabled by a target-specific option.
> >
> > If deemed beneficial enough for a default, it will be enabled on
> ampere1/ampere1a,
> > or other architectures as well, without needing to be turned on by this
> option.
> >
> > Bootstrapped and regtested on aarch64-linux.
> >
> > gcc/ChangeLog:
> >
> > * config.gcc: Add aarch64-store-forwarding.o to extra_objs.
> > * config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New
> pass.
> > * config/aarch64/aarch64-protos.h
> (make_pass_avoid_store_forwarding): Declare.
> > * config/aarch64/aarch64.opt (mavoid-store-forwarding): New
> option.
> >   (aarch64-store-forwarding-threshold): New param.
> > * config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
> > * doc/invoke.texi: Document new option and new param.
> > * config/aarch64/aarch64-store-forwarding.cc: New file.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
> > * gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
> > * gcc.target/aarch64/ldp_ssll_overlap.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > Co-Authored-By: Manolis Tsamis 
> > Co-Authored-By: Philipp Tomsich 
> > ---
> > Changes in v4:
> >   - I had problems to make cselib_subst_to_values work correctly
> > so I used cselib_lookup to implement the exact same behaviour and
> > record the store value at the time we iterate over it.
> >   - Removed the store/load_mem_addr check from is_forwarding as
> > unnecessary.
> >   - The pass is called on all optimization levels right now.
> >   - The threshold check should remain as it is as we only care for
> > the front element of the list. The comment above the check
> explains
> > why a single if is enough.
>
> I still think this is structurally better as a while.  There's no reason
> in principle we why wouldn't want to record the stores in:
>
> stp x0, x1, [x4, #8]
> ldp x0, x1, [x4, #0]
> ldp x2, x3, [x4, #16]
>
> and then the two stores should have the same distance value.
> I realise we don't do that yet, but still.
>
Ah, you mean forwarding from stp. I was a bit confused with what you meant
the previous time. This was not initially meant for this patch, but I think
it wouldn't take long to implement that before pushing this. It is your
call of course if I should include it.

>
> >   - The documentation changes requested.
> >   - Adjusted a comment.
> >
> >  gcc/config.gcc|   1 +
> >  gcc/config/aarch64/aarch64-passes.def |   1 +
> >  gcc/config/aarch64/aarch64-protos.h   |   1 +
> >  .../aarch64/aarch64-store-forwarding.cc   | 321 ++
> >  gcc/config/aarch64/aarch64.opt|   9 +
> >  gcc/config/aarch64/t-aarch64  |  10 +
> >  gcc/doc/invoke.texi   |  11 +-
> >  .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
> >  .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
> >  .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
> >  10 files changed, 452 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
> >  create mode 100644
> gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
> >  create mode 100644
> gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c
> >
> > diff --git a/gcc/config.gcc b/gcc/config.gcc
> > index 748430194f3..2ee3b61c4fa 100644
> > --- a/gcc/config.gcc
> > +++ b/gcc/config.gcc
> > @@ -350,6 +

[PATCH v4] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-12-04 Thread Manos Anagnostakis
This is an RTL pass that detects store forwarding from stores to larger loads 
(load pairs).

This optimization is SPEC2017-driven and was found to be beneficial for some 
benchmarks,
through testing on ampere1/ampere1a machines.

For example, it can transform cases like

str  d5, [sp, #320]
fmul d5, d31, d29
ldp  d31, d17, [sp, #312] # Large load from small store

to

str  d5, [sp, #320]
fmul d5, d31, d29
ldr  d31, [sp, #312]
ldr  d17, [sp, #320]

Currently, the pass is disabled by default on all architectures and enabled by 
a target-specific option.

If deemed beneficial enough for a default, it will be enabled on 
ampere1/ampere1a,
or other architectures as well, without needing to be turned on by this option.

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:

* config.gcc: Add aarch64-store-forwarding.o to extra_objs.
* config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New pass.
* config/aarch64/aarch64-protos.h (make_pass_avoid_store_forwarding): 
Declare.
* config/aarch64/aarch64.opt (mavoid-store-forwarding): New option.
(aarch64-store-forwarding-threshold): New param.
* config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
* doc/invoke.texi: Document new option and new param.
* config/aarch64/aarch64-store-forwarding.cc: New file.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
* gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
* gcc.target/aarch64/ldp_ssll_overlap.c: New test.

Signed-off-by: Manos Anagnostakis 
Co-Authored-By: Manolis Tsamis 
Co-Authored-By: Philipp Tomsich 
---
Changes in v4:
- I had problems to make cselib_subst_to_values work correctly
  so I used cselib_lookup to implement the exact same behaviour and
  record the store value at the time we iterate over it.
- Removed the store/load_mem_addr check from is_forwarding as
  unnecessary.
- The pass is called on all optimization levels right now.
- The threshold check should remain as it is as we only care for
  the front element of the list. The comment above the check explains
  why a single if is enough.
- The documentation changes requested.
- Adjusted a comment.

 gcc/config.gcc|   1 +
 gcc/config/aarch64/aarch64-passes.def |   1 +
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 .../aarch64/aarch64-store-forwarding.cc   | 321 ++
 gcc/config/aarch64/aarch64.opt|   9 +
 gcc/config/aarch64/t-aarch64  |  10 +
 gcc/doc/invoke.texi   |  11 +-
 .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
 .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
 .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
 10 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 748430194f3..2ee3b61c4fa 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -350,6 +350,7 @@ aarch64*-*-*)
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o 
falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
+   extra_objs="${extra_objs} aarch64-store-forwarding.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-passes.def 
b/gcc/config/aarch64/aarch64-passes.def
index 6ace797b738..fa79e8adca8 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -23,3 +23,4 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, 
pass_track_speculation);
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_AFTER (pass_peephole2, 1, pass_avoid_store_forwarding);
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index d2718cc87b3..7d9dfa06af9 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1050,6 +1050,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context 
*);
 rtl_opt_pas

[PATCH v3] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-11-30 Thread Manos Anagnostakis
This is an RTL pass that detects store forwarding from stores to larger loads 
(load pairs).

This optimization is SPEC2017-driven and was found to be beneficial for some 
benchmarks,
through testing on ampere1/ampere1a machines.

For example, it can transform cases like

str  d5, [sp, #320]
fmul d5, d31, d29
ldp  d31, d17, [sp, #312] # Large load from small store

to

str  d5, [sp, #320]
fmul d5, d31, d29
ldr  d31, [sp, #312]
ldr  d17, [sp, #320]

Currently, the pass is disabled by default on all architectures and enabled by 
a target-specific option.

If deemed beneficial enough for a default, it will be enabled on 
ampere1/ampere1a,
or other architectures as well, without needing to be turned on by this option.

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:

* config.gcc: Add aarch64-store-forwarding.o to extra_objs.
* config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New pass.
* config/aarch64/aarch64-protos.h (make_pass_avoid_store_forwarding): 
Declare.
* config/aarch64/aarch64.opt (mavoid-store-forwarding): New option.
(aarch64-store-forwarding-threshold): New param.
* config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
* doc/invoke.texi: Document new option and new param.
* config/aarch64/aarch64-store-forwarding.cc: New file.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
* gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
* gcc.target/aarch64/ldp_ssll_overlap.c: New test.

Signed-off-by: Manos Anagnostakis 
Co-Authored-By: Manolis Tsamis 
Co-Authored-By: Philipp Tomsich 
---
Changes in v3:
- Removed obvious zext/sext check in SET_DEST.
- Replaced map/vector with list/struct.
- Replaced current threshold check algorithm with the
suggested, more efficient one (actually an inline check).
- Added usage of cselib_subst_to_values, which works well
with cselib_process_insn.
- Removed canon_rtx, but cannot remove init/end_alias_analysis,
as this is required for cselib to work properly (see cselib_init
in gcc/cselib.cc).
- Adjusted comments according to the new changes.

 gcc/config.gcc|   1 +
 gcc/config/aarch64/aarch64-passes.def |   1 +
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 .../aarch64/aarch64-store-forwarding.cc   | 319 ++
 gcc/config/aarch64/aarch64.opt|   9 +
 gcc/config/aarch64/t-aarch64  |  10 +
 gcc/doc/invoke.texi   |  12 +-
 .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
 .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
 .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
 10 files changed, 451 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 748430194f3..2ee3b61c4fa 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -350,6 +350,7 @@ aarch64*-*-*)
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o 
falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
+   extra_objs="${extra_objs} aarch64-store-forwarding.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-passes.def 
b/gcc/config/aarch64/aarch64-passes.def
index 6ace797b738..fa79e8adca8 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -23,3 +23,4 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, 
pass_track_speculation);
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_AFTER (pass_peephole2, 1, pass_avoid_store_forwarding);
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index d2718cc87b3..7d9dfa06af9 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1050,6 +1050,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context 
*);
 rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
 rtl_opt_pass *make_p

[PATCH v2] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-11-28 Thread Manos Anagnostakis
This is an RTL pass that detects store forwarding from stores to larger loads 
(load pairs).

This optimization is SPEC2017-driven and was found to be beneficial for some 
benchmarks,
through testing on ampere1/ampere1a machines.

For example, it can transform cases like

str  d5, [sp, #320]
fmul d5, d31, d29
ldp  d31, d17, [sp, #312] # Large load from small store

to

str  d5, [sp, #320]
fmul d5, d31, d29
ldr  d31, [sp, #312]
ldr  d17, [sp, #320]

Currently, the pass is disabled by default on all architectures and enabled by 
a target-specific option.

If deemed beneficial enough for a default, it will be enabled on 
ampere1/ampere1a,
or other architectures as well, without needing to be turned on by this option.

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:

* config.gcc: Add aarch64-store-forwarding.o to extra_objs.
* config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New pass.
* config/aarch64/aarch64-protos.h (make_pass_avoid_store_forwarding): 
Declare.
* config/aarch64/aarch64.opt (mavoid-store-forwarding): New option.
(aarch64-store-forwarding-threshold): New param.
* config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
* doc/invoke.texi: Document new option and new param.
* config/aarch64/aarch64-store-forwarding.cc: New file.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
* gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
* gcc.target/aarch64/ldp_ssll_overlap.c: New test.

Signed-off-by: Manos Anagnostakis 
Co-Authored-By: Manolis Tsamis 
Co-Authored-By: Philipp Tomsich 
---
Changes in v2:
- Remove usage of memrefs_conflict_p with the helper
check_memory_reg_ovewrite, which actually prevents some cases of
being handled correctly.
- Code cleanup requested in v1.

 gcc/config.gcc|   1 +
 gcc/config/aarch64/aarch64-passes.def |   1 +
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 .../aarch64/aarch64-store-forwarding.cc   | 323 ++
 gcc/config/aarch64/aarch64.opt|   9 +
 gcc/config/aarch64/t-aarch64  |  10 +
 gcc/doc/invoke.texi   |  12 +-
 .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
 .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
 .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
 10 files changed, 455 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c

diff --git a/gcc/config.gcc b/gcc/config.gcc
index b88591b6fd8..4eb41584b94 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -350,6 +350,7 @@ aarch64*-*-*)
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o 
falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
+   extra_objs="${extra_objs} aarch64-store-forwarding.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-passes.def 
b/gcc/config/aarch64/aarch64-passes.def
index 6ace797b738..fa79e8adca8 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -23,3 +23,4 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, 
pass_track_speculation);
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_AFTER (pass_peephole2, 1, pass_avoid_store_forwarding);
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 36d6c688bc8..aee074e58dd 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1051,6 +1051,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context 
*);
 rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
 rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
 rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
+rtl_opt_pass *make_pass_avoid_store_forwarding (gcc::context *ctxt);

 poly_uint64 aarch64_regmode_natural_size (machine_mode);

diff --git a/gcc/config/aarch64/aarch64-store-forwarding.cc 
b/gcc/config/aarch64/aarch64-store-forwarding.cc
new file mode 100644
inde

Re: [PATCH] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-11-13 Thread Manos Anagnostakis
Hi Richard,

thank you for reviewing the patch.

On Sat, Nov 11, 2023 at 6:57 PM Richard Sandiford 
wrote:

> Thanks for the patch.
>
> Manos Anagnostakis  writes:
> > This is an RTL pass that detects store forwarding from stores to larger
> loads (load pairs).
> >
> > This optimization is SPEC2017-driven and was found to be beneficial for
> some benchmarks,
> > through testing on ampere1/ampere1a machines.
> >
> > For example, it can transform cases like
> >
> > str  d5, [sp, #320]
> > fmul d5, d31, d29
> > ldp  d31, d17, [sp, #312] # Large load from small store
> >
> > to
> >
> > str  d5, [sp, #320]
> > fmul d5, d31, d29
> > ldr  d31, [sp, #312]
> > ldr  d17, [sp, #320]
>
> For this particular case, it would be good to have a spill forwarding
> pass that inserts:
>
>   mov d17, d5
>
> before the fmul (possibly with further clean-up beyond that).
>
> But I realise the patch handles the general case where that isn't possible.
> The patch can also afford to be approximate.
>
> Some of the insn recognition code will probably need to be updated for
> Alex's pass that forms LDP/STP, since that's going to have writeback
> support.  We'll also need to arrange a pass ording that works for both
> passes.
>
> The patch generally looks good, and it's nicely compact.  Some specific
> comments below:
>
> > Currently, the pass is disabled by default on all architectures and
> enabled by a target-specific option.
> >
> > If deemed beneficial enough for a default, it will be enabled on
> ampere1/ampere1a,
> > or other architectures as well, without needing to be turned on by this
> option.
> >
> > Bootstrapped and regtested on aarch64-linux.
> >
> > gcc/ChangeLog:
> >
> > * alias.cc (memrefs_conflict_p): Expose static function.
> > * alias.h (memrefs_conflict_p): Expose static function.
> > * config.gcc: Add aarch64-store-forwarding.o to extra_objs.
> > * config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New
> pass.
> > * config/aarch64/aarch64-protos.h
> (make_pass_avoid_store_forwarding): Declare.
> > * config/aarch64/aarch64.opt (mavoid-store-forwarding): New
> option.
> >   (aarch64-store-forwarding-threshold): New param.
> > * config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
> > * doc/invoke.texi: Document new option and new param.
> > * config/aarch64/aarch64-store-forwarding.cc: New file.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
> > * gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
> > * gcc.target/aarch64/ldp_ssll_overlap.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > Co-Authored-By: Manolis Tsamis 
> > Co-Authored-By: Philipp Tomsich 
> > ---
> >  gcc/alias.cc  |   2 +-
> >  gcc/alias.h   |   1 +
> >  gcc/config.gcc|   1 +
> >  gcc/config/aarch64/aarch64-passes.def |   1 +
> >  gcc/config/aarch64/aarch64-protos.h   |   1 +
> >  .../aarch64/aarch64-store-forwarding.cc   | 347 ++
> >  gcc/config/aarch64/aarch64.opt|   9 +
> >  gcc/config/aarch64/t-aarch64  |  10 +
> >  gcc/doc/invoke.texi   |  12 +-
> >  .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
> >  .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
> >  .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
> >  12 files changed, 481 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
> >  create mode 100644
> gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
> >  create mode 100644
> gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c
> >
> > diff --git a/gcc/alias.cc b/gcc/alias.cc
> > index 86d8f7104ad..303683f85e3 100644
> > --- a/gcc/alias.cc
> > +++ b/gcc/alias.cc
> > @@ -2488,7 +2488,7 @@ offset_overlap_p (poly_int64 c, poly_int64 xsize,
> poly_int64 ysize)
> > one for X + non-constant and Y + non-constant when X and Y are equal.
> > If that is fixed the TBAA hack for union type-punning can be
> removed.  */
> >
> > -static int
> > +int
> >  memrefs_conflict_p (poly_int64 xsize, rtx x, poly_int64 ysize, rtx y,
> >   poly_int64 c)
&

[PATCH] aarch64: New RTL optimization pass avoid-store-forwarding.

2023-11-07 Thread Manos Anagnostakis
This is an RTL pass that detects store forwarding from stores to larger loads 
(load pairs).

This optimization is SPEC2017-driven and was found to be beneficial for some 
benchmarks,
through testing on ampere1/ampere1a machines.

For example, it can transform cases like

str  d5, [sp, #320]
fmul d5, d31, d29
ldp  d31, d17, [sp, #312] # Large load from small store

to

str  d5, [sp, #320]
fmul d5, d31, d29
ldr  d31, [sp, #312]
ldr  d17, [sp, #320]

Currently, the pass is disabled by default on all architectures and enabled by 
a target-specific option.

If deemed beneficial enough for a default, it will be enabled on 
ampere1/ampere1a,
or other architectures as well, without needing to be turned on by this option.

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:

* alias.cc (memrefs_conflict_p): Expose static function.
* alias.h (memrefs_conflict_p): Expose static function.
* config.gcc: Add aarch64-store-forwarding.o to extra_objs.
* config/aarch64/aarch64-passes.def (INSERT_PASS_AFTER): New pass.
* config/aarch64/aarch64-protos.h (make_pass_avoid_store_forwarding): 
Declare.
* config/aarch64/aarch64.opt (mavoid-store-forwarding): New option.
(aarch64-store-forwarding-threshold): New param.
* config/aarch64/t-aarch64: Add aarch64-store-forwarding.o
* doc/invoke.texi: Document new option and new param.
* config/aarch64/aarch64-store-forwarding.cc: New file.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/ldp_ssll_no_overlap_address.c: New test.
* gcc.target/aarch64/ldp_ssll_no_overlap_offset.c: New test.
* gcc.target/aarch64/ldp_ssll_overlap.c: New test.

Signed-off-by: Manos Anagnostakis 
Co-Authored-By: Manolis Tsamis 
Co-Authored-By: Philipp Tomsich 
---
 gcc/alias.cc  |   2 +-
 gcc/alias.h   |   1 +
 gcc/config.gcc|   1 +
 gcc/config/aarch64/aarch64-passes.def |   1 +
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 .../aarch64/aarch64-store-forwarding.cc   | 347 ++
 gcc/config/aarch64/aarch64.opt|   9 +
 gcc/config/aarch64/t-aarch64  |  10 +
 gcc/doc/invoke.texi   |  12 +-
 .../aarch64/ldp_ssll_no_overlap_address.c |  33 ++
 .../aarch64/ldp_ssll_no_overlap_offset.c  |  33 ++
 .../gcc.target/aarch64/ldp_ssll_overlap.c |  33 ++
 12 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-store-forwarding.cc
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_address.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/ldp_ssll_no_overlap_offset.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_ssll_overlap.c

diff --git a/gcc/alias.cc b/gcc/alias.cc
index 86d8f7104ad..303683f85e3 100644
--- a/gcc/alias.cc
+++ b/gcc/alias.cc
@@ -2488,7 +2488,7 @@ offset_overlap_p (poly_int64 c, poly_int64 xsize, 
poly_int64 ysize)
one for X + non-constant and Y + non-constant when X and Y are equal.
If that is fixed the TBAA hack for union type-punning can be removed.  */

-static int
+int
 memrefs_conflict_p (poly_int64 xsize, rtx x, poly_int64 ysize, rtx y,
poly_int64 c)
 {
diff --git a/gcc/alias.h b/gcc/alias.h
index ab06ac9055f..49836f7d808 100644
--- a/gcc/alias.h
+++ b/gcc/alias.h
@@ -41,6 +41,7 @@ bool alias_ptr_types_compatible_p (tree, tree);
 int compare_base_decls (tree, tree);
 bool refs_same_for_tbaa_p (tree, tree);
 bool mems_same_for_tbaa_p (rtx, rtx);
+int memrefs_conflict_p (poly_int64, rtx, poly_int64, rtx, poly_int64);

 /* This alias set can be used to force a memory to conflict with all
other memories, creating a barrier across which no memory reference
diff --git a/gcc/config.gcc b/gcc/config.gcc
index ba6d63e33ac..ae50d36004f 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -350,6 +350,7 @@ aarch64*-*-*)
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o 
falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
+   extra_objs="${extra_objs} aarch64-store-forwarding.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
\$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-passes.def 
b/gcc/config/aarch64/aarch64-passes.def
index 6ace797b738..fa79e8adca8 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -23,3 +23,4 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, 
pass_track_specu

[PATCH v2] aarch64: Improve on ldp-stp policies code structure.

2023-09-29 Thread Manos Anagnostakis
Improves on: 834fc2bf

This improves the code structure of the ldp-stp policies
patch introduced in 834fc2bf

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:
* config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): Removed.
(enum aarch64_ldp_stp_policy): Merged enums aarch64_ldp_policy
and aarch64_stp_policy to aarch64_ldp_stp_policy.
(enum aarch64_stp_policy): Removed.
* config/aarch64/aarch64-protos.h (struct tune_params): Removed
aarch64_ldp_policy_model and aarch64_stp_policy_model enum types
and left only the definitions to the aarch64-opts one.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): Removed.
(aarch64_parse_stp_policy): Removed.
(aarch64_override_options_internal): Removed calls to parsing
functions and added obvious direct assignments.
(aarch64_mem_ok_with_ldpstp_policy_model): Improved
code quality based on the new changes.
* config/aarch64/aarch64.opt: Use single enum type
aarch64_ldp_stp_policy for both ldp and stp options.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ldp_aligned.c: Splitted into this and
ldp_unaligned.
* gcc.target/aarch64/stp_aligned.c: Splitted into this and
stp_unaligned.
* gcc.target/aarch64/ldp_unaligned.c: New test.
* gcc.target/aarch64/stp_unaligned.c: New test.

Signed-off-by: Manos Anagnostakis 
---
 gcc/config/aarch64/aarch64-opts.h |  26 ++-
 gcc/config/aarch64/aarch64-protos.h   |  25 +--
 gcc/config/aarch64/aarch64.cc | 160 +++---
 gcc/config/aarch64/aarch64.opt|  29 +---
 .../gcc.target/aarch64/ldp_aligned.c  |  28 ---
 .../gcc.target/aarch64/ldp_unaligned.c|  40 +
 .../gcc.target/aarch64/stp_aligned.c  |  25 ---
 .../gcc.target/aarch64/stp_unaligned.c|  37 
 8 files changed, 155 insertions(+), 215 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_unaligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_unaligned.c

diff --git a/gcc/config/aarch64/aarch64-opts.h 
b/gcc/config/aarch64/aarch64-opts.h
index db8348507a3..831e28ab52a 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -108,20 +108,18 @@ enum aarch64_key_type {
   AARCH64_KEY_B
 };
 
-/* Load pair policy type.  */
-enum aarch64_ldp_policy {
-  LDP_POLICY_DEFAULT,
-  LDP_POLICY_ALWAYS,
-  LDP_POLICY_NEVER,
-  LDP_POLICY_ALIGNED
-};
-
-/* Store pair policy type.  */
-enum aarch64_stp_policy {
-  STP_POLICY_DEFAULT,
-  STP_POLICY_ALWAYS,
-  STP_POLICY_NEVER,
-  STP_POLICY_ALIGNED
+/* An enum specifying how to handle load and store pairs using
+   a fine-grained policy:
+   - LDP_STP_POLICY_DEFAULT: Use the policy defined in the tuning structure.
+   - LDP_STP_POLICY_ALIGNED: Emit ldp/stp if the source pointer is aligned
+   to at least double the alignment of the type.
+   - LDP_STP_POLICY_ALWAYS: Emit ldp/stp regardless of alignment.
+   - LDP_STP_POLICY_NEVER: Do not emit ldp/stp.  */
+enum aarch64_ldp_stp_policy {
+  AARCH64_LDP_STP_POLICY_DEFAULT,
+  AARCH64_LDP_STP_POLICY_ALIGNED,
+  AARCH64_LDP_STP_POLICY_ALWAYS,
+  AARCH64_LDP_STP_POLICY_NEVER
 };
 
 #endif
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 5c6802b4fe8..60a55f4bc19 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -568,30 +568,9 @@ struct tune_params
   /* Place prefetch struct pointer at the end to enable type checking
  errors when tune_params misses elements (e.g., from erroneous merges).  */
   const struct cpu_prefetch_tune *prefetch;
-/* An enum specifying how to handle load pairs using a fine-grained policy:
-   - LDP_POLICY_ALIGNED: Emit ldp if the source pointer is aligned
-   to at least double the alignment of the type.
-   - LDP_POLICY_ALWAYS: Emit ldp regardless of alignment.
-   - LDP_POLICY_NEVER: Do not emit ldp.  */
 
-  enum aarch64_ldp_policy_model
-  {
-LDP_POLICY_ALIGNED,
-LDP_POLICY_ALWAYS,
-LDP_POLICY_NEVER
-  } ldp_policy_model;
-/* An enum specifying how to handle store pairs using a fine-grained policy:
-   - STP_POLICY_ALIGNED: Emit stp if the source pointer is aligned
-   to at least double the alignment of the type.
-   - STP_POLICY_ALWAYS: Emit stp regardless of alignment.
-   - STP_POLICY_NEVER: Do not emit stp.  */
-
-  enum aarch64_stp_policy_model
-  {
-STP_POLICY_ALIGNED,
-STP_POLICY_ALWAYS,
-STP_POLICY_NEVER
-  } stp_policy_model;
+  /* Define models for the aarch64_ldp_stp_policy.  */
+  enum aarch64_ldp_stp_policy ldp_policy_model, stp_policy_model;
 };
 
 /* Classifies an address.
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index aa920fc703a..dcded70c981 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1358,8 +1358,8 @@ static const struct

[PATCH] aarch64: Improve on ldp-stp policies code structure.

2023-09-29 Thread Manos Anagnostakis
Improves on: 834fc2bf

This improves the code structure of the ldp-stp policies
patch introduced in 834fc2bf

Bootstrapped and regtested on aarch64-linux.

gcc/ChangeLog:
* config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy):
Added AARCH64 prefix.
(enum aarch64_stp_policy): Added AARCH64 prefix.
* config/aarch64/aarch64-protos.h (struct tune_params):
Merged enums aarch64_ldp_policy_model and aarch64_stp_policy_model
to aarch64_ldp_stp_policy_model.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): Removed.
(aarch64_parse_ldp_stp_policy): New function.
(aarch64_parse_stp_policy): Removed.
(aarch64_override_options_internal): Added call to
new parsing function and removed superseded ones.
(aarch64_mem_ok_with_ldpstp_policy_model): Improved
code quality based on the new changes.
* config/aarch64/aarch64.opt: Added AARCH64 prefix.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ldp_aligned.c: Splitted into this and
ldp_unaligned.
* gcc.target/aarch64/stp_aligned.c: Splitted into this and
stp_unaligned.
* gcc.target/aarch64/ldp_unaligned.c: New test.
* gcc.target/aarch64/stp_unaligned.c: New test.

Signed-off-by: Manos Anagnostakis 
---
 gcc/config/aarch64/aarch64-opts.h |  16 +-
 gcc/config/aarch64/aarch64-protos.h   |  30 +--
 gcc/config/aarch64/aarch64.cc | 184 --
 gcc/config/aarch64/aarch64.opt|  20 +-
 .../gcc.target/aarch64/ldp_aligned.c  |  28 ---
 .../gcc.target/aarch64/ldp_unaligned.c|  40 
 .../gcc.target/aarch64/stp_aligned.c  |  25 ---
 .../gcc.target/aarch64/stp_unaligned.c|  37 
 8 files changed, 189 insertions(+), 191 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_unaligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_unaligned.c

diff --git a/gcc/config/aarch64/aarch64-opts.h 
b/gcc/config/aarch64/aarch64-opts.h
index db8348507a3..e23e1ea200e 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -110,18 +110,18 @@ enum aarch64_key_type {
 
 /* Load pair policy type.  */
 enum aarch64_ldp_policy {
-  LDP_POLICY_DEFAULT,
-  LDP_POLICY_ALWAYS,
-  LDP_POLICY_NEVER,
-  LDP_POLICY_ALIGNED
+  AARCH64_LDP_POLICY_DEFAULT,
+  AARCH64_LDP_POLICY_ALWAYS,
+  AARCH64_LDP_POLICY_NEVER,
+  AARCH64_LDP_POLICY_ALIGNED
 };
 
 /* Store pair policy type.  */
 enum aarch64_stp_policy {
-  STP_POLICY_DEFAULT,
-  STP_POLICY_ALWAYS,
-  STP_POLICY_NEVER,
-  STP_POLICY_ALIGNED
+  AARCH64_STP_POLICY_DEFAULT,
+  AARCH64_STP_POLICY_ALWAYS,
+  AARCH64_STP_POLICY_NEVER,
+  AARCH64_STP_POLICY_ALIGNED
 };
 
 #endif
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 5c6802b4fe8..7d19111a215 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -568,30 +568,20 @@ struct tune_params
   /* Place prefetch struct pointer at the end to enable type checking
  errors when tune_params misses elements (e.g., from erroneous merges).  */
   const struct cpu_prefetch_tune *prefetch;
-/* An enum specifying how to handle load pairs using a fine-grained policy:
-   - LDP_POLICY_ALIGNED: Emit ldp if the source pointer is aligned
-   to at least double the alignment of the type.
-   - LDP_POLICY_ALWAYS: Emit ldp regardless of alignment.
-   - LDP_POLICY_NEVER: Do not emit ldp.  */
 
-  enum aarch64_ldp_policy_model
-  {
-LDP_POLICY_ALIGNED,
-LDP_POLICY_ALWAYS,
-LDP_POLICY_NEVER
-  } ldp_policy_model;
-/* An enum specifying how to handle store pairs using a fine-grained policy:
-   - STP_POLICY_ALIGNED: Emit stp if the source pointer is aligned
+/* An enum specifying how to handle load and store pairs using
+   a fine-grained policy:
+   - LDP_STP_POLICY_ALIGNED: Emit ldp/stp if the source pointer is aligned
to at least double the alignment of the type.
-   - STP_POLICY_ALWAYS: Emit stp regardless of alignment.
-   - STP_POLICY_NEVER: Do not emit stp.  */
+   - LDP_STP_POLICY_ALWAYS: Emit ldp/stp regardless of alignment.
+   - LDP_STP_POLICY_NEVER: Do not emit ldp/stp.  */
 
-  enum aarch64_stp_policy_model
+  enum aarch64_ldp_stp_policy_model
   {
-STP_POLICY_ALIGNED,
-STP_POLICY_ALWAYS,
-STP_POLICY_NEVER
-  } stp_policy_model;
+AARCH64_LDP_STP_POLICY_ALIGNED,
+AARCH64_LDP_STP_POLICY_ALWAYS,
+AARCH64_LDP_STP_POLICY_NEVER
+  } ldp_policy_model, stp_policy_model;
 };
 
 /* Classifies an address.
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index aa920fc703a..8edc4c18643 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1358,8 +1358,8 @@ static const struct tune_params generic_tunings =
  have at most a very minor effect on SVE2 cores.  */
   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),   /* tune_flags

Re: [PATCH v2] aarch64: Fine-grained ldp and stp policies with test-cases.

2023-09-28 Thread Manos Anagnostakis
No problem!

I'll send a follow up with the requested changes.

Thanks for the input!

Manos.

On Thu, Sep 28, 2023 at 4:42 PM Richard Sandiford 
wrote:

> Manos Anagnostakis  writes:
> > Hey Richard,
> >
> > Thanks for taking the time to review this, but it has been commited since
> > yesterday after getting reviewed by Kyrill and Tamar.
> >
> > Discussions:
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631285.html
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631300.html
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631389.html
> >
> > Commited version:
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631484.html
>
> Sorry about that.  I had v3 being filtered differently and so it went
> into a different inbox.
>
> Richard
>
> >
> > Manos.
> >
> > On Thu, Sep 28, 2023 at 4:17 PM Richard Sandiford <
> richard.sandif...@arm.com>
> > wrote:
> >
> >> Thanks for the patch and sorry for the slow review.
> >>
> >> Manos Anagnostakis  writes:
> >> > This patch implements the following TODO in
> gcc/config/aarch64/aarch64.cc
> >> > to provide the requested behaviour for handling ldp and stp:
> >> >
> >> >   /* Allow the tuning structure to disable LDP instruction formation
> >> >  from combining instructions (e.g., in peephole2).
> >> >  TODO: Implement fine-grained tuning control for LDP and STP:
> >> >1. control policies for load and store separately;
> >> >2. support the following policies:
> >> >   - default (use what is in the tuning structure)
> >> >   - always
> >> >   - never
> >> >   - aligned (only if the compiler can prove that the
> >> > load will be aligned to 2 * element_size)  */
> >> >
> >> > It provides two new and concrete command-line options -mldp-policy and
> >> -mstp-policy
> >> > to give the ability to control load and store policies seperately as
> >> > stated in part 1 of the TODO.
> >> >
> >> > The accepted values for both options are:
> >> > - default: Use the ldp/stp policy defined in the corresponding tuning
> >> >   structure.
> >> > - always: Emit ldp/stp regardless of alignment.
> >> > - never: Do not emit ldp/stp.
> >> > - aligned: In order to emit ldp/stp, first check if the load/store
> will
> >> >   be aligned to 2 * element_size.
> >> >
> >> > gcc/ChangeLog:
> >> > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> >> >   appropriate enums for the policies.
> >> > * config/aarch64/aarch64-tuning-flags.def
> >> >   (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> >> >   options.
> >> > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> >> >   function to parse ldp-policy option.
> >> > (aarch64_parse_stp_policy): New function to parse stp-policy
> >> option.
> >> > (aarch64_override_options_internal): Call parsing functions.
> >> > (aarch64_operands_ok_for_ldpstp): Add option-value check and
> >> >   alignment check and remove superseded ones
> >> > (aarch64_operands_adjust_ok_for_ldpstp): Add option-value
> check
> >> and
> >> >   alignment check and remove superseded ones.
> >> > * config/aarch64/aarch64.opt: Add options.
> >> >
> >> > gcc/testsuite/ChangeLog:
> >> > * gcc.target/aarch64/ldp_aligned.c: New test.
> >> > * gcc.target/aarch64/ldp_always.c: New test.
> >> > * gcc.target/aarch64/ldp_never.c: New test.
> >> > * gcc.target/aarch64/stp_aligned.c: New test.
> >> > * gcc.target/aarch64/stp_always.c: New test.
> >> > * gcc.target/aarch64/stp_never.c: New test.
> >> >
> >> > Signed-off-by: Manos Anagnostakis 
> >> > ---
> >> > Changes in v2:
> >> > - Fixed commited ldp tests to correctly trigger
> >> >   and test aarch64_operands_adjust_ok_for_ldpstp in
> aarch64.cc.
> >> > - Added "-mcpu=generic" to commited tests to guarantee generic
> >> target code
> >> >   generation and not cause the regressions of v1.
> >> >
>

Re: [PATCH v2] aarch64: Fine-grained ldp and stp policies with test-cases.

2023-09-28 Thread Manos Anagnostakis
Sure, I will attend to this.

Manos.

On Thu, Sep 28, 2023 at 4:37 PM Philipp Tomsich 
wrote:

> Manos,
>
> Please submit a follow-on patch implementing the requested
> improvements of the code structure (as this reduces the maintenance
> burden).
>
> Thanks,
> Philipp.
>
>
> On Thu, 28 Sept 2023 at 15:33, Manos Anagnostakis
>  wrote:
> >
> > Hey Richard,
> >
> > Thanks for taking the time to review this, but it has been commited
> since yesterday after getting reviewed by Kyrill and Tamar.
> >
> > Discussions:
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631285.html
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631300.html
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631389.html
> >
> > Commited version:
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631484.html
> >
> > Manos.
> >
> > On Thu, Sep 28, 2023 at 4:17 PM Richard Sandiford <
> richard.sandif...@arm.com> wrote:
> >>
> >> Thanks for the patch and sorry for the slow review.
> >>
> >> Manos Anagnostakis  writes:
> >> > This patch implements the following TODO in
> gcc/config/aarch64/aarch64.cc
> >> > to provide the requested behaviour for handling ldp and stp:
> >> >
> >> >   /* Allow the tuning structure to disable LDP instruction formation
> >> >  from combining instructions (e.g., in peephole2).
> >> >  TODO: Implement fine-grained tuning control for LDP and STP:
> >> >1. control policies for load and store separately;
> >> >2. support the following policies:
> >> >   - default (use what is in the tuning structure)
> >> >   - always
> >> >   - never
> >> >   - aligned (only if the compiler can prove that the
> >> > load will be aligned to 2 * element_size)  */
> >> >
> >> > It provides two new and concrete command-line options -mldp-policy
> and -mstp-policy
> >> > to give the ability to control load and store policies seperately as
> >> > stated in part 1 of the TODO.
> >> >
> >> > The accepted values for both options are:
> >> > - default: Use the ldp/stp policy defined in the corresponding tuning
> >> >   structure.
> >> > - always: Emit ldp/stp regardless of alignment.
> >> > - never: Do not emit ldp/stp.
> >> > - aligned: In order to emit ldp/stp, first check if the load/store
> will
> >> >   be aligned to 2 * element_size.
> >> >
> >> > gcc/ChangeLog:
> >> > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> >> >   appropriate enums for the policies.
> >> > * config/aarch64/aarch64-tuning-flags.def
> >> >   (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> >> >   options.
> >> > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> >> >   function to parse ldp-policy option.
> >> > (aarch64_parse_stp_policy): New function to parse stp-policy
> option.
> >> > (aarch64_override_options_internal): Call parsing functions.
> >> > (aarch64_operands_ok_for_ldpstp): Add option-value check and
> >> >   alignment check and remove superseded ones
> >> > (aarch64_operands_adjust_ok_for_ldpstp): Add option-value
> check and
> >> >   alignment check and remove superseded ones.
> >> > * config/aarch64/aarch64.opt: Add options.
> >> >
> >> > gcc/testsuite/ChangeLog:
> >> > * gcc.target/aarch64/ldp_aligned.c: New test.
> >> > * gcc.target/aarch64/ldp_always.c: New test.
> >> > * gcc.target/aarch64/ldp_never.c: New test.
> >> > * gcc.target/aarch64/stp_aligned.c: New test.
> >> > * gcc.target/aarch64/stp_always.c: New test.
> >> > * gcc.target/aarch64/stp_never.c: New test.
> >> >
> >> > Signed-off-by: Manos Anagnostakis 
> >> > ---
> >> > Changes in v2:
> >> > - Fixed commited ldp tests to correctly trigger
> >> >   and test aarch64_operands_adjust_ok_for_ldpstp in
> aarch64.cc.
> >> > - Added "-mcpu=generic" to commited tests to guarantee
> generic target code
> >> >   generation and not cause the regressions of v1.
> >> 

Re: [PATCH v2] aarch64: Fine-grained ldp and stp policies with test-cases.

2023-09-28 Thread Manos Anagnostakis
Hey Richard,

Thanks for taking the time to review this, but it has been commited since
yesterday after getting reviewed by Kyrill and Tamar.

Discussions:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631285.html
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631300.html
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631389.html

Commited version:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631484.html

Manos.

On Thu, Sep 28, 2023 at 4:17 PM Richard Sandiford 
wrote:

> Thanks for the patch and sorry for the slow review.
>
> Manos Anagnostakis  writes:
> > This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
> > to provide the requested behaviour for handling ldp and stp:
> >
> >   /* Allow the tuning structure to disable LDP instruction formation
> >  from combining instructions (e.g., in peephole2).
> >  TODO: Implement fine-grained tuning control for LDP and STP:
> >1. control policies for load and store separately;
> >2. support the following policies:
> >   - default (use what is in the tuning structure)
> >   - always
> >   - never
> >   - aligned (only if the compiler can prove that the
> > load will be aligned to 2 * element_size)  */
> >
> > It provides two new and concrete command-line options -mldp-policy and
> -mstp-policy
> > to give the ability to control load and store policies seperately as
> > stated in part 1 of the TODO.
> >
> > The accepted values for both options are:
> > - default: Use the ldp/stp policy defined in the corresponding tuning
> >   structure.
> > - always: Emit ldp/stp regardless of alignment.
> > - never: Do not emit ldp/stp.
> > - aligned: In order to emit ldp/stp, first check if the load/store will
> >   be aligned to 2 * element_size.
> >
> > gcc/ChangeLog:
> > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> >   appropriate enums for the policies.
> > * config/aarch64/aarch64-tuning-flags.def
> >   (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> >   options.
> > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> >   function to parse ldp-policy option.
> > (aarch64_parse_stp_policy): New function to parse stp-policy
> option.
> > (aarch64_override_options_internal): Call parsing functions.
> > (aarch64_operands_ok_for_ldpstp): Add option-value check and
> >   alignment check and remove superseded ones
> > (aarch64_operands_adjust_ok_for_ldpstp): Add option-value check
> and
> >   alignment check and remove superseded ones.
> > * config/aarch64/aarch64.opt: Add options.
> >
> > gcc/testsuite/ChangeLog:
> > * gcc.target/aarch64/ldp_aligned.c: New test.
> > * gcc.target/aarch64/ldp_always.c: New test.
> > * gcc.target/aarch64/ldp_never.c: New test.
> > * gcc.target/aarch64/stp_aligned.c: New test.
> > * gcc.target/aarch64/stp_always.c: New test.
> > * gcc.target/aarch64/stp_never.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > ---
> > Changes in v2:
> > - Fixed commited ldp tests to correctly trigger
> >   and test aarch64_operands_adjust_ok_for_ldpstp in aarch64.cc.
> > - Added "-mcpu=generic" to commited tests to guarantee generic
> target code
> >   generation and not cause the regressions of v1.
> >
> >  gcc/config/aarch64/aarch64-protos.h   |  24 ++
> >  gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
> >  gcc/config/aarch64/aarch64.cc | 229 ++
> >  gcc/config/aarch64/aarch64.opt|   8 +
> >  .../gcc.target/aarch64/ldp_aligned.c  |  66 +
> >  gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 +
> >  gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 +
> >  .../gcc.target/aarch64/stp_aligned.c  |  60 +
> >  gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
> >  gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
> >  10 files changed, 586 insertions(+), 61 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
> 

[PATCH v5] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-27 Thread Manos Anagnostakis
This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
to provide the requested behaviour for handling ldp and stp:

  /* Allow the tuning structure to disable LDP instruction formation
 from combining instructions (e.g., in peephole2).
 TODO: Implement fine-grained tuning control for LDP and STP:
   1. control policies for load and store separately;
   2. support the following policies:
  - default (use what is in the tuning structure)
  - always
  - never
  - aligned (only if the compiler can prove that the
load will be aligned to 2 * element_size)  */

It provides two new and concrete target-specific command-line parameters
-param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
to give the ability to control load and store policies seperately as
stated in part 1 of the TODO.

The accepted values for both parameters are:
- default: Use the policy of the tuning structure (default).
- always: Emit ldp/stp regardless of alignment.
- never: Do not emit ldp/stp.
- aligned: In order to emit ldp/stp, first check if the load/store will
  be aligned to 2 * element_size.

Bootstrapped and regtested aarch64-linux.

gcc/ChangeLog:
* config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): New
enum type.
(enum aarch64_stp_policy): New enum type.
* config/aarch64/aarch64-protos.h (struct tune_params): Add
appropriate enums for the policies.
(aarch64_mem_ok_with_ldpstp_policy_model): New declaration.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
options.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
function to parse ldp-policy parameter.
(aarch64_parse_stp_policy): New function to parse stp-policy parameter.
(aarch64_override_options_internal): Call parsing functions.
(aarch64_mem_ok_with_ldpstp_policy_model): New function.
(aarch64_operands_ok_for_ldpstp): Add call to
aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
check and alignment check and remove superseded ones.
(aarch64_operands_adjust_ok_for_ldpstp): Add call to
aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
check and alignment check and remove superseded ones.
* config/aarch64/aarch64.opt (aarch64-ldp-policy): New param.
(aarch64-stp-policy): New param.
* doc/invoke.texi: Document the parameters accordingly.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
* gcc.target/aarch64/ldp_aligned.c: New test.
* gcc.target/aarch64/ldp_always.c: New test.
* gcc.target/aarch64/ldp_never.c: New test.
* gcc.target/aarch64/stp_aligned.c: New test.
* gcc.target/aarch64/stp_always.c: New test.
* gcc.target/aarch64/stp_never.c: New test.

Signed-off-by: Manos Anagnostakis 
---
Changes in v5:
- Adjust ChangeLog for aarch64.opt.

 gcc/config/aarch64/aarch64-opts.h |  16 ++
 gcc/config/aarch64/aarch64-protos.h   |  25 +++
 gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
 gcc/config/aarch64/aarch64.cc | 212 +-
 gcc/config/aarch64/aarch64.opt|  38 
 gcc/doc/invoke.texi   |  20 ++
 .../aarch64/ampere1-no_ldp_combine.c  |  11 -
 .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
 .../gcc.target/aarch64/stp_aligned.c  |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
 13 files changed, 632 insertions(+), 76 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c

diff --git a/gcc/config/aarch64/aarch64-opts.h 
b/gcc/config/aarch64/aarch64-opts.h
index 7e8f1babed8..db8348507a3 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -108,4 +108,20 @@ enum aarch64_key_type {
   AARCH64_KEY_B
 };
 
+/* Load pair policy type.  */
+enum aarch64_ldp_policy {
+  LDP_POLICY_DEFAULT,
+  LDP_POLICY_ALWAYS,
+  LDP_POLICY_NEVER,
+  LDP_POLICY_ALIGNED
+};
+
+/* Store pair policy type.  */
+enum aarch64_stp_policy {
+  STP_POLICY_DEFAULT,
+  STP_POLICY_ALWAYS,
+  STP_POLICY_NEVER,
+  STP_POLICY_ALIGNED
+};
+
 #endif
diff --git a/gcc/config/aarch64

Re: [PATCH v4] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-27 Thread Manos Anagnostakis
Thanks Kyrill!

Submitting the obvious v5.

Manos.

On Wed, Sep 27, 2023 at 11:40 AM Kyrylo Tkachov 
wrote:

> Hi Manos,
>
> > -Original Message-
> > From: Manos Anagnostakis 
> > Sent: Tuesday, September 26, 2023 2:52 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Kyrylo Tkachov ; Tamar Christina
> > ; Philipp Tomsich ;
> > Manos Anagnostakis 
> > Subject: [PATCH v4] aarch64: Fine-grained policies to control ldp-stp
> > formation.
> >
> > This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
> > to provide the requested behaviour for handling ldp and stp:
> >
> >   /* Allow the tuning structure to disable LDP instruction formation
> >  from combining instructions (e.g., in peephole2).
> >  TODO: Implement fine-grained tuning control for LDP and STP:
> >1. control policies for load and store separately;
> >2. support the following policies:
> >   - default (use what is in the tuning structure)
> >   - always
> >   - never
> >   - aligned (only if the compiler can prove that the
> > load will be aligned to 2 * element_size)  */
> >
> > It provides two new and concrete target-specific command-line parameters
> > -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> > to give the ability to control load and store policies seperately as
> > stated in part 1 of the TODO.
> >
> > The accepted values for both parameters are:
> > - default: Use the policy of the tuning structure (default).
> > - always: Emit ldp/stp regardless of alignment.
> > - never: Do not emit ldp/stp.
> > - aligned: In order to emit ldp/stp, first check if the load/store will
> >   be aligned to 2 * element_size.
> >
> > Bootstrapped and regtested aarch64-linux.
> >
> > gcc/ChangeLog:
> > * config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): New
> >   enum type.
> > (enum aarch64_stp_policy): New enum type.
> > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> >   appropriate enums for the policies.
> >   (aarch64_mem_ok_with_ldpstp_policy_model): New declaration.
> > * config/aarch64/aarch64-tuning-flags.def
> >   (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> >   options.
> > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> >   function to parse ldp-policy parameter.
> > (aarch64_parse_stp_policy): New function to parse stp-policy
> parameter.
> > (aarch64_override_options_internal): Call parsing functions.
> >   (aarch64_mem_ok_with_ldpstp_policy_model): New function.
> > (aarch64_operands_ok_for_ldpstp): Add call to
> >   aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
> >   check and alignment check and remove superseded ones.
> > (aarch64_operands_adjust_ok_for_ldpstp): Add call to
> > aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
> >   check and alignment check and remove superseded ones.
> > * config/aarch64/aarch64.opt: Add parameters.
> >   * doc/invoke.texi: Document the parameters accordingly.
>
> The ChangeLog entry should name the new parameters. For example:
> * config/aarch64/aarch64.opt (aarch64-ldp-policy): New param.
>
> Ok with the fixed ChangeLog.
> Thank you for the work!
> Kyrill
>
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
> > * gcc.target/aarch64/ldp_aligned.c: New test.
> > * gcc.target/aarch64/ldp_always.c: New test.
> > * gcc.target/aarch64/ldp_never.c: New test.
> > * gcc.target/aarch64/stp_aligned.c: New test.
> > * gcc.target/aarch64/stp_always.c: New test.
> > * gcc.target/aarch64/stp_never.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > ---
> > Changes in v4:
> > - Changed the parameters to accept enum instead of an
> >   integer and updated documentation in doc/invoke.texi.
> > - Packed all the new checks in aarch64_operands_ok_for_ldpstp/
> >   aarch64_operands_adjust_ok_for_ldpstp in a new function
> >   called aarch64_mem_ok_with_ldpstp_policy_model.
> >
> >  gcc/config/aarch64/aarch64-opts.h |  16 ++
> >  gcc/config/aarch64/aarch64-protos.h   |  25 +++
> >  gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
> >  gcc/config/aarch64/aarch64.cc | 212 ++

[PATCH v4] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-26 Thread Manos Anagnostakis
This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
to provide the requested behaviour for handling ldp and stp:

  /* Allow the tuning structure to disable LDP instruction formation
 from combining instructions (e.g., in peephole2).
 TODO: Implement fine-grained tuning control for LDP and STP:
   1. control policies for load and store separately;
   2. support the following policies:
  - default (use what is in the tuning structure)
  - always
  - never
  - aligned (only if the compiler can prove that the
load will be aligned to 2 * element_size)  */

It provides two new and concrete target-specific command-line parameters
-param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
to give the ability to control load and store policies seperately as
stated in part 1 of the TODO.

The accepted values for both parameters are:
- default: Use the policy of the tuning structure (default).
- always: Emit ldp/stp regardless of alignment.
- never: Do not emit ldp/stp.
- aligned: In order to emit ldp/stp, first check if the load/store will
  be aligned to 2 * element_size.

Bootstrapped and regtested aarch64-linux.

gcc/ChangeLog:
* config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): New
enum type.
(enum aarch64_stp_policy): New enum type.
* config/aarch64/aarch64-protos.h (struct tune_params): Add
appropriate enums for the policies.
(aarch64_mem_ok_with_ldpstp_policy_model): New declaration.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
options.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
function to parse ldp-policy parameter.
(aarch64_parse_stp_policy): New function to parse stp-policy parameter.
(aarch64_override_options_internal): Call parsing functions.
(aarch64_mem_ok_with_ldpstp_policy_model): New function.
(aarch64_operands_ok_for_ldpstp): Add call to
aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
check and alignment check and remove superseded ones.
(aarch64_operands_adjust_ok_for_ldpstp): Add call to
aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
check and alignment check and remove superseded ones.
* config/aarch64/aarch64.opt: Add parameters.
* doc/invoke.texi: Document the parameters accordingly.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
* gcc.target/aarch64/ldp_aligned.c: New test.
* gcc.target/aarch64/ldp_always.c: New test.
* gcc.target/aarch64/ldp_never.c: New test.
* gcc.target/aarch64/stp_aligned.c: New test.
* gcc.target/aarch64/stp_always.c: New test.
* gcc.target/aarch64/stp_never.c: New test.

Signed-off-by: Manos Anagnostakis 
---
Changes in v4:
- Changed the parameters to accept enum instead of an
  integer and updated documentation in doc/invoke.texi.
- Packed all the new checks in aarch64_operands_ok_for_ldpstp/
  aarch64_operands_adjust_ok_for_ldpstp in a new function
  called aarch64_mem_ok_with_ldpstp_policy_model.

 gcc/config/aarch64/aarch64-opts.h |  16 ++
 gcc/config/aarch64/aarch64-protos.h   |  25 +++
 gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
 gcc/config/aarch64/aarch64.cc | 212 +-
 gcc/config/aarch64/aarch64.opt|  38 
 gcc/doc/invoke.texi   |  20 ++
 .../aarch64/ampere1-no_ldp_combine.c  |  11 -
 .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
 .../gcc.target/aarch64/stp_aligned.c  |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
 13 files changed, 632 insertions(+), 76 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c

diff --git a/gcc/config/aarch64/aarch64-opts.h 
b/gcc/config/aarch64/aarch64-opts.h
index 7e8f1babed8..db8348507a3 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -108,4 +108,20 @@ enum aarch64_key_type {
   AARCH64_KEY_B
 };
 
+/* Load pair policy type.  */
+enum aarch64_ldp_policy {
+  LDP_POLICY_DEFAULT,
+  LDP_POLICY_ALWAYS,
+  LDP_POLICY_NEVER

Re: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-26 Thread Manos Anagnostakis
Thank you Andrew for the input.

I've prepared a patch using --param with enum, which seems a more suitable
approach to me as strings are more descriptive as well.

The current patch needed an adjustment on how to call the parsing functions
to match the compiler coding style.

Both are bootstrapped and regstested.

I can send a V4 of whichever is preferred.

Thanks!

Manos.

On Mon, Sep 25, 2023 at 11:57 PM Andrew Pinski  wrote:

> On Mon, Sep 25, 2023 at 1:04 PM Andrew Pinski  wrote:
> >
> > On Mon, Sep 25, 2023 at 12:59 PM Philipp Tomsich
> >  wrote:
> > >
> > > On Mon, 25 Sept 2023 at 21:54, Andrew Pinski 
> wrote:
> > > >
> > > > On Mon, Sep 25, 2023 at 12:50 PM Manos Anagnostakis
> > > >  wrote:
> > > > >
> > > > > This patch implements the following TODO in
> gcc/config/aarch64/aarch64.cc
> > > > > to provide the requested behaviour for handling ldp and stp:
> > > > >
> > > > >   /* Allow the tuning structure to disable LDP instruction
> formation
> > > > >  from combining instructions (e.g., in peephole2).
> > > > >  TODO: Implement fine-grained tuning control for LDP and STP:
> > > > >1. control policies for load and store separately;
> > > > >2. support the following policies:
> > > > >   - default (use what is in the tuning structure)
> > > > >   - always
> > > > >   - never
> > > > >   - aligned (only if the compiler can prove that the
> > > > > load will be aligned to 2 * element_size)  */
> > > > >
> > > > > It provides two new and concrete target-specific command-line
> parameters
> > > > > -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> > > > > to give the ability to control load and store policies seperately
> as
> > > > > stated in part 1 of the TODO.
> > > > >
> > > > > The accepted values for both parameters are:
> > > > > - 0: Use the policy of the tuning structure (default).
> > > > > - 1: Emit ldp/stp regardless of alignment.
> > > > > - 2: Do not emit ldp/stp.
> > > > > - 3: In order to emit ldp/stp, first check if the load/store will
> > > > >   be aligned to 2 * element_size.
> > > >
> > > > Instead of a number, does it make sense to instead use an string
> > > > (ENUM) for this param.
> > > > Also I think using --param is a bad idea if it is going to be
> > > > documented in the user manual.
> > > > Maybe a -m option should be used instead.
> > >
> > > See
> https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631283.html
> > > for the discussion triggering the change from -m... to --param and the
> > > change to using a number instead of a string.
> >
> > That is the opposite of the current GCC practice across all targets.
> > Things like this should be consistent and if one target decides to do
> > it different, then maybe it should NOT.
> > Anyways we should document the correct coding style for options so we
> > don't have these back and forths again.
>
> Kyrylo:
> >  It will have to take a number rather than a string but that should be
> okay, as long as the right values are documented in invoke.texi.
>
> No it does not need to be a number. --param=ranger-debug= does not
> take a number, it takes an enum .
> One of the benefits of moving --param support over to .opt to allow
> more than just numbers even.
>
> Thanks,
> Andrew
>
>
> >
> >
> > Thanks,
> > Andrew
> >
> > >
> > > Thanks,
> > > Philipp.
> > >
> > > >
> > > > Thanks,
> > > > Andrew
> > > >
> > > > >
> > > > > gcc/ChangeLog:
> > > > > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> > > > > appropriate enums for the policies.
> > > > > * config/aarch64/aarch64-tuning-flags.def
> > > > > (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> > > > > options.
> > > > > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> > > > > function to parse ldp-policy parameter.
> > > > > (aarch64_parse_stp_policy): New function to parse
> stp-policy parameter.
> > > > > (aarch64_overrid

Re: [PATCH v3] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-25 Thread Manos Anagnostakis
Hello Andrew,

what you describe was my previous version, but @Kyrylo Tkachov
 prompted me to use -param.

Thank you for taking a look anyway!

Manos Anagnostakis | Compiler Engineer
| E: manos.anagnosta...@vrull.eu

VRULL GmbH | Beatrixgasse 32 1030 Vienna | W: www.vrull.eu

Στις Δευ 25 Σεπ 2023, 22:54 ο χρήστης Andrew Pinski 
έγραψε:

> On Mon, Sep 25, 2023 at 12:50 PM Manos Anagnostakis
>  wrote:
> >
> > This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
> > to provide the requested behaviour for handling ldp and stp:
> >
> >   /* Allow the tuning structure to disable LDP instruction formation
> >  from combining instructions (e.g., in peephole2).
> >  TODO: Implement fine-grained tuning control for LDP and STP:
> >1. control policies for load and store separately;
> >2. support the following policies:
> >   - default (use what is in the tuning structure)
> >   - always
> >   - never
> >   - aligned (only if the compiler can prove that the
> > load will be aligned to 2 * element_size)  */
> >
> > It provides two new and concrete target-specific command-line parameters
> > -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> > to give the ability to control load and store policies seperately as
> > stated in part 1 of the TODO.
> >
> > The accepted values for both parameters are:
> > - 0: Use the policy of the tuning structure (default).
> > - 1: Emit ldp/stp regardless of alignment.
> > - 2: Do not emit ldp/stp.
> > - 3: In order to emit ldp/stp, first check if the load/store will
> >   be aligned to 2 * element_size.
>
> Instead of a number, does it make sense to instead use an string
> (ENUM) for this param.
> Also I think using --param is a bad idea if it is going to be
> documented in the user manual.
> Maybe a -m option should be used instead.
>
> Thanks,
> Andrew
>
> >
> > gcc/ChangeLog:
> > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> > appropriate enums for the policies.
> > * config/aarch64/aarch64-tuning-flags.def
> > (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> > options.
> > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> > function to parse ldp-policy parameter.
> > (aarch64_parse_stp_policy): New function to parse stp-policy
> parameter.
> > (aarch64_override_options_internal): Call parsing functions.
> > (aarch64_operands_ok_for_ldpstp): Add parameter-value check and
> > alignment check and remove superseded ones.
> > (aarch64_operands_adjust_ok_for_ldpstp): Add parameter-value
> check and
> > alignment check and remove superseded ones.
> > * config/aarch64/aarch64.opt: Add options.
> > * doc/invoke.texi: Document the parameters accordingly.
> >
> > gcc/testsuite/ChangeLog:
> > * gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
> > * gcc.target/aarch64/ldp_aligned.c: New test.
> > * gcc.target/aarch64/ldp_always.c: New test.
> > * gcc.target/aarch64/ldp_never.c: New test.
> > * gcc.target/aarch64/stp_aligned.c: New test.
> > * gcc.target/aarch64/stp_always.c: New test.
> > * gcc.target/aarch64/stp_never.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > ---
> > Changes in v3:
> > - Changed command-line options to target-specific parameters
> >   and documented them accordingly in doc/invoke.texi.
> > - Removed ampere1-no_ldp_combine.c test as superseded.
> >
> >  gcc/config/aarch64/aarch64-protos.h   |  24 ++
> >  gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
> >  gcc/config/aarch64/aarch64.cc | 215 +-
> >  gcc/config/aarch64/aarch64.opt|   8 +
> >  gcc/doc/invoke.texi   |  30 +++
> >  .../aarch64/ampere1-no_ldp_combine.c  |  11 -
> >  .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
> >  gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
> >  gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
> >  .../gcc.target/aarch64/stp_aligned.c  |  60 +
> >  gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
> >  gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
> >  12 files changed, 600 insertions(+), 74 deletions(-)
> >  delete mode 100644
> gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_c

[PATCH v3] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-25 Thread Manos Anagnostakis
This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
to provide the requested behaviour for handling ldp and stp:

  /* Allow the tuning structure to disable LDP instruction formation
 from combining instructions (e.g., in peephole2).
 TODO: Implement fine-grained tuning control for LDP and STP:
   1. control policies for load and store separately;
   2. support the following policies:
  - default (use what is in the tuning structure)
  - always
  - never
  - aligned (only if the compiler can prove that the
load will be aligned to 2 * element_size)  */

It provides two new and concrete target-specific command-line parameters
-param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
to give the ability to control load and store policies seperately as
stated in part 1 of the TODO.

The accepted values for both parameters are:
- 0: Use the policy of the tuning structure (default).
- 1: Emit ldp/stp regardless of alignment.
- 2: Do not emit ldp/stp.
- 3: In order to emit ldp/stp, first check if the load/store will
  be aligned to 2 * element_size.

gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (struct tune_params): Add
appropriate enums for the policies.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
options.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
function to parse ldp-policy parameter.
(aarch64_parse_stp_policy): New function to parse stp-policy parameter.
(aarch64_override_options_internal): Call parsing functions.
(aarch64_operands_ok_for_ldpstp): Add parameter-value check and
alignment check and remove superseded ones.
(aarch64_operands_adjust_ok_for_ldpstp): Add parameter-value check and
alignment check and remove superseded ones.
* config/aarch64/aarch64.opt: Add options.
* doc/invoke.texi: Document the parameters accordingly.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
* gcc.target/aarch64/ldp_aligned.c: New test.
* gcc.target/aarch64/ldp_always.c: New test.
* gcc.target/aarch64/ldp_never.c: New test.
* gcc.target/aarch64/stp_aligned.c: New test.
* gcc.target/aarch64/stp_always.c: New test.
* gcc.target/aarch64/stp_never.c: New test.

Signed-off-by: Manos Anagnostakis 
---
Changes in v3:
- Changed command-line options to target-specific parameters
  and documented them accordingly in doc/invoke.texi.
- Removed ampere1-no_ldp_combine.c test as superseded.

 gcc/config/aarch64/aarch64-protos.h   |  24 ++
 gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
 gcc/config/aarch64/aarch64.cc | 215 +-
 gcc/config/aarch64/aarch64.opt|   8 +
 gcc/doc/invoke.texi   |  30 +++
 .../aarch64/ampere1-no_ldp_combine.c  |  11 -
 .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
 .../gcc.target/aarch64/stp_aligned.c  |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
 12 files changed, 600 insertions(+), 74 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 70303d6fd95..be1d73490ed 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -568,6 +568,30 @@ struct tune_params
   /* Place prefetch struct pointer at the end to enable type checking
  errors when tune_params misses elements (e.g., from erroneous merges).  */
   const struct cpu_prefetch_tune *prefetch;
+/* An enum specifying how to handle load pairs using a fine-grained policy:
+   - LDP_POLICY_ALIGNED: Emit ldp if the source pointer is aligned
+   to at least double the alignment of the type.
+   - LDP_POLICY_ALWAYS: Emit ldp regardless of alignment.
+   - LDP_POLICY_NEVER: Do not emit ldp.  */
+
+  enum aarch64_ldp_policy_model
+  {
+LDP_POLICY_ALIGNED,
+LDP_POLICY_ALWAYS,
+LDP_POLICY_NEVER
+  } ldp_policy_model;
+/* An enum specifying how to handle store pairs using a fine-grained policy:
+   - STP_POLICY_ALIGNED: Emit stp if the source pointer is aligned
+   to at least

Re: [PATCH] aarch64: Fine-grained ldp and stp policies with test-cases.

2023-09-25 Thread Manos Anagnostakis
Thanks for the feedback, Kyrill.

I'll resend it as a V3. I believe you have also checked V2 containing just
a small test adjustment.

Manos Anagnostakis | Compiler Engineer
| E: manos.anagnosta...@vrull.eu

VRULL GmbH | Beatrixgasse 32 1030 Vienna | W: www.vrull.eu

Στις Δευ 25 Σεπ 2023, 13:59 ο χρήστης Kyrylo Tkachov 
έγραψε:

> Hi Manos,
>
> Apologies for the long delay.
>
> > -Original Message-----
> > From: Manos Anagnostakis 
> > Sent: Friday, August 18, 2023 8:50 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Kyrylo Tkachov ; Philipp Tomsich
> > ; Manos Anagnostakis
> > 
> > Subject: [PATCH] aarch64: Fine-grained ldp and stp policies with
> test-cases.
> >
> > This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
> > to provide the requested behaviour for handling ldp and stp:
> >
> >   /* Allow the tuning structure to disable LDP instruction formation
> >  from combining instructions (e.g., in peephole2).
> >  TODO: Implement fine-grained tuning control for LDP and STP:
> >1. control policies for load and store separately;
> >2. support the following policies:
> >   - default (use what is in the tuning structure)
> >   - always
> >   - never
> >   - aligned (only if the compiler can prove that the
> > load will be aligned to 2 * element_size)  */
> >
> > It provides two new and concrete command-line options -mldp-policy and -
> > mstp-policy
> > to give the ability to control load and store policies seperately as
> > stated in part 1 of the TODO.
> >
> > The accepted values for both options are:
> > - default: Use the ldp/stp policy defined in the corresponding tuning
> >   structure.
> > - always: Emit ldp/stp regardless of alignment.
> > - never: Do not emit ldp/stp.
> > - aligned: In order to emit ldp/stp, first check if the load/store will
> >   be aligned to 2 * element_size.
> >
> > gcc/ChangeLog:
> > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> >   appropriate enums for the policies.
> > * config/aarch64/aarch64-tuning-flags.def
> >   (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> >   options.
> > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> >   function to parse ldp-policy option.
> > (aarch64_parse_stp_policy): New function to parse stp-policy
> option.
> > (aarch64_override_options_internal): Call parsing functions.
> > (aarch64_operands_ok_for_ldpstp): Add option-value check and
> >   alignment check and remove superseded ones
> > (aarch64_operands_adjust_ok_for_ldpstp): Add option-value check
> and
> >   alignment check and remove superseded ones.
> > * config/aarch64/aarch64.opt: Add options.
> >
> > gcc/testsuite/ChangeLog:
> > * gcc.target/aarch64/ldp_aligned.c: New test.
> > * gcc.target/aarch64/ldp_always.c: New test.
> > * gcc.target/aarch64/ldp_never.c: New test.
> > * gcc.target/aarch64/stp_aligned.c: New test.
> > * gcc.target/aarch64/stp_always.c: New test.
> > * gcc.target/aarch64/stp_never.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > ---
> >
> >  gcc/config/aarch64/aarch64-protos.h   |  24 ++
> >  gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
> >  gcc/config/aarch64/aarch64.cc | 229 ++
> >  gcc/config/aarch64/aarch64.opt|   8 +
> >  .../gcc.target/aarch64/ldp_aligned.c  |  64 +
> >  gcc/testsuite/gcc.target/aarch64/ldp_always.c |  64 +
> >  gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  64 +
> >  .../gcc.target/aarch64/stp_aligned.c  |  60 +
> >  gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
> >  gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
> >  10 files changed, 580 insertions(+), 61 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-protos.h
> > b/gcc/config/aarch64/aarch64-protos.h
> > index 70303d6fd95

[PING] [PATCH v2] aarch64: Fine-grained ldp and stp policies with test-cases.

2023-09-22 Thread Manos Anagnostakis
Kind ping for reviewing this patch. It's tested and does not cause
regressions:
https://patchwork.sourceware.org/project/gcc/patch/20230828143744.7574-1-manos.anagnosta...@vrull.eu/

Thank you in advance!

On Mon, Aug 28, 2023 at 5:37 PM Manos Anagnostakis <
manos.anagnosta...@vrull.eu> wrote:

> This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
> to provide the requested behaviour for handling ldp and stp:
>
>   /* Allow the tuning structure to disable LDP instruction formation
>  from combining instructions (e.g., in peephole2).
>  TODO: Implement fine-grained tuning control for LDP and STP:
>1. control policies for load and store separately;
>2. support the following policies:
>   - default (use what is in the tuning structure)
>   - always
>   - never
>   - aligned (only if the compiler can prove that the
> load will be aligned to 2 * element_size)  */
>
> It provides two new and concrete command-line options -mldp-policy and
> -mstp-policy
> to give the ability to control load and store policies seperately as
> stated in part 1 of the TODO.
>
> The accepted values for both options are:
> - default: Use the ldp/stp policy defined in the corresponding tuning
>   structure.
> - always: Emit ldp/stp regardless of alignment.
> - never: Do not emit ldp/stp.
> - aligned: In order to emit ldp/stp, first check if the load/store will
>   be aligned to 2 * element_size.
>
> gcc/ChangeLog:
> * config/aarch64/aarch64-protos.h (struct tune_params): Add
> appropriate enums for the policies.
> * config/aarch64/aarch64-tuning-flags.def
> (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> options.
> * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> function to parse ldp-policy option.
> (aarch64_parse_stp_policy): New function to parse stp-policy
> option.
> (aarch64_override_options_internal): Call parsing functions.
> (aarch64_operands_ok_for_ldpstp): Add option-value check and
> alignment check and remove superseded ones
> (aarch64_operands_adjust_ok_for_ldpstp): Add option-value check and
> alignment check and remove superseded ones.
> * config/aarch64/aarch64.opt: Add options.
>
> gcc/testsuite/ChangeLog:
> * gcc.target/aarch64/ldp_aligned.c: New test.
> * gcc.target/aarch64/ldp_always.c: New test.
> * gcc.target/aarch64/ldp_never.c: New test.
> * gcc.target/aarch64/stp_aligned.c: New test.
> * gcc.target/aarch64/stp_always.c: New test.
> * gcc.target/aarch64/stp_never.c: New test.
>
> Signed-off-by: Manos Anagnostakis 
> ---
> Changes in v2:
> - Fixed commited ldp tests to correctly trigger
>   and test aarch64_operands_adjust_ok_for_ldpstp in aarch64.cc.
> - Added "-mcpu=generic" to commited tests to guarantee generic
> target code
>   generation and not cause the regressions of v1.
>
>  gcc/config/aarch64/aarch64-protos.h   |  24 ++
>  gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
>  gcc/config/aarch64/aarch64.cc | 229 ++
>  gcc/config/aarch64/aarch64.opt|   8 +
>  .../gcc.target/aarch64/ldp_aligned.c  |  66 +
>  gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 +
>  gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 +
>  .../gcc.target/aarch64/stp_aligned.c  |  60 +
>  gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
>  gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
>  10 files changed, 586 insertions(+), 61 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h
> b/gcc/config/aarch64/aarch64-protos.h
> index 70303d6fd95..be1d73490ed 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -568,6 +568,30 @@ struct tune_params
>/* Place prefetch struct pointer at the end to enable type checking
>   errors when tune_params misses elements (e.g., from erroneous
> merges).  */
>const struct cpu_prefetch_tune *prefetch;
> +/* An enum specifying how to handle load pairs using a fine-grained
> policy:
> +   - L

[PATCH v2] aarch64: Fine-grained ldp and stp policies with test-cases.

2023-08-28 Thread Manos Anagnostakis
This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
to provide the requested behaviour for handling ldp and stp:

  /* Allow the tuning structure to disable LDP instruction formation
 from combining instructions (e.g., in peephole2).
 TODO: Implement fine-grained tuning control for LDP and STP:
   1. control policies for load and store separately;
   2. support the following policies:
  - default (use what is in the tuning structure)
  - always
  - never
  - aligned (only if the compiler can prove that the
load will be aligned to 2 * element_size)  */

It provides two new and concrete command-line options -mldp-policy and 
-mstp-policy
to give the ability to control load and store policies seperately as
stated in part 1 of the TODO.

The accepted values for both options are:
- default: Use the ldp/stp policy defined in the corresponding tuning
  structure.
- always: Emit ldp/stp regardless of alignment.
- never: Do not emit ldp/stp.
- aligned: In order to emit ldp/stp, first check if the load/store will
  be aligned to 2 * element_size.

gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (struct tune_params): Add
appropriate enums for the policies.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
options.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
function to parse ldp-policy option.
(aarch64_parse_stp_policy): New function to parse stp-policy option.
(aarch64_override_options_internal): Call parsing functions.
(aarch64_operands_ok_for_ldpstp): Add option-value check and
alignment check and remove superseded ones
(aarch64_operands_adjust_ok_for_ldpstp): Add option-value check and
alignment check and remove superseded ones.
* config/aarch64/aarch64.opt: Add options.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ldp_aligned.c: New test.
* gcc.target/aarch64/ldp_always.c: New test.
* gcc.target/aarch64/ldp_never.c: New test.
* gcc.target/aarch64/stp_aligned.c: New test.
* gcc.target/aarch64/stp_always.c: New test.
* gcc.target/aarch64/stp_never.c: New test.

Signed-off-by: Manos Anagnostakis 
---
Changes in v2:
- Fixed commited ldp tests to correctly trigger
  and test aarch64_operands_adjust_ok_for_ldpstp in aarch64.cc.
- Added "-mcpu=generic" to commited tests to guarantee generic target 
code
  generation and not cause the regressions of v1.

 gcc/config/aarch64/aarch64-protos.h   |  24 ++
 gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
 gcc/config/aarch64/aarch64.cc | 229 ++
 gcc/config/aarch64/aarch64.opt|   8 +
 .../gcc.target/aarch64/ldp_aligned.c  |  66 +
 gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 +
 gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 +
 .../gcc.target/aarch64/stp_aligned.c  |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
 10 files changed, 586 insertions(+), 61 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 70303d6fd95..be1d73490ed 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -568,6 +568,30 @@ struct tune_params
   /* Place prefetch struct pointer at the end to enable type checking
  errors when tune_params misses elements (e.g., from erroneous merges).  */
   const struct cpu_prefetch_tune *prefetch;
+/* An enum specifying how to handle load pairs using a fine-grained policy:
+   - LDP_POLICY_ALIGNED: Emit ldp if the source pointer is aligned
+   to at least double the alignment of the type.
+   - LDP_POLICY_ALWAYS: Emit ldp regardless of alignment.
+   - LDP_POLICY_NEVER: Do not emit ldp.  */
+
+  enum aarch64_ldp_policy_model
+  {
+LDP_POLICY_ALIGNED,
+LDP_POLICY_ALWAYS,
+LDP_POLICY_NEVER
+  } ldp_policy_model;
+/* An enum specifying how to handle store pairs using a fine-grained policy:
+   - STP_POLICY_ALIGNED: Emit stp if the source pointer is aligned
+   to at least double the alignment of the type.
+   - STP_POLICY_ALWAYS: Emit stp regardless of alignment.
+   - STP_POLICY_NEVER: Do not emit stp.  */
+
+  enum aarch64_stp_policy_model
+  {
+STP_POLICY_ALIGNED,
+STP_POLICY_ALWAYS,
+STP_PO

[PATCH] aarch64: Fine-grained ldp and stp policies with test-cases.

2023-08-18 Thread Manos Anagnostakis
This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
to provide the requested behaviour for handling ldp and stp:

  /* Allow the tuning structure to disable LDP instruction formation
 from combining instructions (e.g., in peephole2).
 TODO: Implement fine-grained tuning control for LDP and STP:
   1. control policies for load and store separately;
   2. support the following policies:
  - default (use what is in the tuning structure)
  - always
  - never
  - aligned (only if the compiler can prove that the
load will be aligned to 2 * element_size)  */

It provides two new and concrete command-line options -mldp-policy and 
-mstp-policy
to give the ability to control load and store policies seperately as
stated in part 1 of the TODO.

The accepted values for both options are:
- default: Use the ldp/stp policy defined in the corresponding tuning
  structure.
- always: Emit ldp/stp regardless of alignment.
- never: Do not emit ldp/stp.
- aligned: In order to emit ldp/stp, first check if the load/store will
  be aligned to 2 * element_size.

gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (struct tune_params): Add
appropriate enums for the policies.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
options.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
function to parse ldp-policy option.
(aarch64_parse_stp_policy): New function to parse stp-policy option.
(aarch64_override_options_internal): Call parsing functions.
(aarch64_operands_ok_for_ldpstp): Add option-value check and
alignment check and remove superseded ones
(aarch64_operands_adjust_ok_for_ldpstp): Add option-value check and
alignment check and remove superseded ones.
* config/aarch64/aarch64.opt: Add options.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ldp_aligned.c: New test.
* gcc.target/aarch64/ldp_always.c: New test.
* gcc.target/aarch64/ldp_never.c: New test.
* gcc.target/aarch64/stp_aligned.c: New test.
* gcc.target/aarch64/stp_always.c: New test.
* gcc.target/aarch64/stp_never.c: New test.

Signed-off-by: Manos Anagnostakis 
---

 gcc/config/aarch64/aarch64-protos.h   |  24 ++
 gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
 gcc/config/aarch64/aarch64.cc | 229 ++
 gcc/config/aarch64/aarch64.opt|   8 +
 .../gcc.target/aarch64/ldp_aligned.c  |  64 +
 gcc/testsuite/gcc.target/aarch64/ldp_always.c |  64 +
 gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  64 +
 .../gcc.target/aarch64/stp_aligned.c  |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
 10 files changed, 580 insertions(+), 61 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 70303d6fd95..be1d73490ed 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -568,6 +568,30 @@ struct tune_params
   /* Place prefetch struct pointer at the end to enable type checking
  errors when tune_params misses elements (e.g., from erroneous merges).  */
   const struct cpu_prefetch_tune *prefetch;
+/* An enum specifying how to handle load pairs using a fine-grained policy:
+   - LDP_POLICY_ALIGNED: Emit ldp if the source pointer is aligned
+   to at least double the alignment of the type.
+   - LDP_POLICY_ALWAYS: Emit ldp regardless of alignment.
+   - LDP_POLICY_NEVER: Do not emit ldp.  */
+
+  enum aarch64_ldp_policy_model
+  {
+LDP_POLICY_ALIGNED,
+LDP_POLICY_ALWAYS,
+LDP_POLICY_NEVER
+  } ldp_policy_model;
+/* An enum specifying how to handle store pairs using a fine-grained policy:
+   - STP_POLICY_ALIGNED: Emit stp if the source pointer is aligned
+   to at least double the alignment of the type.
+   - STP_POLICY_ALWAYS: Emit stp regardless of alignment.
+   - STP_POLICY_NEVER: Do not emit stp.  */
+
+  enum aarch64_stp_policy_model
+  {
+STP_POLICY_ALIGNED,
+STP_POLICY_ALWAYS,
+STP_POLICY_NEVER
+  } stp_policy_model;
 };
 
 /* Classifies an address.
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
b/gcc/config/aarch64/aarch64-tuning-flags.def
index 52112ba7c48..774568e9106 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def