[Patch, rs6000] Improve loop_unroll_adjust

2024-07-24 Thread Ajit Agarwal
Hello All:

This patch improves loop_unroll_adjust by adding mem count to calculate
unroll factor.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit

rs6000: Improve loop_unroll_adjust

Improves loop_unroll_adjust by adding mem count to calculate
unroll factor.

2024-07-24  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000.cc: Improve loop_unroll_adjust
by adding mem count to calculate unroll factor.
---
 gcc/config/rs6000/rs6000.cc | 29 -
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 0bcc6a2d0ab..3dd3857a74e 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -81,6 +81,7 @@
 #include "ppc-auxv.h"
 #include "rs6000-internal.h"
 #include "opts.h"
+#include "rtl-iter.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -5570,7 +5571,11 @@ rs6000_cost_data::finish_cost (const vector_costs 
*scalar_costs)
 static unsigned
 rs6000_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
 {
-   if (unroll_only_small_loops)
+  basic_block *bbs;
+  rtx_insn *insn;
+  unsigned i;
+  unsigned mem_count = 0;
+  if (unroll_only_small_loops)
 {
   /* TODO: These are hardcoded values right now.  We probably should use
 a PARAM here.  */
@@ -5582,6 +5587,28 @@ rs6000_loop_unroll_adjust (unsigned nunroll, struct loop 
*loop)
   return 0;
 }
 
+   /* Count the number of memory references within the loop body. */
+  subrtx_iterator::array_type array;
+  bbs = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+FOR_BB_INSNS (bbs[i], insn)
+  if (NONDEBUG_INSN_P (insn))
+   FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
+ if (const_rtx x = *iter)
+   if (MEM_P (x))
+ {
+   machine_mode mode = GET_MODE (x);
+   unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+   if (n_words > 4)
+ mem_count += 2;
+   else
+ mem_count += 1;
+ }
+  free (bbs);
+
+  if (mem_count && mem_count <=32)
+return MIN (nunroll, 32 / mem_count);
+
   return nunroll;
 }
 
-- 
2.43.5



[Patch, rs6000] Improve suggested unroll factor in finish_cost

2024-07-22 Thread Ajit Agarwal
Hello All:

This patch improves determine_suggested_unroll_factor in finish_cost
with a reduction factor of loads/stores/non_load_stores.

Return unroll factor calculated as per reduction factor
with number of loads/stores/non_load_stores (general_ops).

Bootstrapped and regtested on powerpc64-linux-gnu.

Expected gains with spec 2017 benchmarks.

Thanks & Regards
Ajit


rs6000: Improve suggested unroll factor in finish_cost

Improve determine_suggested_unroll_factor in finish_cost
with reduction factor of loads/stores/non_load_stores.

Return unroll factor calculated as per reduction factor
with number of loads/stores/non_load_stores (general_ops).

2024-07-22  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000.cc: Improve
determine_suggested_unroll_factor with reduction factor
of load/stores/general_ops.
---
 gcc/config/rs6000/rs6000.cc | 60 ++---
 1 file changed, 16 insertions(+), 44 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 5ed64b1e686..0d69ec4cfbe 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -5458,7 +5458,6 @@ rs6000_cost_data::adjust_vect_cost_per_loop 
(loop_vec_info loop_vinfo)
 - estimated iteration count when iteration count is unknown;
 */
 
-
 unsigned int
 rs6000_cost_data::determine_suggested_unroll_factor (loop_vec_info loop_vinfo)
 {
@@ -5483,53 +5482,26 @@ rs6000_cost_data::determine_suggested_unroll_factor 
(loop_vec_info loop_vinfo)
   unsigned int issue_width = rs6000_vect_unroll_issue;
   unsigned int uf = CEIL (reduc_factor * issue_width, nstmts_nonldst);
   uf = MIN ((unsigned int) rs6000_vect_unroll_limit, uf);
-  /* Make sure it is power of 2.  */
-  uf = 1 << ceil_log2 (uf);
+  unsigned int temp;
 
-  /* If the iteration count is known, the costing would be exact enough,
- don't worry it could be worse.  */
-  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
-return uf;
-
-  /* Inspired by SPEC2017 parest_r, we want to aggressively unroll the
- loop if either condition is satisfied:
-   - reduction factor exceeds the threshold;
-   - emulated gather load adopted.  */
-  if (reduc_factor > (unsigned int) rs6000_vect_unroll_reduc_threshold
-  || m_gather_load)
-return uf;
-
-  /* Check if we can conclude it's good to unroll from the estimated
- iteration count.  */
-  HOST_WIDE_INT est_niter = get_estimated_loop_iterations_int (loop);
-  unsigned int vf = vect_vf_for_cost (loop_vinfo);
-  unsigned int unrolled_vf = vf * uf;
-  if (est_niter == -1 || est_niter < unrolled_vf)
-/* When the estimated iteration of this loop is unknown, it's possible
-   that we are able to vectorize this loop with the original VF but fail
-   to vectorize it with the unrolled VF any more if the actual iteration
-   count is in between.  */
-return 1;
-  else
+  if (m_nstores > 0)
 {
-  unsigned int epil_niter_unr = est_niter % unrolled_vf;
-  unsigned int epil_niter = est_niter % vf;
-  /* Even if we have partial vector support, it can be still inefficent
-to calculate the length when the iteration count is unknown, so
-only expect it's good to unroll when the epilogue iteration count
-is not bigger than VF (only one time length calculation).  */
-  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
- && epil_niter_unr <= vf)
-   return uf;
-  /* Without partial vector support, conservatively unroll this when
-the epilogue iteration count is less than the original one
-(epilogue execution time wouldn't be longer than before).  */
-  else if (!LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
-  && epil_niter_unr <= epil_niter)
-   return uf;
+  temp = CEIL (reduc_factor * rs6000_vect_unroll_issue,
+  m_nstores);
+  uf = MIN (uf, temp);
 }
 
-  return 1;
+  if (m_nloads > 0)
+{
+  temp = CEIL (reduc_factor * rs6000_vect_unroll_issue,
+  m_nloads + m_nstores);
+  uf = MIN (uf, temp);
+}
+
+  /* Make sure it is power of 2.  */
+  uf = 1 << ceil_log2 (uf);
+
+  return uf;
 }
 
 void
-- 
2.43.5



Re: [Patch, rs6000, middle-end] v6: Add implementation for different targets for pair mem fusion

2024-07-19 Thread Ajit Agarwal
Hello Richard:

On 18/07/24 4:44 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> [...]
>>>> +// Set subreg for OO mode pair to generate sequential registers given
>>>> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false
>>>> +// if store insn.
>>>> +void
>>>> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
>>>> +bool load_p)
>>>> +{
>>>> +  if (load_p)
>>>> +{
>>>> +  bool i1_subreg_p = use_has_subreg_p (i1);
>>>> +  bool i2_subreg_p = use_has_subreg_p (i2);
>>>> +
>>>> +  if (i1_subreg_p || i2_subreg_p)
>>>> +  set_multiword_existing_subreg (i1, i2);
>>>> +  else
>>>> +  set_multiword_subreg_load (i1, i2);
>>>
>>> I don't understand this.  Why do we have both set_multiword_existing_subreg
>>> and set_multiword_subreg_load?  i1_subreg_p and i2_subreg_p are logically
>>> independent of one another (since i1 and i2 were separate instructions
>>> until now).  So "i1_subreg_p || i2_subreg_p" implies that
>>> set_multiword_existing_subreg can handle i1s that have no existing
>>> subreg (used when i2_subreg_p) and that it can handle i2s that have no
>>> existing subreg (used when i1_subreg_p).  So doesn't this mean that
>>> set_multiword_existing_subreg can handle everything?
>>>
>>
>> I will make the following change.
>>  if (load_p)
>> {
>>   bool i1_subreg_p = use_has_subreg_p (i1);
>>   bool i2_subreg_p = use_has_subreg_p (i2);
>>
>>   if (!i1_subreg_p && !i2_subreg_p) 
>> set_multiword_subreg_load (i1, i2);
>>   else
>> set_multiword_existing_subreg (i1, i2);
>> }
>>
>> Is this okay.
> 
> That's the same thing though: it's just replacing a ? A : B with !a ? B : A.
> 

Addressed in v7 of the patch.

>>> IMO, the way the update should work is that:
>>>
>>> (a) all references to the old registers should be updated via
>>> insn_propagation (regardless of whether the old references
>>> involved subregs).
>>>
>>> (b) those updates should be part of the same insn_change group as
>>> the change to the load itself.
>>>
>>> For stores, definitions of the stored register can probably be handled
>>> directly using df_refs, but there too, the updates should IMO be part
>>> of the same insn_change group as the change to the store itself.
>>>
>>> In both cases, it's the:
>>>
>>>   crtl->ssa->change_insns (changes);
>>>
>>> in pair_fusion_bb_info::fuse_pair that should be responsible for
>>> updating the rtl-ssa IR.  The changes that the pass wants to make
>>> should be described as insn_changes and passed to change_insns.
>>>
>>> The reason for funneling all changes through change_insns is that
>>> it allows rtl-ssa to maintain more complex datastructures.  Clients
>>> aren't supposed to manually update the datastructures piecemeal.
>>>
>>
>> I am afraid I am not getting this. Would you mind elaborating this.
>> Sorry for that.
> 
> See how fwprop.cc makes changes.  It:
> 
> - creates an insn_change for each change that it wants to make
> 
> - uses insn_propagation to replace the old value with the new value
> 
> - sets the new_uses of the insn_change to reflect the effect
>   of the propagation (in this case, replacing the old 128-bit
>   value with a 256-bit value)
> 
> - uses change_insn to commit the change
> 
> The process would be similar here.
>

Addressed in v7 of the patch.
 
> Thanks,
> Richard

Thanks & Regards
Ajit


[Patch, rs6000, middle-end] v7: Add implementation for different targets for pair mem fusion

2024-07-19 Thread Ajit Agarwal
Hello Richard:

All comments are addressed.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-07-19  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function added for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config/rs6000/mma.md: Modify movoo machine description.
Add new machine description movoo1.
* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
to expand movoo machine description for all constraints.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/functions.h: Move out allocate function from private
to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/mma.md  |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc| 746 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/rs6000.cc   |  58 +-
 gcc/config/rs6000/rs6000.md   |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  32 +-
 gcc/pair-fusion.h |  48 ++
 gcc/rtl-ssa/functions.h   |  11 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 14 files changed, 946 insertions(+), 29 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index bc45615741b..12f79a78177 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 04e2d0066df..88413926a02 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -294,7 +294,31 @@
 
 (define_insn_and_split "*movoo"
   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
-   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+(match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+  "TARGET_MMA
+   && (gpc_reg_operand (operands[0], OOmode)
+   || gpc_reg_operand (operands[1], OOmode))"
+;;""
+  "@
+   #
+   #
+   #"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecstore,veclogical")
+   (set_attr "length" "*,*,8")])
+;;   (set_attr "max_prefixed_insns" "2,2,*")])
+
+
+(define_insn_and_split "*movoo1"
+  [(set (match_operand:OO 0 

Re: [Patch, rs6000, middle-end] v6: Add implementation for different targets for pair mem fusion

2024-07-18 Thread Ajit Agarwal
Hello Richard:

On 18/07/24 2:04 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 18/07/24 1:17 am, Richard Sandiford wrote:
>> Ajit Agarwal  writes:
>>> Hello All:
>>>
>>> This version of patch relaxes store fusion for more use cases.
>>>
>>> Common infrastructure using generic code for pair mem fusion of different
>>> targets.
>>>
>>> rs6000 target specific code implement virtual functions defined by generic 
>>> code.
>>>
>>> Target specific code are added in rs6000-mem-fusion.cc.
>>>
>>> Bootstrapped and regtested on powerpc64-linux-gnu.
>>>
>>> Thanks & Regards
>>> Ajit
>>>
>>>
>>> rs6000, middle-end: Add implementation for different targets for pair mem 
>>> fusion
>>>
>>> Common infrastructure using generic code for pair mem fusion of different
>>> targets.
>>>
>>> rs6000 target specific code implement virtual functions defined by generic 
>>> code.
>>>
>>> Target specific code are added in rs6000-mem-fusion.cc.
>>>
>>> 2024-07-02  Ajit Kumar Agarwal  
>>>
>>> gcc/ChangeLog:
>>>
>>> * config/rs6000/rs6000-passes.def: New mem fusion pass
>>> before pass_early_remat.
>>> * pair-fusion.h: Add additional pure virtual function
>>> required for rs6000 target implementation.
>>> * pair-fusion.cc: Use of virtual functions for additional
>>> virtual function addded for rs6000 target.
>>> * config/rs6000/rs6000-mem-fusion.cc: Add new pass.
>>> Add target specific implementation for generic pure virtual
>>> functions.
>>> * config/rs6000/mma.md: Modify movoo machine description.
>>> Add new machine description movoo1.
>>> * config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
>>> to expand movoo machine description for all constraints.
>>> * config.gcc: Add new object file.
>>> * config/rs6000/rs6000-protos.h: Add new prototype for mem
>>> fusion pass.
>>> * config/rs6000/t-rs6000: Add new rule.
>>> * rtl-ssa/functions.h: Move out allocate function from private
>>> to public and add get_m_temp_defs function.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> * g++.target/powerpc/mem-fusion.C: New test.
>>> * g++.target/powerpc/mem-fusion-1.C: New test.
>>> * gcc.target/powerpc/mma-builtin-1.c: Modify test.
>>> ---
>>>  gcc/config.gcc|   2 +
>>>  gcc/config/rs6000/mma.md  |  26 +-
>>>  gcc/config/rs6000/rs6000-mem-fusion.cc| 708 ++
>>>  gcc/config/rs6000/rs6000-passes.def   |   4 +-
>>>  gcc/config/rs6000/rs6000-protos.h |   1 +
>>>  gcc/config/rs6000/rs6000.cc   |  57 +-
>>>  gcc/config/rs6000/rs6000.md   |   1 +
>>>  gcc/config/rs6000/t-rs6000|   5 +
>>>  gcc/pair-fusion.cc|  27 +-
>>>  gcc/pair-fusion.h |  34 +
>>>  gcc/rtl-ssa/functions.h   |  11 +-
>>>  .../g++.target/powerpc/mem-fusion-1.C |  22 +
>>>  gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
>>>  .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
>>>  14 files changed, 890 insertions(+), 27 deletions(-)
>>>  create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
>>>  create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>>>  create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C
>>>
>>> diff --git a/gcc/config.gcc b/gcc/config.gcc
>>> index bc45615741b..12f79a78177 100644
>>> --- a/gcc/config.gcc
>>> +++ b/gcc/config.gcc
>>> @@ -524,6 +524,7 @@ powerpc*-*-*)
>>> extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
>>> extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
>>> extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
>>> +   extra_objs="${extra_objs} rs6000-mem-fusion.o"
>>> extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
>>> extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
>>> extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
>>> @@ -560,6 +561,7 @@ rs6000*-*-*)
>>> extra_options="${extra_o

Re: [Patch, rs6000, middle-end] v6: Add implementation for different targets for pair mem fusion

2024-07-18 Thread Ajit Agarwal
Hello Richard:

On 18/07/24 1:17 am, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello All:
>>
>> This version of patch relaxes store fusion for more use cases.
>>
>> Common infrastructure using generic code for pair mem fusion of different
>> targets.
>>
>> rs6000 target specific code implement virtual functions defined by generic 
>> code.
>>
>> Target specific code are added in rs6000-mem-fusion.cc.
>>
>> Bootstrapped and regtested on powerpc64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>> rs6000, middle-end: Add implementation for different targets for pair mem 
>> fusion
>>
>> Common infrastructure using generic code for pair mem fusion of different
>> targets.
>>
>> rs6000 target specific code implement virtual functions defined by generic 
>> code.
>>
>> Target specific code are added in rs6000-mem-fusion.cc.
>>
>> 2024-07-02  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/rs6000/rs6000-passes.def: New mem fusion pass
>>  before pass_early_remat.
>>  * pair-fusion.h: Add additional pure virtual function
>>  required for rs6000 target implementation.
>>  * pair-fusion.cc: Use of virtual functions for additional
>>  virtual function addded for rs6000 target.
>>  * config/rs6000/rs6000-mem-fusion.cc: Add new pass.
>>  Add target specific implementation for generic pure virtual
>>  functions.
>>  * config/rs6000/mma.md: Modify movoo machine description.
>>  Add new machine description movoo1.
>>  * config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
>>  to expand movoo machine description for all constraints.
>>  * config.gcc: Add new object file.
>>  * config/rs6000/rs6000-protos.h: Add new prototype for mem
>>  fusion pass.
>>  * config/rs6000/t-rs6000: Add new rule.
>>  * rtl-ssa/functions.h: Move out allocate function from private
>>  to public and add get_m_temp_defs function.
>>
>> gcc/testsuite/ChangeLog:
>>
>>  * g++.target/powerpc/mem-fusion.C: New test.
>>  * g++.target/powerpc/mem-fusion-1.C: New test.
>>  * gcc.target/powerpc/mma-builtin-1.c: Modify test.
>> ---
>>  gcc/config.gcc|   2 +
>>  gcc/config/rs6000/mma.md  |  26 +-
>>  gcc/config/rs6000/rs6000-mem-fusion.cc| 708 ++
>>  gcc/config/rs6000/rs6000-passes.def   |   4 +-
>>  gcc/config/rs6000/rs6000-protos.h |   1 +
>>  gcc/config/rs6000/rs6000.cc   |  57 +-
>>  gcc/config/rs6000/rs6000.md   |   1 +
>>  gcc/config/rs6000/t-rs6000|   5 +
>>  gcc/pair-fusion.cc|  27 +-
>>  gcc/pair-fusion.h |  34 +
>>  gcc/rtl-ssa/functions.h   |  11 +-
>>  .../g++.target/powerpc/mem-fusion-1.C |  22 +
>>  gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
>>  .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
>>  14 files changed, 890 insertions(+), 27 deletions(-)
>>  create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
>>  create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>>  create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C
>>
>> diff --git a/gcc/config.gcc b/gcc/config.gcc
>> index bc45615741b..12f79a78177 100644
>> --- a/gcc/config.gcc
>> +++ b/gcc/config.gcc
>> @@ -524,6 +524,7 @@ powerpc*-*-*)
>>  extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
>>  extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
>>  extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
>> +extra_objs="${extra_objs} rs6000-mem-fusion.o"
>>  extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
>>  extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
>>  extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
>> @@ -560,6 +561,7 @@ rs6000*-*-*)
>>  extra_options="${extra_options} g.opt fused-madd.opt 
>> rs6000/rs6000-tables.opt"
>>  extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
>>  extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
>> +extra_objs="${extra_objs} rs6000-mem-fusion.o"
>>  target_gtfiles="$target_gtfiles 
>> \$(srcdir)/config/rs6000/rs

[PING^1][Patch, rs6000, middle-end] v6: Add implementation for different targets for pair mem fusion

2024-07-16 Thread Ajit Agarwal
Ping^1. Ok to install?

Thanks & Regards
Ajit


 Forwarded Message 
Subject: [PING^0][Patch, rs6000, middle-end] v6: Add implementation for 
different targets for pair mem fusion
Date: Mon, 8 Jul 2024 07:55:19 +0530
From: Ajit Agarwal 
To: Alex Coplan , Richard Sandiford 
, Kewen.Lin , Segher 
Boessenkool , Michael Meissner 
, Peter Bergner , David Edelsohn 
, gcc-patches 

Ping ! Please let me know OK for trunk.

Thanks & Regards
Ajit


 Forwarded Message 
Subject: [Patch, rs6000, middle-end] v6: Add implementation for different 
targets for pair mem fusion
Date: Tue, 2 Jul 2024 14:15:02 +0530
From: Ajit Agarwal 
To: Alex Coplan , Richard Sandiford 
, Kewen.Lin , Segher 
Boessenkool , Michael Meissner 
, Peter Bergner , David Edelsohn 
, gcc-patches 

Hello All:

This version of patch relaxes store fusion for more use cases.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-07-02  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function added for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config/rs6000/mma.md: Modify movoo machine description.
Add new machine description movoo1.
* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
to expand movoo machine description for all constraints.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/functions.h: Move out allocate function from private
to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/mma.md  |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc| 708 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/rs6000.cc   |  57 +-
 gcc/config/rs6000/rs6000.md   |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  27 +-
 gcc/pair-fusion.h |  34 +
 gcc/rtl-ssa/functions.h   |  11 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 14 files changed, 890 insertions(+), 27 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index bc45615741b..12f79a78177 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
t

Re: [Patch, tree-optimization, predcom] Improve unroll factor for predictive commoning

2024-07-15 Thread Ajit Agarwal
Hello Richard:

On 13/07/24 8:16 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 12/07/24 6:20 pm, Richard Biener wrote:
>> On Fri, Jul 12, 2024 at 12:09 PM Ajit Agarwal  wrote:
>>>
>>> Hello Richard:
>>>
>>> On 11/07/24 2:21 pm, Richard Biener wrote:
>>>> On Thu, Jul 11, 2024 at 10:30 AM Ajit Agarwal  
>>>> wrote:
>>>>>
>>>>> Hello All:
>>>>>
>>>>> Unroll factor is determined with max distance across loop iterations.
>>>>> The logic for determining the loop unroll factor is based on
>>>>> data dependency across loop iterations.
>>>>>
>>>>> The max distance across loop iterations is the unrolling factor
>>>>> that helps in predictive commoning.
>>>>
>>>> The old comment in the code says
>>>>
>>>>> -  /* The best unroll factor for this chain is equal to the number of
>>>>> -temporary variables that we create for it.  */
>>>>
>>>> why is that wrong and why is the max dependence distance more correct?
>>>>
>>>> Do you have a testcase that shows how this makes a (positive) difference?
>>>>
>>>
>>> There is nothing wrong in the existing implementation of unroll
>>> factor for predictive commoning.
>>>
>>> But with max dependence distance we get performance improvement
>>> with spec 2017 benchmarks (INT) of 0.01% (Geomean) with and without
>>> changes. Improvement in benchmarks with max dependence distance
>>> changes.
>>>
>>> I have used the following flags:
>>> -O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning 
>>> -fno-tree-pre
>>>
>>> With above flags I ran with and without changes.
>>
>> A 0.01% geomean improvement is noise.  Why did you disable PRE?
>>
> 
> I have changed the flags. Now I changed the flags to -O3
> -fpredictive-commoning -funroll-loops. With these flags
> I am measuring the performance with spec 2017 benchmarks.
> Would let you know the results by Monday.
>

With the changes in this patch, 500.perlbench_r gave a gain of
2.56% in the SPEC 2017 INT benchmarks. There is no degradation
in the other benchmarks.

Thanks & Regards
Ajit
  
>>> There is no degradation with spec 2017 (FP benchmarks).
>>>
>>> Because in predictive commoning we reuse values computed in
>>> earlier iterations of a loop in the later ones, max distance is the
>>> better choice.
>>
>> The re-use distance is the same though.  So your change merely increases
>> the unroll factor?  Or can you explain why there is more re-use with
>> your change.
>>
> 
> With -O3 -fpredictive-commoning -funroll-loops many spec 2017 benchmarks
> increases unroll factor with my changes.
> 
> I am traversing data dependence relation vector, get the distance
> and max distance is the unroll factor.
>  
>> Richard.
>>
> Thanks & Regards
> Ajit
>>>> Richard.
>>>>
>>>
>>> Thanks & Regards
>>> Ajit
>>>
>>>>> Bootstrapped and regtested on powerpc64-linux-gnu.
>>>>>
>>>>> Thanks & Regards
>>>>> Ajit
>>>>>
>>>>> tree-optimization, predcom: Improve unroll factor for predictive commoning
>>>>>
>>>>> Unroll factor is determined with max distance across loop iterations.
>>>>> The logic for determining the loop unroll factor is based on
>>>>> data dependency across loop iterations.
>>>>>
>>>>> The max distance across loop iterations is the unrolling factor
>>>>> that helps in predictive commoning.
>>>>>
>>>>> 2024-07-11  Ajit Kumar Agarwal  
>>>>>
>>>>> gcc/ChangeLog:
>>>>>
>>>>> * tree-predcom.cc: Change in determining unroll factor with
>>>>> data dependence across loop iterations.
>>>>> ---
>>>>>  gcc/tree-predcom.cc | 51 ++---
>>>>>  1 file changed, 39 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/gcc/tree-predcom.cc b/gcc/tree-predcom.cc
>>>>> index 9844fee1e97..029b02f5990 100644
>>>>> --- a/gcc/tree-predcom.cc
>>>>> +++ b/gcc/tree-predcom.cc
>>>>> @@ -409,6 +409,7 @@ public:
>>>>>/* Perform the predictive comm

Re: [Patch, tree-optimization, predcom] Improve unroll factor for predictive commoning

2024-07-13 Thread Ajit Agarwal
Hello Richard:

On 12/07/24 6:20 pm, Richard Biener wrote:
> On Fri, Jul 12, 2024 at 12:09 PM Ajit Agarwal  wrote:
>>
>> Hello Richard:
>>
>> On 11/07/24 2:21 pm, Richard Biener wrote:
>>> On Thu, Jul 11, 2024 at 10:30 AM Ajit Agarwal  
>>> wrote:
>>>>
>>>> Hello All:
>>>>
>>>> Unroll factor is determined with max distance across loop iterations.
>>>> The logic for determining the loop unroll factor is based on
>>>> data dependency across loop iterations.
>>>>
>>>> The max distance across loop iterations is the unrolling factor
>>>> that helps in predictive commoning.
>>>
>>> The old comment in the code says
>>>
>>>> -  /* The best unroll factor for this chain is equal to the number of
>>>> -temporary variables that we create for it.  */
>>>
>>> why is that wrong and why is the max dependence distance more correct?
>>>
>>> Do you have a testcase that shows how this makes a (positive) difference?
>>>
>>
>> There is nothing wrong in the existing implementation of unroll
>> factor for predictive commoning.
>>
>> But with max dependence distance we get performance improvement
>> with spec 2017 benchmarks (INT) of 0.01% (Geomean) with and without
>> changes. Improvement in benchmarks with max dependence distance
>> changes.
>>
>> I have used the following flags:
>> -O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning 
>> -fno-tree-pre
>>
>> With above flags I ran with and without changes.
> 
> A 0.01% geomean improvement is noise.  Why did you disable PRE?
> 

I have changed the flags to -O3
-fpredictive-commoning -funroll-loops. With these flags
I am measuring performance on the SPEC 2017 benchmarks
and will let you know the results by Monday.
 
>> There is no degradation with spec 2017 (FP benchmarks).
>>
>> Because in predictive commoning we reuse values computed in
>> earlier iterations of a loop in the later ones, max distance is the
>> better choice.
> 
> The re-use distance is the same though.  So your change merely increases
> the unroll factor?  Or can you explain why there is more re-use with
> your change.
>

With -O3 -fpredictive-commoning -funroll-loops, many SPEC 2017 benchmarks
show an increased unroll factor with my changes.

I traverse the data dependence relation vector, collect the distances,
and use the maximum distance as the unroll factor.
 
> Richard.
> 
Thanks & Regards
Ajit
>>> Richard.
>>>
>>
>> Thanks & Regards
>> Ajit
>>
>>>> Bootstrapped and regtested on powerpc64-linux-gnu.
>>>>
>>>> Thanks & Regards
>>>> Ajit
>>>>
>>>> tree-optimization, predcom: Improve unroll factor for predictive commoning
>>>>
>>>> Unroll factor is determined with max distance across loop iterations.
>>>> The logic for determining the loop unroll factor is based on
>>>> data dependency across loop iterations.
>>>>
>>>> The max distance across loop iterations is the unrolling factor
>>>> that helps in predictive commoning.
>>>>
>>>> 2024-07-11  Ajit Kumar Agarwal  
>>>>
>>>> gcc/ChangeLog:
>>>>
>>>> * tree-predcom.cc: Change in determining unroll factor with
>>>> data dependence across loop iterations.
>>>> ---
>>>>  gcc/tree-predcom.cc | 51 ++---
>>>>  1 file changed, 39 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/gcc/tree-predcom.cc b/gcc/tree-predcom.cc
>>>> index 9844fee1e97..029b02f5990 100644
>>>> --- a/gcc/tree-predcom.cc
>>>> +++ b/gcc/tree-predcom.cc
>>>> @@ -409,6 +409,7 @@ public:
>>>>/* Perform the predictive commoning optimization for chains, make this
>>>>   public for being called in callback execute_pred_commoning_cbck.  */
>>>>void execute_pred_commoning (bitmap tmp_vars);
>>>> +  unsigned determine_unroll_factor (const vec );
>>>>
>>>>  private:
>>>>/* The pointer to the given loop.  */
>>>> @@ -2400,13 +2401,46 @@ pcom_worker::execute_pred_commoning_chain (chain_p 
>>>> chain,
>>>> copies as possible.  CHAINS is the list of chains that will be
>>>> optimized.  */
>>>>
>>>> -static unsigned
>>>> -determine

Re: [Patch, tree-optimization, predcom] Improve unroll factor for predictive commoning

2024-07-13 Thread Ajit Agarwal
Hello Richard:

On 12/07/24 6:20 pm, Richard Biener wrote:
> On Fri, Jul 12, 2024 at 12:09 PM Ajit Agarwal  wrote:
>>
>> Hello Richard:
>>
>> On 11/07/24 2:21 pm, Richard Biener wrote:
>>> On Thu, Jul 11, 2024 at 10:30 AM Ajit Agarwal  
>>> wrote:
>>>>
>>>> Hello All:
>>>>
>>>> Unroll factor is determined with max distance across loop iterations.
>>>> The logic for determining the loop unroll factor is based on
>>>> data dependency across loop iterations.
>>>>
>>>> The max distance across loop iterations is the unrolling factor
>>>> that helps in predictive commoning.
>>>
>>> The old comment in the code says
>>>
>>>> -  /* The best unroll factor for this chain is equal to the number of
>>>> -temporary variables that we create for it.  */
>>>
>>> why is that wrong and why is the max dependence distance more correct?
>>>
>>> Do you have a testcase that shows how this makes a (positive) difference?
>>>
>>
>> There is nothing wrong in the existing implementation of unroll
>> factor for predictive commoning.
>>
>> But with max dependence distance we get performance improvement
>> with spec 2017 benchmarks (INT) of 0.01% (Geomean) with and without
>> changes. Improvement in benchmarks with max dependence distance
>> changes.
>>
>> I have used the following flags:
>> -O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning 
>> -fno-tree-pre
>>
>> With above flags I ran with and without changes.
> 
> A 0.01% geomean improvement is noise.  Why did you disable PRE?
> 

I have changed the flags to -O3
-fpredictive-commoning -funroll-loops.
I am measuring performance with the above flags on the SPEC 2017
benchmarks and will let you know the results by Monday.

>> There is no degradation with spec 2017 (FP benchmarks).
>>
>> Because in predictive commoning we reuse values computed in
>> earlier iterations of a loop in the later ones, max distance is the
>> better choice.
> 
> The re-use distance is the same though.  So your change merely increases
> the unroll factor?  Or can you explain why there is more re-use with
> your change.
>

With -O3 -fpredictive-commoning -funroll-loops, many SPEC 2017
benchmarks show an increased unroll factor with my changes to
determine_unroll_factor in predictive commoning.

I traverse the data dependence relation vector, collect the distances,
and use the maximum data dependence distance as the unroll factor.

> Richard.
>

Thanks & Regards
Ajit
 
>>> Richard.
>>>
>>
>> Thanks & Regards
>> Ajit
>>
>>>> Bootstrapped and regtested on powerpc64-linux-gnu.
>>>>
>>>> Thanks & Regards
>>>> Ajit
>>>>
>>>> tree-optimization, predcom: Improve unroll factor for predictive commoning
>>>>
>>>> Unroll factor is determined with max distance across loop iterations.
>>>> The logic for determining the loop unroll factor is based on
>>>> data dependency across loop iterations.
>>>>
>>>> The max distance across loop iterations is the unrolling factor
>>>> that helps in predictive commoning.
>>>>
>>>> 2024-07-11  Ajit Kumar Agarwal  
>>>>
>>>> gcc/ChangeLog:
>>>>
>>>> * tree-predcom.cc: Change in determining unroll factor with
>>>> data dependence across loop iterations.
>>>> ---
>>>>  gcc/tree-predcom.cc | 51 ++---
>>>>  1 file changed, 39 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/gcc/tree-predcom.cc b/gcc/tree-predcom.cc
>>>> index 9844fee1e97..029b02f5990 100644
>>>> --- a/gcc/tree-predcom.cc
>>>> +++ b/gcc/tree-predcom.cc
>>>> @@ -409,6 +409,7 @@ public:
>>>>/* Perform the predictive commoning optimization for chains, make this
>>>>   public for being called in callback execute_pred_commoning_cbck.  */
>>>>void execute_pred_commoning (bitmap tmp_vars);
>>>> +  unsigned determine_unroll_factor (const vec );
>>>>
>>>>  private:
>>>>/* The pointer to the given loop.  */
>>>> @@ -2400,13 +2401,46 @@ pcom_worker::execute_pred_commoning_chain (chain_p 
>>>> chain,
>>>> copies as possible.  CHAINS is the list of chains that will be
>>>>  

Re: [Patch, tree-optimization, predcom] Improve unroll factor for predictive commoning

2024-07-12 Thread Ajit Agarwal
Hello Richard:

On 11/07/24 2:21 pm, Richard Biener wrote:
> On Thu, Jul 11, 2024 at 10:30 AM Ajit Agarwal  wrote:
>>
>> Hello All:
>>
>> Unroll factor is determined with max distance across loop iterations.
>> The logic for determining the loop unroll factor is based on
>> data dependency across loop iterations.
>>
>> The max distance across loop iterations is the unrolling factor
>> that helps in predictive commoning.
> 
> The old comment in the code says
> 
>> -  /* The best unroll factor for this chain is equal to the number of
>> -temporary variables that we create for it.  */
> 
> why is that wrong and why is the max dependence distance more correct?
> 
> Do you have a testcase that shows how this makes a (positive) difference?
>

There is nothing wrong with the existing implementation of the unroll
factor for predictive commoning.

But with the max dependence distance we get a performance improvement
of 0.01% (geomean) on the SPEC 2017 INT benchmarks when comparing runs
with and without the changes. Some benchmarks improve with the max
dependence distance changes.

I have used the following flags:
-O2 -funroll-loops --param max-unroll-times=8 -fpredictive-commoning 
-fno-tree-pre

With above flags I ran with and without changes.

There is no degradation with spec 2017 (FP benchmarks).

Because in predictive commoning we reuse values computed in
earlier iterations of a loop in the later ones, max distance is the
better choice.

> Richard.
>

Thanks & Regards
Ajit
 
>> Bootstrapped and regtested on powerpc64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>> tree-optimization, predcom: Improve unroll factor for predictive commoning
>>
>> Unroll factor is determined with max distance across loop iterations.
>> The logic for determining the loop unroll factor is based on
>> data dependency across loop iterations.
>>
>> The max distance across loop iterations is the unrolling factor
>> that helps in predictive commoning.
>>
>> 2024-07-11  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>> * tree-predcom.cc: Change in determining unroll factor with
>> data dependence across loop iterations.
>> ---
>>  gcc/tree-predcom.cc | 51 ++---
>>  1 file changed, 39 insertions(+), 12 deletions(-)
>>
>> diff --git a/gcc/tree-predcom.cc b/gcc/tree-predcom.cc
>> index 9844fee1e97..029b02f5990 100644
>> --- a/gcc/tree-predcom.cc
>> +++ b/gcc/tree-predcom.cc
>> @@ -409,6 +409,7 @@ public:
>>/* Perform the predictive commoning optimization for chains, make this
>>   public for being called in callback execute_pred_commoning_cbck.  */
>>void execute_pred_commoning (bitmap tmp_vars);
>> +  unsigned determine_unroll_factor (const vec );
>>
>>  private:
>>/* The pointer to the given loop.  */
>> @@ -2400,13 +2401,46 @@ pcom_worker::execute_pred_commoning_chain (chain_p 
>> chain,
>> copies as possible.  CHAINS is the list of chains that will be
>> optimized.  */
>>
>> -static unsigned
>> -determine_unroll_factor (const vec )
>> +unsigned
>> +pcom_worker::determine_unroll_factor (const vec )
>>  {
>>chain_p chain;
>> -  unsigned factor = 1, af, nfactor, i;
>> +  unsigned factor = 1, i;
>>unsigned max = param_max_unroll_times;
>> +  struct data_dependence_relation *ddr;
>> +  unsigned nfactor = 0;
>> +  int nzfactor = 0;
>> +
>> +  /* Best unroll factor is the maximum distance across loop
>> + iterations.  */
>> +  FOR_EACH_VEC_ELT (m_dependences, i, ddr)
>> +{
>> +  for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
>> +   {
>> + lambda_vector vec = DDR_DIST_VECT (ddr, j);
>> + widest_int distance = vec[j];
>> + unsigned offset = distance.to_uhwi ();
>> + if (offset == 0)
>> +   continue;
>> +
>> + int dist = offset - nzfactor;
>> + if (dist  == 0)
>> +   continue;
>>
>> + if (nfactor == 0)
>> +   {
>> + nfactor = offset;
>> + nzfactor = offset;
>> +   }
>> + else if (dist <= nzfactor)
>> +   nfactor = offset;
>> +
>> + if (nfactor > 0 && nfactor <= max)
>> +   factor = nfactor;
>> +   }
>> +}
>> +
>> +  int max_use = 0;
>>FOR_EACH_VEC_ELT (chains, i, chain)
>>  {
>>if (chain->type == CT_INVARIANT)
>> @@ -2427,17 +2461,10 @@ determine_unroll_factor (const vec )
>>   continue;
>> }
>>
>> -  /* The best unroll factor for this chain is equal to the number of
>> -temporary variables that we create for it.  */
>> -  af = chain->length;
>>if (chain->has_max_use_after)
>> -   af++;
>> -
>> -  nfactor = factor * af / gcd (factor, af);
>> -  if (nfactor <= max)
>> -   factor = nfactor;
>> +   max_use++;
>>  }
>> -
>> +  factor += max_use;
>>return factor;
>>  }
>>
>> --
>> 2.43.5
>>


[Patch, tree-optimization, predcom] Improve unroll factor for predictive commoning

2024-07-11 Thread Ajit Agarwal
Hello All:

Unroll factor is determined with max distance across loop iterations.
The logic for determining the loop unroll factor is based on
data dependency across loop iterations.

The max distance across loop iterations is the unrolling factor
that helps in predictive commoning.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit

tree-optimization, predcom: Improve unroll factor for predictive commoning

Unroll factor is determined with max distance across loop iterations.
The logic for determining the loop unroll factor is based on
data dependency across loop iterations.

The max distance across loop iterations is the unrolling factor
that helps in predictive commoning.

2024-07-11  Ajit Kumar Agarwal  

gcc/ChangeLog:

* tree-predcom.cc: Change in determining unroll factor with
data dependence across loop iterations.
---
 gcc/tree-predcom.cc | 51 ++---
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/gcc/tree-predcom.cc b/gcc/tree-predcom.cc
index 9844fee1e97..029b02f5990 100644
--- a/gcc/tree-predcom.cc
+++ b/gcc/tree-predcom.cc
@@ -409,6 +409,7 @@ public:
   /* Perform the predictive commoning optimization for chains, make this
  public for being called in callback execute_pred_commoning_cbck.  */
   void execute_pred_commoning (bitmap tmp_vars);
+  unsigned determine_unroll_factor (const vec );
 
 private:
   /* The pointer to the given loop.  */
@@ -2400,13 +2401,46 @@ pcom_worker::execute_pred_commoning_chain (chain_p 
chain,
copies as possible.  CHAINS is the list of chains that will be
optimized.  */
 
-static unsigned
-determine_unroll_factor (const vec )
+unsigned
+pcom_worker::determine_unroll_factor (const vec )
 {
   chain_p chain;
-  unsigned factor = 1, af, nfactor, i;
+  unsigned factor = 1, i;
   unsigned max = param_max_unroll_times;
+  struct data_dependence_relation *ddr;
+  unsigned nfactor = 0;
+  int nzfactor = 0;
+
+  /* Best unroll factor is the maximum distance across loop
+ iterations.  */
+  FOR_EACH_VEC_ELT (m_dependences, i, ddr)
+{
+  for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
+   {
+ lambda_vector vec = DDR_DIST_VECT (ddr, j);
+ widest_int distance = vec[j];
+ unsigned offset = distance.to_uhwi ();
+ if (offset == 0)
+   continue;
+
+ int dist = offset - nzfactor;
+ if (dist  == 0)
+   continue;
 
+ if (nfactor == 0)
+   {
+ nfactor = offset;
+ nzfactor = offset;
+   }
+ else if (dist <= nzfactor)
+   nfactor = offset;
+
+ if (nfactor > 0 && nfactor <= max)
+   factor = nfactor;
+   }
+}
+
+  int max_use = 0;
   FOR_EACH_VEC_ELT (chains, i, chain)
 {
   if (chain->type == CT_INVARIANT)
@@ -2427,17 +2461,10 @@ determine_unroll_factor (const vec )
  continue;
}
 
-  /* The best unroll factor for this chain is equal to the number of
-temporary variables that we create for it.  */
-  af = chain->length;
   if (chain->has_max_use_after)
-   af++;
-
-  nfactor = factor * af / gcd (factor, af);
-  if (nfactor <= max)
-   factor = nfactor;
+   max_use++;
 }
-
+  factor += max_use;
   return factor;
 }
 
-- 
2.43.5



[PING^0][Patch, rs6000, middle-end] v6: Add implementation for different targets for pair mem fusion

2024-07-07 Thread Ajit Agarwal
Ping ! Please let me know OK for trunk.

Thanks & Regards
Ajit


 Forwarded Message 
Subject: [Patch, rs6000, middle-end] v6: Add implementation for different 
targets for pair mem fusion
Date: Tue, 2 Jul 2024 14:15:02 +0530
From: Ajit Agarwal 
To: Alex Coplan , Richard Sandiford 
, Kewen.Lin , Segher 
Boessenkool , Michael Meissner 
, Peter Bergner , David Edelsohn 
, gcc-patches 

Hello All:

This version of the patch relaxes store fusion to cover more use cases.

Common infrastructure using generic code for pair mem fusion across
different targets.

rs6000 target-specific code implements the virtual functions defined by
the generic code.

Target-specific code is added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion across
different targets.

rs6000 target-specific code implements the virtual functions defined by
the generic code.

Target-specific code is added in rs6000-mem-fusion.cc.

2024-07-02  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function addded for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config/rs6000/mma.md: Modify movoo machine description.
Add new machine description movoo1.
* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
to expand movoo machine description for all constraints.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/functions.h: Move out allocate function from private
to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/mma.md  |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc| 708 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/rs6000.cc   |  57 +-
 gcc/config/rs6000/rs6000.md   |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  27 +-
 gcc/pair-fusion.h |  34 +
 gcc/rtl-ssa/functions.h   |  11 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 14 files changed, 890 insertions(+), 27 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index bc45615741b..12f79a78177 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 04e2d0066df..88413926a02 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -294,7 +294,31 @@
 
 (define_insn_and_split "*movoo"
   [(set (match_operand:OO 0 "nonimmediate_ope

Re: [Patch, rtl-optimization]: Loop unroll factor based on register pressure

2024-07-04 Thread Ajit Agarwal
Hello Richard:

On 03/07/24 2:18 pm, Richard Biener wrote:
> On Sun, Jun 30, 2024 at 4:15 AM Ajit Agarwal  wrote:
>>
>> Hello All:
>>
>> This patch determines Unroll factor based on loop register pressure.
>>
>> Unroll factor is quotient of max of available registers in loop
>> by number of liveness.
>>
>> If available registers increases unroll factor increases.
>> Wherein unroll factor decreases if number of liveness increases.
> 
> Unrolling as implemented does not increase register lifetime unless
> -fsplit-ivs-in-unroller or -fvariable-expansion-in-unroller.  But I do not
> see you looking at those transforms at all.
> 
Based on your feedback, I have changed the logic for determining the
unroll factor for loops in version 1 of the patch.

The unroll factor is calculated from the available registers and the
registers needed inside the loops.

The unroll factor is the quotient of the maximum number of available
registers in the loop over the registers needed inside the loops.

Considered -fsplit-ivs-in-unroller or -fvariable-expansion-in-unroller
for controlling unroll factor for loops.

Unroll factor is directly proportional to available register.
Wherein unroll factor is inversely proportional to register
needed inside loops.

Registers needed are the Loop variables/Loop induction
variables inside the loops.

Available registers are determined by the number of hard registers
available for each register class minus register needed inside the
loops for given register class.

Thanks & Regards
Ajit

> Richard.
> 
>> Loop unrolling is based on loop variables that determines unroll
>> factor. Loop variables of the loop are the variables that increases
>> register pressure and take advantage of existing register pressure
>> calculation.
>>
>> Available registers are determined by the number of hard registers
>> available for each register class minus max reg pressure of loop
>> for given register class.
>>
>> Bootstrapped and regtested on powerpc64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>> rtl-optimization: Loop unroll factor based on register pressure
>>
>> Unroll factor is calculated based on loop register pressure.
>>
>> Unroll factor is quotient of max of available registers in loop
>> by number of liveness.
>>
>> If available registers increases unroll factor increases.
>> Wherein unroll factor decreases if number of liveness increases.
>>
>> Loop unrolling is based on loop variables that determines unroll
>> factor. Loop variables of the loop are the variables that increases
>> register pressure and take advantage of existing register pressure
>> calculation.
>>
>> Available registers are determined by the number of hard registers
>> available for each register class minus max reg pressure of loop
>> for given register class.
>>
>> 2024-06-29  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>> * loop-unroll.cc: Add calculation of register pressure of
>> the loop and use of that to calculate unroll factor.
>> ---
>>  gcc/loop-unroll.cc | 331 -
>>  1 file changed, 328 insertions(+), 3 deletions(-)
>>
>> diff --git a/gcc/loop-unroll.cc b/gcc/loop-unroll.cc
>> index bfdfe6c2bb7..6936ba7afb9 100644
>> --- a/gcc/loop-unroll.cc
>> +++ b/gcc/loop-unroll.cc
>> @@ -35,6 +35,11 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "dojump.h"
>>  #include "expr.h"
>>  #include "dumpfile.h"
>> +#include "regs.h"
>> +#include "ira.h"
>> +#include "rtl-iter.h"
>> +#include "regset.h"
>> +#include "df.h"
>>
>>  /* This pass performs loop unrolling.  We only perform this
>> optimization on innermost loops (with single exception) because
>> @@ -65,6 +70,38 @@ along with GCC; see the file COPYING3.  If not see
>> showed that this choice may affect performance in order of several %.
>> */
>>
>> +class loop_data
>> +{
>> +public:
>> +  class loop *outermost_exit;  /* The outermost exit of the loop.  */
>> +  bool has_call;   /* True if the loop contains a call.  */
>> +  /* Maximal register pressure inside loop for given register class
>> + (defined only for the pressure classes).  */
>> +  int max_reg_pressure[N_REG_CLASSES];
>> +  /* Loop regs referenced and live pseudo-registers.  */
>> +  bitmap_head regs_ref;
>> +  bitmap_head regs_live;
>> +};
>> +
>> +#define LOOP_DATA(LOOP) ((class loop_data *) (LOOP)->aux)
>> +
>>

[patch, rtl-optimization, loop-unroll] v1: Loop unroll factor based on,available registers over reg needed inside loops

2024-07-04 Thread Ajit Agarwal
Hello Richard:

Based on your feedback I have changed the logic of determining
unroll factor for loops.

Unroll factor is calculated based on available registers and regs
needed inside the loops.

Unroll factor is quotient of max of available registers in loop
over regs needed inside the loops.

I considered -fsplit-ivs-in-unroller and -fvariable-expansion-in-unroller
for controlling the unroll factor for loops.

The unroll factor is directly proportional to the number of available
registers, and inversely proportional to the number of registers
needed inside the loops.

The registers needed are the loop variables/loop induction
variables inside the loops.

Available registers are determined by the number of hard registers
available for each register class minus register needed inside the
loops for given register class.


Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rtl-optimization, loop-unroll: Loop unroll factor based on
available registers over reg needed inside loops

Unroll factor is calculated based on available register and regs
needed inside the loops.

Unroll factor is quotient of max of available registers in loop
over regs needed inside the loops.

Considered -fsplit-ivs-in-unroller or -fvariable-expansion-in-unroller
for controlling unroll factor for loops.

Unroll factor is directly proportional to available register.
Wherein unroll factor is inversely proportional to register
needed inside loops.

Registers needed are the Loop variables/Loop induction
variables inside the loops.

Available registers are determined by the number of hard registers
available for each register class minus register needed inside the
loops for given register class.

2024-07-04  Ajit Kumar Agarwal  

gcc/ChangeLog:

* loop-unroll.cc: Add calculation of unroll factor and use
of unroll factor on loop unrolling.
---
 gcc/loop-unroll.cc | 427 -
 1 file changed, 424 insertions(+), 3 deletions(-)

diff --git a/gcc/loop-unroll.cc b/gcc/loop-unroll.cc
index bfdfe6c2bb7..86e0cc84eec 100644
--- a/gcc/loop-unroll.cc
+++ b/gcc/loop-unroll.cc
@@ -35,6 +35,11 @@ along with GCC; see the file COPYING3.  If not see
 #include "dojump.h"
 #include "expr.h"
 #include "dumpfile.h"
+#include "regs.h"
+#include "ira.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "df.h"
 
 /* This pass performs loop unrolling.  We only perform this
optimization on innermost loops (with single exception) because
@@ -65,6 +70,39 @@ along with GCC; see the file COPYING3.  If not see
showed that this choice may affect performance in order of several %.
*/
 
+class loop_data
+{
+public:
+  class loop *outermost_exit;  /* The outermost exit of the loop.  */
+  bool has_call;   /* True if the loop contains a call.  */
+  /* Maximal register pressure inside loop for given register class
+ (defined only for the pressure classes).  */
+  int max_reg_pressure[N_REG_CLASSES];
+  int regs_needed[N_REG_CLASSES];
+  /* Loop regs referenced and live pseudo-registers.  */
+  bitmap_head regs_ref;
+  bitmap_head regs_live;
+};
+
+#define LOOP_DATA(LOOP) ((class loop_data *) (LOOP)->aux)
+
+/* Record all regs that are set in any one insn.  Communication from
+   mark_reg_{store,clobber} and global_conflicts.  Asm can refer to
+   all hard-registers.  */
+static rtx regs_set[(FIRST_PSEUDO_REGISTER > MAX_RECOG_OPERANDS
+? FIRST_PSEUDO_REGISTER : MAX_RECOG_OPERANDS) * 2];
+/* Number of regs stored in the previous array.  */
+static int n_regs_set;
+
+/* Currently processed loop.  */
+static class loop *curr_loop;
+
+/* Registers currently living.  */
+static bitmap_head curr_regs_live;
+
+/* Current reg pressure for each pressure class.  */
+static int curr_reg_pressure[N_REG_CLASSES];
+
 /* Information about induction variables to split.  */
 
 struct iv_to_split
@@ -102,6 +140,7 @@ struct iv_split_hasher : free_ptr_hash 
   static inline bool equal (const iv_to_split *, const iv_to_split *);
 };
 
+void mark_insn_with_unroller (class loop *loop, basic_block bb);
 
 /* A hash function for information about insns to split.  */
 
@@ -272,11 +311,263 @@ decide_unrolling (int flags)
 }
 }
 
+/* Return pressure class and number of needed hard registers (through
+   *NREGS) of register REGNO.  */
+static enum reg_class
+get_regno_pressure_class (int regno, int *nregs)
+{
+  if (regno >= FIRST_PSEUDO_REGISTER)
+{
+  enum reg_class pressure_class;
+  pressure_class = reg_allocno_class (regno);
+  pressure_class = ira_pressure_class_translate[pressure_class];
+  *nregs
+   = ira_reg_class_max_nregs[pressure_class][PSEUDO_REGNO_MODE (regno)];
+  return pressure_class;
+}
+  else if (! TEST_HARD_REG_BIT (ira_no_alloc_regs, regno)
+  && ! TEST_HARD_REG_BIT (eliminable_regset, regno))
+{
+  *nregs = 1;
+  return ira_pressure_class_translate[REGNO_REG_CLASS (regno)];
+}
+  else
+{
+  *nregs = 0;
+  

[Patch, rs6000, middle-end] v6: Add implementation for different targets for pair mem fusion

2024-07-02 Thread Ajit Agarwal
Hello All:

This version of patch relaxes store fusion for more use cases.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-07-02  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function added for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config/rs6000/mma.md: Modify movoo machine description.
Add new machine description movoo1.
* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
to expand movoo machine description for all constraints.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/functions.h: Move out allocate function from private
to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/mma.md  |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc| 708 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/rs6000.cc   |  57 +-
 gcc/config/rs6000/rs6000.md   |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  27 +-
 gcc/pair-fusion.h |  34 +
 gcc/rtl-ssa/functions.h   |  11 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 14 files changed, 890 insertions(+), 27 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index bc45615741b..12f79a78177 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 04e2d0066df..88413926a02 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -294,7 +294,31 @@
 
 (define_insn_and_split "*movoo"
   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
-   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+(match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+  "TARGET_MMA
+   && (gpc_reg_operand (operands[0], OOmode)
+   || gpc_reg_operand (operands[1], OOmode))"
+;;""
+  "@
+   #
+   #
+   #"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecstore,veclogical")
+   (set_attr "length" "*,*,8")])
+;;   (set_attr "max_prefixed_insns" "2,2,*")])
+
+
+(define_insn_and_split "*movoo1"
+ 

[Patch, rtl-optimization, loop-unroll] Loop unroll factor based on register pressure

2024-06-30 Thread Ajit Agarwal
Hello All:

This patch determines unroll factor based on loop register pressure.

Unroll factor is quotient of max of available registers in loop
by number of liveness.

If available registers increases unroll factor increases.
Wherein unroll factor decreases if number of liveness increases.

Loop unrolling is based on loop variables that determines unroll
factor. Loop variables of the loop are the variables that increases
register pressure and take advantage of existing register pressure
calculation.

Available registers are determined by the number of hard registers
available for each register class minus max reg pressure of loop
for given register class.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rtl-optimization: Loop unroll factor based on register pressure

Unroll factor is calculated based on loop register pressure.

Unroll factor is quotient of max of available registers in loop
by number of liveness.

If available registers increases unroll factor increases.
Wherein unroll factor decreases if number of liveness increases.

Loop unrolling is based on loop variables that determines unroll
factor. Loop variables of the loop are the variables that increases
register pressure and take advantage of existing register pressure
calculation.

Available registers are determined by the number of hard registers
available for each register class minus max reg pressure of loop
for given register class.

2024-06-29  Ajit Kumar Agarwal  

gcc/ChangeLog:

* loop-unroll.cc: Add calculation of register pressure of
the loop and use of that to calculate unroll factor.
---
 gcc/loop-unroll.cc | 331 -
 1 file changed, 328 insertions(+), 3 deletions(-)

diff --git a/gcc/loop-unroll.cc b/gcc/loop-unroll.cc
index bfdfe6c2bb7..6936ba7afb9 100644
--- a/gcc/loop-unroll.cc
+++ b/gcc/loop-unroll.cc
@@ -35,6 +35,11 @@ along with GCC; see the file COPYING3.  If not see
 #include "dojump.h"
 #include "expr.h"
 #include "dumpfile.h"
+#include "regs.h"
+#include "ira.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "df.h"
 
 /* This pass performs loop unrolling.  We only perform this
optimization on innermost loops (with single exception) because
@@ -65,6 +70,38 @@ along with GCC; see the file COPYING3.  If not see
showed that this choice may affect performance in order of several %.
*/
 
+class loop_data
+{
+public:
+  class loop *outermost_exit;  /* The outermost exit of the loop.  */
+  bool has_call;   /* True if the loop contains a call.  */
+  /* Maximal register pressure inside loop for given register class
+ (defined only for the pressure classes).  */
+  int max_reg_pressure[N_REG_CLASSES];
+  /* Loop regs referenced and live pseudo-registers.  */
+  bitmap_head regs_ref;
+  bitmap_head regs_live;
+};
+
+#define LOOP_DATA(LOOP) ((class loop_data *) (LOOP)->aux)
+
+/* Record all regs that are set in any one insn.  Communication from
+   mark_reg_{store,clobber} and global_conflicts.  Asm can refer to
+   all hard-registers.  */
+static rtx regs_set[(FIRST_PSEUDO_REGISTER > MAX_RECOG_OPERANDS
+? FIRST_PSEUDO_REGISTER : MAX_RECOG_OPERANDS) * 2];
+/* Number of regs stored in the previous array.  */
+static int n_regs_set;
+
+/* Currently processed loop.  */
+static class loop *curr_loop;
+
+/* Registers currently living.  */
+static bitmap_head curr_regs_live;
+
+/* Current reg pressure for each pressure class.  */
+static int curr_reg_pressure[N_REG_CLASSES];
+
 /* Information about induction variables to split.  */
 
 struct iv_to_split
@@ -272,11 +309,262 @@ decide_unrolling (int flags)
 }
 }
 
+/* Return pressure class and number of needed hard registers (through
+   *NREGS) of register REGNO.  */
+static enum reg_class
+get_regno_pressure_class (int regno, int *nregs)
+{
+  if (regno >= FIRST_PSEUDO_REGISTER)
+{
+  enum reg_class pressure_class;
+  pressure_class = reg_allocno_class (regno);
+  pressure_class = ira_pressure_class_translate[pressure_class];
+  *nregs
+   = ira_reg_class_max_nregs[pressure_class][PSEUDO_REGNO_MODE (regno)];
+  return pressure_class;
+}
+  else if (! TEST_HARD_REG_BIT (ira_no_alloc_regs, regno)
+  && ! TEST_HARD_REG_BIT (eliminable_regset, regno))
+{
+  *nregs = 1;
+  return ira_pressure_class_translate[REGNO_REG_CLASS (regno)];
+}
+  else
+{
+  *nregs = 0;
+  return NO_REGS;
+}
+}
+
+/* Increase (if INCR_P) or decrease current register pressure for
+   register REGNO.  */
+static void
+change_pressure (int regno, bool incr_p)
+{
+  int nregs;
+  enum reg_class pressure_class;
+
+  pressure_class = get_regno_pressure_class (regno, );
+  if (! incr_p)
+curr_reg_pressure[pressure_class] -= nregs;
+  else
+{
+  curr_reg_pressure[pressure_class] += nregs;
+  if (LOOP_DATA (curr_loop)->max_reg_pressure[pressure_class]
+ < 

test

2024-06-30 Thread Ajit Agarwal


[Patch, rtl-optimization]: Loop unroll factor based on register pressure

2024-06-29 Thread Ajit Agarwal
Hello All:

This patch determines Unroll factor based on loop register pressure.

Unroll factor is quotient of max of available registers in loop
by number of liveness.

If available registers increases unroll factor increases.
Wherein unroll factor decreases if number of liveness increases.

Loop unrolling is based on loop variables that determines unroll
factor. Loop variables of the loop are the variables that increases
register pressure and take advantage of existing register pressure
calculation.

Available registers are determined by the number of hard registers
available for each register class minus max reg pressure of loop
for given register class.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rtl-optimization: Loop unroll factor based on register pressure

Unroll factor is calculated based on loop register pressure.

Unroll factor is quotient of max of available registers in loop
by number of liveness.

If available registers increases unroll factor increases.
Wherein unroll factor decreases if number of liveness increases.

Loop unrolling is based on loop variables that determines unroll
factor. Loop variables of the loop are the variables that increases
register pressure and take advantage of existing register pressure
calculation.

Available registers are determined by the number of hard registers
available for each register class minus max reg pressure of loop
for given register class.

2024-06-29  Ajit Kumar Agarwal  

gcc/ChangeLog:

* loop-unroll.cc: Add calculation of register pressure of
the loop and use of that to calculate unroll factor.
---
 gcc/loop-unroll.cc | 331 -
 1 file changed, 328 insertions(+), 3 deletions(-)

diff --git a/gcc/loop-unroll.cc b/gcc/loop-unroll.cc
index bfdfe6c2bb7..6936ba7afb9 100644
--- a/gcc/loop-unroll.cc
+++ b/gcc/loop-unroll.cc
@@ -35,6 +35,11 @@ along with GCC; see the file COPYING3.  If not see
 #include "dojump.h"
 #include "expr.h"
 #include "dumpfile.h"
+#include "regs.h"
+#include "ira.h"
+#include "rtl-iter.h"
+#include "regset.h"
+#include "df.h"
 
 /* This pass performs loop unrolling.  We only perform this
optimization on innermost loops (with single exception) because
@@ -65,6 +70,38 @@ along with GCC; see the file COPYING3.  If not see
showed that this choice may affect performance in order of several %.
*/
 
+class loop_data
+{
+public:
+  class loop *outermost_exit;  /* The outermost exit of the loop.  */
+  bool has_call;   /* True if the loop contains a call.  */
+  /* Maximal register pressure inside loop for given register class
+ (defined only for the pressure classes).  */
+  int max_reg_pressure[N_REG_CLASSES];
+  /* Loop regs referenced and live pseudo-registers.  */
+  bitmap_head regs_ref;
+  bitmap_head regs_live;
+};
+
+#define LOOP_DATA(LOOP) ((class loop_data *) (LOOP)->aux)
+
+/* Record all regs that are set in any one insn.  Communication from
+   mark_reg_{store,clobber} and global_conflicts.  Asm can refer to
+   all hard-registers.  */
+static rtx regs_set[(FIRST_PSEUDO_REGISTER > MAX_RECOG_OPERANDS
+? FIRST_PSEUDO_REGISTER : MAX_RECOG_OPERANDS) * 2];
+/* Number of regs stored in the previous array.  */
+static int n_regs_set;
+
+/* Currently processed loop.  */
+static class loop *curr_loop;
+
+/* Registers currently living.  */
+static bitmap_head curr_regs_live;
+
+/* Current reg pressure for each pressure class.  */
+static int curr_reg_pressure[N_REG_CLASSES];
+
 /* Information about induction variables to split.  */
 
 struct iv_to_split
@@ -272,11 +309,262 @@ decide_unrolling (int flags)
 }
 }
 
+/* Return pressure class and number of needed hard registers (through
+   *NREGS) of register REGNO.  */
+static enum reg_class
+get_regno_pressure_class (int regno, int *nregs)
+{
+  if (regno >= FIRST_PSEUDO_REGISTER)
+{
+  enum reg_class pressure_class;
+  pressure_class = reg_allocno_class (regno);
+  pressure_class = ira_pressure_class_translate[pressure_class];
+  *nregs
+   = ira_reg_class_max_nregs[pressure_class][PSEUDO_REGNO_MODE (regno)];
+  return pressure_class;
+}
+  else if (! TEST_HARD_REG_BIT (ira_no_alloc_regs, regno)
+  && ! TEST_HARD_REG_BIT (eliminable_regset, regno))
+{
+  *nregs = 1;
+  return ira_pressure_class_translate[REGNO_REG_CLASS (regno)];
+}
+  else
+{
+  *nregs = 0;
+  return NO_REGS;
+}
+}
+
+/* Increase (if INCR_P) or decrease current register pressure for
+   register REGNO.  */
+static void
+change_pressure (int regno, bool incr_p)
+{
+  int nregs;
+  enum reg_class pressure_class;
+
+  pressure_class = get_regno_pressure_class (regno, );
+  if (! incr_p)
+curr_reg_pressure[pressure_class] -= nregs;
+  else
+{
+  curr_reg_pressure[pressure_class] += nregs;
+  if (LOOP_DATA (curr_loop)->max_reg_pressure[pressure_class]
+ < 

test mail

2024-06-29 Thread Ajit Agarwal


test mail

2024-06-29 Thread Ajit Agarwal


[Patch, rs6000, middle-end] v5: Add implementation for different targets for pair mem fusion

2024-06-25 Thread Ajit Agarwal
Hello All:

This patch addressed cleanup of the code and fix linaro failures.

All comments are addressed.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-06-25  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function added for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config/rs6000/mma.md: Modify movoo machine description.
Add new machine description movoo1.
* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
to expand movoo machine description for all constraints.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/functions.h: Move out allocate function from private
to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/mma.md  |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc| 724 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/rs6000.cc   |  56 +-
 gcc/config/rs6000/rs6000.md   |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  27 +-
 gcc/pair-fusion.h |  34 +
 gcc/rtl-ssa/functions.h   |  11 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 14 files changed, 905 insertions(+), 27 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 644c456290d..a032723152f 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 04e2d0066df..88413926a02 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -294,7 +294,31 @@
 
 (define_insn_and_split "*movoo"
   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
-   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+(match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+  "TARGET_MMA
+   && (gpc_reg_operand (operands[0], OOmode)
+   || gpc_reg_operand (operands[1], OOmode))"
+;;""
+  "@
+   #
+   #
+   #"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecstore,veclogical")
+   (set_attr "length" "*,*,8")])
+;;   (set_attr "max_prefixed_insns" "2,2,*")])
+
+

Re: [Patch, rs6000, middle-end] v2: Add implementation for different targets for pair mem fusion

2024-06-20 Thread Ajit Agarwal
Hello Richard:

On 19/06/24 3:26 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 19/06/24 2:52 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> On 19/06/24 2:40 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>> Hello Richard:
>>>>>>
>>>>>> On 19/06/24 1:54 pm, Richard Sandiford wrote:
>>>>>>> Ajit Agarwal  writes:
>>>>>>>>> What happens if you leave the assert alone?  When does it fire?  Is it
>>>>>>>>> still for uses in debug insns?  If so, it's the fusion pass's 
>>>>>>>>> responsibility
>>>>>>>>> to update those, as mentioned above.  And it must update them before,
>>>>>>>>> or at the same time as, it deletes the definition.
>>>>>>>>>
>>>>>>>>
>>>>>>>> For debug insn I call reset_debug_use and now I dont see issues
>>>>>>>> with debug insn and issues I see with  non debug insn where
>>>>>>>> def is there in old_defs and use has to be removed for the insn
>>>>>>>> that we modify load with OO UNSPEC to generate lxvp.
>>>>>>>
>>>>>>> Can you walk me through it step-by-step?  If you leave the assert
>>>>>>> alone, when does it fire?  What set of insn_changes are being made
>>>>>>> when the assert fires?  (Calling debug on the changes will show this.)
>>>>>>> And what use does the assert fire on?  (Again, calling debug on the use
>>>>>>> will show this.)
>>>>>>>
>>>>>>
>>>>>> (insn 660 735 739 50 (set (reg:OO 405 [ MEM[(_Float128 *)src_196] ])
>>>>>> (unspec:OO [
>>>>>> (mem:OO (reg/v/f:DI 197 [ base ]) [9 MEM[(_Float128 
>>>>>> *)src_196]+0 S16 A128])
>>>>>> ] UNSPEC_LXVP))  2188 {*movoo1}
>>>>>>  (nil))
>>>>>>
>>>>>> This is definition.
>>>>>>
>>>>>> (insn 661 659 662 50 (set (reg:TF 179 [ result$imag ])
>>>>>> (plus:TF (reg:TF 179 [ result$imag ])
>>>>>> (subreg:TF (reg:OO 405 [ MEM[(_Float128 *)src_196] ]) 0)))  
>>>>>> {addtf3}
>>>>>>
>>>>>> This is use.
>>>>>>
>>>>>> change has the above definition and the assert fires at the
>>>>>> above use.
>>>>>
>>>>> But can you call debug on the insn_change that contains the deleted def,
>>>>> and call debug on the access_info that triggers the assert?
>>>>>
>>>>
>>>> I am afraid I am not getting what exactly you meant here.
>>>
>>> One way would be to apply a patch like the below and quote the
>>> output from the last "Changes:" onward.
>>>
>>> Richard
>>>
>>
>> Thanks.
>>
>> This is the dump of use at assert point.
>>
>> use of superceded set r178:i131 (V2DI pseudo) by insn i133 in bb2 [ebb2] at 
>> point 180
>>   defined in bb2 [ebb2] at point 108
>>
>> This is the dump of change.
>>
>> deletion of insn i130 in bb2 [ebb2] at point 106:
>>   deleted
>>   uses:
>> use of set r219:i291 (DI pseudo)
>>   appears inside an address
>> superceded use of set mem:i114
>>   defines:
>> set r177:i131 (OO pseudo)
>>   used by insn i132 in bb2 [ebb2] at point 178
>> change to insn i131 in bb2 [ebb2] at point 108:
>>   +--
>>   |  131: r177:OO=unspec[[r219:DI]] 101
>>   +--
>>   uses:
>> superceded use of set r219:i291 (DI pseudo)
>>   appears inside an address
>> superceded use of set mem:i114
>>   defines:
>> superceded set r178:i131 (V2DI pseudo)
>>   used by insn i133 in bb2 [ebb2] at point 180
>>   ~~~
>>   new cost: 2147483647
>>   new uses:
>> use of set r219:i291 (DI pseudo) by insn i131 in bb2 [ebb2] at point 108
>>   defined in bb2 [ebb2] at point 62
>>   appears inside an address
>> use of set mem:i114 by insn i131 in bb2 [ebb2] at point 108
>>   defined in bb2 [ebb2] at point 104
>>   new defs:
>> set r177:i131 (OO pseudo) in bb2 [ebb2] at point 108
>>   used by insn i132 in bb2 [ebb2] at point 178
>>   first insert-after candidate: insn i131 in bb2 [ebb2] at point 108
>>   last insert-after candidate: insn i131 in bb2 [ebb2] at point 108
> 
> Thanks.  It looks like you're updating just the definitions,
> and then later updating the uses.  That isn't the way that rtl-ssa
> is supposed to be used.  Each change set -- in other words, each call
> to function_info::change_insns -- must go from a valid state to a valid
> state.  That is, the RTL must be self-consistent before every individual
> call to function_info::change_insns and must be self-consistent after
> every individual call to function_info::change_insns.
> 
> This is what I meant before about:
> 
>   ... if we're removing a definition, all uses in "real"
>   debug and non-debug insns must be removed either earlier than the
>   definition or at the same time as the definition.  No such uses
>   should remain.
> 
> Since you want to update all uses of register 178, you need to include
> those updates in the same change set as the change to insns 130 and 131,
> rather than doing them later.
>

Thanks for the suggestions.

Addressed in v4 of the patch.
 
> Richard

Thanks & Regards
Ajit


[Patch, rs6000, middle-end] v4: Add implementation for different targets for pair mem fusion

2024-06-20 Thread Ajit Agarwal
Hello Richard:

All review comments are incorporated.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-06-20  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function added for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config/rs6000/mma.md: Modify movoo machine description.
Add new machine description movoo1.
* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
to expand movoo machine description for all constraints.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/functions.h: Move out allocate function from private
to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/mma.md  |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc| 748 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/rs6000.cc   |  55 +-
 gcc/config/rs6000/rs6000.md   |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  28 +-
 gcc/pair-fusion.h |  31 +
 gcc/rtl-ssa/functions.h   |   9 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 14 files changed, 925 insertions(+), 26 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 644c456290d..a032723152f 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 04e2d0066df..88413926a02 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -294,7 +294,31 @@
 
 (define_insn_and_split "*movoo"
   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
-   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+(match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+  "TARGET_MMA
+   && (gpc_reg_operand (operands[0], OOmode)
+   || gpc_reg_operand (operands[1], OOmode))"
+;;""
+  "@
+   #
+   #
+   #"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecstore,veclogical")
+   (set_attr "length" "*,*,8")])
+;;   (set_attr "max_prefixed_insns" "2,2,*")])
+
+
+(define_insn_and_split "*movoo1"
+  [(set 

Re: [Patch, rs6000, middle-end] v2: Add implementation for different targets for pair mem fusion

2024-06-19 Thread Ajit Agarwal
Hello Richard:

On 19/06/24 2:52 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 19/06/24 2:40 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> Hello Richard:
>>>>
>>>> On 19/06/24 1:54 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>>> What happens if you leave the assert alone?  When does it fire?  Is it
>>>>>>> still for uses in debug insns?  If so, it's the fusion pass's 
>>>>>>> responsibility
>>>>>>> to update those, as mentioned above.  And it must update them before,
>>>>>>> or at the same time as, it deletes the definition.
>>>>>>>
>>>>>>
>>>>>> For debug insn I call reset_debug_use and now I dont see issues
>>>>>> with debug insn and issues I see with  non debug insn where
>>>>>> def is there in old_defs and use has to be removed for the insn
>>>>>> that we modify load with OO UNSPEC to generate lxvp.
>>>>>
>>>>> Can you walk me through it step-by-step?  If you leave the assert
>>>>> alone, when does it fire?  What set of insn_changes are being made
>>>>> when the assert fires?  (Calling debug on the changes will show this.)
>>>>> And what use does the assert fire on?  (Again, calling debug on the use
>>>>> will show this.)
>>>>>
>>>>
>>>> (insn 660 735 739 50 (set (reg:OO 405 [ MEM[(_Float128 *)src_196] ])
>>>> (unspec:OO [
>>>> (mem:OO (reg/v/f:DI 197 [ base ]) [9 MEM[(_Float128 
>>>> *)src_196]+0 S16 A128])
>>>> ] UNSPEC_LXVP))  2188 {*movoo1}
>>>>  (nil))
>>>>
>>>> This is definition.
>>>>
>>>> (insn 661 659 662 50 (set (reg:TF 179 [ result$imag ])
>>>> (plus:TF (reg:TF 179 [ result$imag ])
>>>> (subreg:TF (reg:OO 405 [ MEM[(_Float128 *)src_196] ]) 0)))  
>>>> {addtf3}
>>>>
>>>> This is use.
>>>>
>>>> change has the above definition and the assert fires at the
>>>> above use.
>>>
>>> But can you call debug on the insn_change that contains the deleted def,
>>> and call debug on the access_info that triggers the assert?
>>>
>>
>> I am afraid I am not getting what exactly you meant here.
> 
> One way would be to apply a patch like the below and quote the
> output from the last "Changes:" onward.
> 
> Richard
> 

Thanks.

This is the dump of use at assert point.

use of superceded set r178:i131 (V2DI pseudo) by insn i133 in bb2 [ebb2] at 
point 180
  defined in bb2 [ebb2] at point 108

This is the dump of change.

deletion of insn i130 in bb2 [ebb2] at point 106:
  deleted
  uses:
use of set r219:i291 (DI pseudo)
  appears inside an address
superceded use of set mem:i114
  defines:
set r177:i131 (OO pseudo)
  used by insn i132 in bb2 [ebb2] at point 178
change to insn i131 in bb2 [ebb2] at point 108:
  +--
  |  131: r177:OO=unspec[[r219:DI]] 101
  +--
  uses:
superceded use of set r219:i291 (DI pseudo)
  appears inside an address
superceded use of set mem:i114
  defines:
superceded set r178:i131 (V2DI pseudo)
  used by insn i133 in bb2 [ebb2] at point 180
  ~~~
  new cost: 2147483647
  new uses:
use of set r219:i291 (DI pseudo) by insn i131 in bb2 [ebb2] at point 108
  defined in bb2 [ebb2] at point 62
  appears inside an address
use of set mem:i114 by insn i131 in bb2 [ebb2] at point 108
  defined in bb2 [ebb2] at point 104
  new defs:
set r177:i131 (OO pseudo) in bb2 [ebb2] at point 108
  used by insn i132 in bb2 [ebb2] at point 178
  first insert-after candidate: insn i131 in bb2 [ebb2] at point 108
  last insert-after candidate: insn i131 in bb2 [ebb2] at point 108

Thanks & Regards
Ajit
> 
> diff --git a/gcc/rtl-ssa/changes.cc b/gcc/rtl-ssa/changes.cc
> index 11639e81bb7..694760138bb 100644
> --- a/gcc/rtl-ssa/changes.cc
> +++ b/gcc/rtl-ssa/changes.cc
> @@ -249,6 +249,8 @@ function_info::process_uses_of_deleted_def (set_info *set)
>   }
>else
>   {
> +   if (!use->is_live_out_use ())
> + debug (use);
> gcc_assert (use->is_live_out_use ());
> remove_use (use);
>   }
> @@ -830,6 +832,9 @@ function_info::change_insns (array_slice 
> changes)
>//
>// In particular, this means that consumers must handle debug
>// instructions before removing a set.
> +  fprintf (stderr, "Changes:\n");
> +  for (insn_change *change : changes)
> +debug (*change);
>for (insn_change *change : changes)
>  for (def_info *def : change->old_defs ())
>if (def->m_has_been_superceded)


Re: [Patch, rs6000, middle-end] v2: Add implementation for different targets for pair mem fusion

2024-06-19 Thread Ajit Agarwal
Hello Richard:

On 19/06/24 2:40 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> On 19/06/24 1:54 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>>> What happens if you leave the assert alone?  When does it fire?  Is it
>>>>> still for uses in debug insns?  If so, it's the fusion pass's 
>>>>> responsibility
>>>>> to update those, as mentioned above.  And it must update them before,
>>>>> or at the same time as, it deletes the definition.
>>>>>
>>>>
>>>> For debug insn I call reset_debug_use and now I dont see issues
>>>> with debug insn and issues I see with  non debug insn where
>>>> def is there in old_defs and use has to be removed for the insn
>>>> that we modify load with OO UNSPEC to generate lxvp.
>>>
>>> Can you walk me through it step-by-step?  If you leave the assert
>>> alone, when does it fire?  What set of insn_changes are being made
>>> when the assert fires?  (Calling debug on the changes will show this.)
>>> And what use does the assert fire on?  (Again, calling debug on the use
>>> will show this.)
>>>
>>
>> (insn 660 735 739 50 (set (reg:OO 405 [ MEM[(_Float128 *)src_196] ])
>> (unspec:OO [
>> (mem:OO (reg/v/f:DI 197 [ base ]) [9 MEM[(_Float128 
>> *)src_196]+0 S16 A128])
>> ] UNSPEC_LXVP))  2188 {*movoo1}
>>  (nil))
>>
>> This is definition.
>>
>> (insn 661 659 662 50 (set (reg:TF 179 [ result$imag ])
>> (plus:TF (reg:TF 179 [ result$imag ])
>> (subreg:TF (reg:OO 405 [ MEM[(_Float128 *)src_196] ]) 0)))  
>> {addtf3}
>>
>> This is use.
>>
>> change has the above definition and the assert fires at the
>> above use.
> 
> But can you call debug on the insn_change that contains the deleted def,
> and call debug on the access_info that triggers the assert?
>

I am afraid I am not getting what exactly you meant here.


 
> Thanks,
> Richard

Thanks & Regards
Ajit


Re: [Patch, rs6000, middle-end] v2: Add implementation for different targets for pair mem fusion

2024-06-19 Thread Ajit Agarwal
Hello Richard:

On 19/06/24 1:54 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>>> What happens if you leave the assert alone?  When does it fire?  Is it
>>> still for uses in debug insns?  If so, it's the fusion pass's responsibility
>>> to update those, as mentioned above.  And it must update them before,
>>> or at the same time as, it deletes the definition.
>>>
>>
>> For debug insns I call reset_debug_use and now I don't see issues
>> with debug insns; the issues I see are with non-debug insns where
>> the def is in old_defs and the use has to be removed for the insn
>> where we modify the load with an OO UNSPEC to generate lxvp.
> 
> Can you walk me through it step-by-step?  If you leave the assert
> alone, when does it fire?  What set of insn_changes are being made
> when the assert fires?  (Calling debug on the changes will show this.)
> And what use does the assert fire on?  (Again, calling debug on the use
> will show this.)
> 

(insn 660 735 739 50 (set (reg:OO 405 [ MEM[(_Float128 *)src_196] ])
(unspec:OO [
(mem:OO (reg/v/f:DI 197 [ base ]) [9 MEM[(_Float128 
*)src_196]+0 S16 A128])
] UNSPEC_LXVP))  2188 {*movoo1}
 (nil))

This is definition.

(insn 661 659 662 50 (set (reg:TF 179 [ result$imag ])
(plus:TF (reg:TF 179 [ result$imag ])
(subreg:TF (reg:OO 405 [ MEM[(_Float128 *)src_196] ]) 0)))  {addtf3}

This is use.

change has the above definition and the assert fires at the
above use.
> Thanks,
> Richard
> 

Thanks & Regards
Ajit
> 
> 


Re: [Patch, rs6000, middle-end] v2: Add implementation for different targets for pair mem fusion

2024-06-19 Thread Ajit Agarwal
Hello Richard:

On 19/06/24 12:52 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 19/06/24 2:01 am, Richard Sandiford wrote:
>> Ajit Agarwal  writes:
>>> Hello Richard:
>>>
>>> On 14/06/24 4:26 pm, Richard Sandiford wrote:
>>>> Ajit Agarwal  writes:
>>>>> Hello Richard:
>>>>>
>>>>> All comments are addressed.
>>>>
>>>> I don't think this addresses the following comments from the previous
>>>> reviews:
>>>>
>>>> (1) It is not correct to mark existing insn uses as live-out.
>>>> The patch mustn't try to do this.
>>>>
>>>
>>> Addressed in v3 of the patch.
>>
>> The new version still tries to circumvent the live-out assert though.
>> While the old patch brute-forced the assert to false by setting
>> the live-out flag, the new patch just removes the assert.
>>
>> Like I said earlier, the assert is showing up a real bug and we
>> should fix the bug rather than suppress the assert.
>>
>> rtl-ssa live-out uses are somewhat like DF_LIVE_OUT in df.
>> They occur at the end of a basic block, in an artificial insn_info
>> that does not correspond to an actual rtl insn.
>>
>> The comment above process_uses_of_deleted_def says:
>>
>> // SET has been deleted.  Clean up all remaining uses.  Such uses are
>> // either dead phis or now-redundant live-out uses.
>>
>> In other words, if we're removing a definition, all uses in "real"
>> debug and non-debug insns must be removed either earlier than the
>> definition or at the same time as the definition.  No such uses
>> should remain.  The only uses that should be left are phis and
>> the fake end-of-block live-out uses that I described above.  These
>> uses are just "plumbing" that support something that is now neither
>> defined nor used by real instructions.  It's therefore safe to delete
>> the plumbing.
>>
> 
> In rtl-ssa/changes.cc I calculate live-out for uses based
> on DF_LIVE_OUT in new function that I defined as is_live_out().
> This function calculates live out for uses based on DF_LIVE_OUT
> for uses. If found to be live out and use is not marked as
> live_out_use then I mark it as use->set_is_live_out_use (true).
> In that case assert is not required.
> 
If it is not found to be live out and is marked as live_out_use false,
then we don't need to check with the assert and should go
ahead and remove the use.
> 
Did you find any issues with the above implementation in
version 3 of the patch?
> 
>> Please see the previous discussion about this:
>>
>> 
>>>>>>> +// Check whether load can be fusable or not.
>>>>>>> +// Return true if fuseable otherwise false.
>>>>>>> +bool
>>>>>>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>>>>>>> +{
>>>>>>> +  for (auto def : info->defs())
>>>>>>> +{
>>>>>>> +  auto set = dyn_cast (def);
>>>>>>> +  for (auto use1 : set->nondebug_insn_uses ())
>>>>>>> + use1->set_is_live_out_use (true);
>>>>>>> +}
>>>>>>
>>>>>> What was the reason for adding this loop?
>>>>>>
>>>>>
>>>>> The purpose of adding is to avoid assert failure in 
>>>>> gcc/rtl-ssa/changes.cc:252
>>>>
>>>> That assert is making sure that we don't delete a definition of a
>>>> register (or memory) while a real insn still uses it.  If the assert
>>>> is firing then something has gone wrong.
>>>>
>>>> Live-out uses are a particular kind of use that occur at the end of
>>>> basic blocks.  It's incorrect to mark normal insn uses as live-out.
>>>>
>>>> When an assert fails, it's important to understand why the failure
>>>> occurs, rather than brute-force the assert condition to true.
>>>>
>>>
>>> The above assert failure occurs when there is a debug insn and its
>>> use is not live-out.
>>
>> Uses in debug insns are never live-out uses.
>>
>> It sounds like the bug is that we're failing to update all debug uses of
>> the original register.  We need to do that, or "reset" the debug insn if
>> substitution fails for some reason.
>>
>> See fixup_debug_uses for what the target-independent part of 

Re: [Patch, rs6000, middle-end] v2: Add implementation for different targets for pair mem fusion

2024-06-19 Thread Ajit Agarwal
Hello Richard:

On 19/06/24 2:01 am, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> On 14/06/24 4:26 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> Hello Richard:
>>>>
>>>> All comments are addressed.
>>>
>>> I don't think this addresses the following comments from the previous
>>> reviews:
>>>
>>> (1) It is not correct to mark existing insn uses as live-out.
>>> The patch mustn't try to do this.
>>>
>>
>> Addressed in v3 of the patch.
> 
> The new version still tries to circumvent the live-out assert though.
> While the old patch brute-forced the assert to false by setting
> the live-out flag, the new patch just removes the assert.
> 
> Like I said earlier, the assert is showing up a real bug and we
> should fix the bug rather than suppress the assert.
> 
> rtl-ssa live-out uses are somewhat like DF_LIVE_OUT in df.
> They occur at the end of a basic block, in an artificial insn_info
> that does not correspond to an actual rtl insn.
> 
> The comment above process_uses_of_deleted_def says:
> 
> // SET has been deleted.  Clean up all remaining uses.  Such uses are
> // either dead phis or now-redundant live-out uses.
> 
> In other words, if we're removing a definition, all uses in "real"
> debug and non-debug insns must be removed either earlier than the
> definition or at the same time as the definition.  No such uses
> should remain.  The only uses that should be left are phis and
> the fake end-of-block live-out uses that I described above.  These
> uses are just "plumbing" that support something that is now neither
> defined nor used by real instructions.  It's therefore safe to delete
> the plumbing.
> 

In rtl-ssa/changes.cc I calculate live-out for uses based
on DF_LIVE_OUT in new function that I defined as is_live_out().
This function calculates live out for uses based on DF_LIVE_OUT
for uses. If found to be live out and use is not marked as
live_out_use then I mark it as use->set_is_live_out_use (true).
In that case assert is not required.

If it is not found to be live out and is marked as live_out_use false,
then we don't need to check with the assert and should go
ahead and remove the use.

Did you find any issues with the above implementation in
version 3 of the patch?

> Please see the previous discussion about this:
> 
> 
>>>>>> +// Check whether load can be fusable or not.
>>>>>> +// Return true if fuseable otherwise false.
>>>>>> +bool
>>>>>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>>>>>> +{
>>>>>> +  for (auto def : info->defs())
>>>>>> +{
>>>>>> +  auto set = dyn_cast (def);
>>>>>> +  for (auto use1 : set->nondebug_insn_uses ())
>>>>>> + use1->set_is_live_out_use (true);
>>>>>> +}
>>>>>
>>>>> What was the reason for adding this loop?
>>>>>
>>>>
>>>> The purpose of adding is to avoid assert failure in 
>>>> gcc/rtl-ssa/changes.cc:252
>>>
>>> That assert is making sure that we don't delete a definition of a
>>> register (or memory) while a real insn still uses it.  If the assert
>>> is firing then something has gone wrong.
>>>
>>> Live-out uses are a particular kind of use that occur at the end of
>>> basic blocks.  It's incorrect to mark normal insn uses as live-out.
>>>
>>> When an assert fails, it's important to understand why the failure
>>> occurs, rather than brute-force the assert condition to true.
>>>
>>
>> The above assert failure occurs when there is a debug insn and its
>> use is not live-out.
> 
> Uses in debug insns are never live-out uses.
> 
> It sounds like the bug is that we're failing to update all debug uses of
> the original register.  We need to do that, or "reset" the debug insn if
> substitution fails for some reason.
> 
> See fixup_debug_uses for what the target-independent part of the pass
> does for debug insns that are affected by movement.  Hopefully the
> update needed here will be simpler than that.
> 
> 
> What happens if you leave the assert alone?  When does it fire?  Is it
> still for uses in debug insns?  If so, it's the fusion pass's responsibility
> to update those, as mentioned above.  And it must update them before,
> or at the same time as, i

Re: [Patch, rs6000, middle-end] v2: Add implementation for different targets for pair mem fusion

2024-06-18 Thread Ajit Agarwal
Hello Richard:

On 14/06/24 4:26 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> All comments are addressed.
> 
> I don't think this addresses the following comments from the previous
> reviews:
> 
> (1) It is not correct to mark existing insn uses as live-out.
> The patch mustn't try to do this.
> 

Addressed in v3 of the patch.

> (2) To quote a previous review:
> 
> It's probably better to create a fresh OO register, rather than
> change an existing 128-bit register to 256 bits.  If we do that,
> and if reg:V16QI 125 is the destination of the second load
> (which I assume it is from the 16 offset in the subreg),
> then the new RTL should be:
> 
>   (vec_select:HI (subreg:V8HI (reg:OO NEW_REG) 16) ...)
> 
> It's possible to get this by using insn_propagation to replace
> (reg:V16QI 125) with (subreg:V16QI (reg:OO NEW_REG) 16).
> insn_propagation should then take care of the rest.
> 
> There are no existing rtl-ssa routines for handling new registers
> though.  (The idea was to add things as the need arose.)
> 
> The reason for (2) is that changing the mode of an existing pseudo
> invalidates all existing references to that pseudo.  Although the
> patch tries to fix things up, it's doing that at a stage where
> there is already "garbage in" (in the sense that the starting
> RTL is invalid).  Just changing the mode would also invalidate
> things like REG_EXPR, for example.
> 
> In contrast, the advantage of creating a new pseudo means that every
> insn transformation is from structurally valid RTL to structurally
> valid RTL.  It also prevents information being incorrectly carried
> over from the old pseudo.

Addressed in v3 of the patch.

> Thanks,
> Richard

Thanks & Regards
Ajit


[Patch, rs6000, middle-end] v3: Add implementation for different targets for pair mem fusion

2024-06-18 Thread Ajit Agarwal
Hello Richard:

All comments are addressed.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-06-18  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function added for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config/rs6000/mma.md: Modify movoo machine description.
Add new machine description movoo1.
* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
to expand movoo machine description for all constraints.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/changes.cc: Add new is_live_out function and use of
same.
* rtl-ssa/functions.h: Move out allocate function from private
to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/mma.md  |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc| 731 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/rs6000.cc   |  55 +-
 gcc/config/rs6000/rs6000.md   |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  26 +-
 gcc/pair-fusion.h |  28 +
 gcc/rtl-ssa/changes.cc|  34 +-
 gcc/rtl-ssa/functions.h   |   7 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 15 files changed, 935 insertions(+), 26 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index e500ba63e32..348308b2e93 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 04e2d0066df..88413926a02 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -294,7 +294,31 @@
 
 (define_insn_and_split "*movoo"
   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
-   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+(match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+  "TARGET_MMA
+   && (gpc_reg_operand (operands[0], OOmode)
+   || gpc_reg_operand (operands[1], OOmode))"
+;;""
+  "@
+   #
+   #
+   #"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecstore,veclogical")
+   

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-12 Thread Ajit Agarwal
Hello Richard:

On 12/06/24 3:02 am, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> On 11/06/24 9:41 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>>>> Thanks a lot. Can I know what should we be doing with neg (fma)
>>>>>> correctness failures with load fusion.
>>>>>
>>>>> I think it would involve:
>>>>>
>>>>> - describing lxvp and stxvp as unspec patterns, as I mentioned
>>>>>   in the previous reply
>>>>>
>>>>> - making plain movoo split loads and stores into individual
>>>>>   lxv and stxvs.  (Or, alternative, it could use lxvp and stxvp,
>>>>>   but internally swap the registers after load and before store.)
>>>>>   That is, movoo should load the lower-numbered register from the
>>>>>   lower address and the higher-numbered register from the higher
>>>>>   address, and likewise for stores.
>>>>>
>>>>
>>>> Would you mind elaborating the above.
>>>
>>> I think movoo should use rs6000_split_multireg_move for all alternatives,
>>> like movxo does.  movoo should split into 2 V1TI loads/stores and movxo
>>> should split into 4 V1TI loads/stores.  lxvp and stxvp would be
>>> independent patterns of the form:
>>>
>>>   (set ...
>>>(unspec [...] UNSPEC_FOO))
>>>
>>> ---
>>>
>>
>> In load fusion pass I generate the above pattern for adjacent merge
>> pairs.
>>
>>> rs6000_split_multireg_move has:
>>>
>>>   /* The __vector_pair and __vector_quad modes are multi-register
>>>  modes, so if we have to load or store the registers, we have to be
>>>  careful to properly swap them if we're in little endian mode
>>>  below.  This means the last register gets the first memory
>>>  location.  We also need to be careful of using the right register
>>>  numbers if we are splitting XO to OO.  */
>>>
>>> But I don't see how this can work reliably if we allow the kind of
>>> subregs that you want to create here.  The register order is the opposite
>>> from the one that GCC expects.
>>>
>>> This is more a question for the PowerPC maintainers though.
>>>
>>
>> Above unspec pattern generated and modified the movoo pattern to accept
>> the above spec it goes through the rs6000_split_multireg_move
>> it splits into 2 VITI loads and generate consecutive loads with sequential
>> registers. In load_fusion pass I generate the subreg along with load results 
>> subreg (reg OO R) 16 and subreg (reg OO R) 0.
>>
>> But it doesnt generate lxvp instruction. If above unspec instruction
>> pattern and write separate pattern in md file to generate lxvp instead of
>> normal movoo, then it won't go through rs6000_split_multireg_move
> 
> I don't understand the last bit, sorry.  Under the scheme I described,
> lxvp should be generated only through an unspec (and no other way).
> Same for stxvp.  The fusion pass should generate those unspecs.
> 
> If the fusion pass has generated the code correctly, the lxvp unspec
> will remain throughout compilation, unless all uses of it are later
> deleted as dead.
> 
> The movoo rtl pattern should continue to be:
> 
>   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
>   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
> 
> But movoo should generate individual loads, stores and moves.  By design,
> it should never generate lxvp or stxvp.
> 
> This means that, if a fused load is spilled, the sequence will be
> something like:
> 
>   lxvp ...   // original fused load (unspec)
>   ...
>   stxv ...   // store one half to the stack (split from movoo)
>   stxv ...   // store the other half to the stack (split from movoo)
> 
> Then insns that use the pair will load whichever half they need
> from the stack.
> 
> I realise that isn't great, but it should at least be correct.
> 

Thanks a lot. It worked.

> Thanks,
> Richard

Thanks & Regards
Ajit


Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-12 Thread Ajit Agarwal
Hello Richard:

On 12/06/24 3:02 am, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> On 11/06/24 9:41 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>>>> Thanks a lot. Can I know what should we be doing with neg (fma)
>>>>>> correctness failures with load fusion.
>>>>>
>>>>> I think it would involve:
>>>>>
>>>>> - describing lxvp and stxvp as unspec patterns, as I mentioned
>>>>>   in the previous reply
>>>>>
>>>>> - making plain movoo split loads and stores into individual
>>>>>   lxv and stxvs.  (Or, alternative, it could use lxvp and stxvp,
>>>>>   but internally swap the registers after load and before store.)
>>>>>   That is, movoo should load the lower-numbered register from the
>>>>>   lower address and the higher-numbered register from the higher
>>>>>   address, and likewise for stores.
>>>>>
>>>>
>>>> Would you mind elaborating the above.
>>>
>>> I think movoo should use rs6000_split_multireg_move for all alternatives,
>>> like movxo does.  movoo should split into 2 V1TI loads/stores and movxo
>>> should split into 4 V1TI loads/stores.  lxvp and stxvp would be
>>> independent patterns of the form:
>>>
>>>   (set ...
>>>(unspec [...] UNSPEC_FOO))
>>>
>>> ---
>>>
>>
>> In load fusion pass I generate the above pattern for adjacent merge
>> pairs.
>>
>>> rs6000_split_multireg_move has:
>>>
>>>   /* The __vector_pair and __vector_quad modes are multi-register
>>>  modes, so if we have to load or store the registers, we have to be
>>>  careful to properly swap them if we're in little endian mode
>>>  below.  This means the last register gets the first memory
>>>  location.  We also need to be careful of using the right register
>>>  numbers if we are splitting XO to OO.  */
>>>
>>> But I don't see how this can work reliably if we allow the kind of
>>> subregs that you want to create here.  The register order is the opposite
>>> from the one that GCC expects.
>>>
>>> This is more a question for the PowerPC maintainers though.
>>>
>>
>> Above unspec pattern generated and modified the movoo pattern to accept
>> the above spec it goes through the rs6000_split_multireg_move
>> it splits into 2 VITI loads and generate consecutive loads with sequential
>> registers. In load_fusion pass I generate the subreg along with load results 
>> subreg (reg OO R) 16 and subreg (reg OO R) 0.
>>
>> But it doesnt generate lxvp instruction. If above unspec instruction
>> pattern and write separate pattern in md file to generate lxvp instead of
>> normal movoo, then it won't go through rs6000_split_multireg_move
> 
> I don't understand the last bit, sorry.  Under the scheme I described,
> lxvp should be generated only through an unspec (and no other way).
> Same for stxvp.  The fusion pass should generate those unspecs.
> 
> If the fusion pass has generated the code correctly, the lxvp unspec
> will remain throughout compilation, unless all uses of it are later
> deleted as dead.
> 
> The movoo rtl pattern should continue to be:
> 
>   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
>   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
> 
> But movoo should generate individual loads, stores and moves.  By design,
> it should never generate lxvp or stxvp.
> 
> This means that, if a fused load is spilled, the sequence will be
> something like:
> 
>   lxvp ...   // original fused load (unspec)
>   ...
>   stxv ...   // store one half to the stack (split from movoo)
>   stxv ...   // store the other half to the stack (split from movoo)
> 
> Then insns that use the pair will load whichever half they need
> from the stack.
> 
> I realise that isn't great, but it should at least be correct.
> 

Thanks a lot. It worked.

> Thanks,
> Richard

Thanks & Regards
Ajit


Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 11/06/24 9:41 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>>>> Thanks a lot. Can I know what should we be doing with neg (fma)
>>>> correctness failures with load fusion.
>>>
>>> I think it would involve:
>>>
>>> - describing lxvp and stxvp as unspec patterns, as I mentioned
>>>   in the previous reply
>>>
>>> - making plain movoo split loads and stores into individual
>>>   lxv and stxvs.  (Or, alternative, it could use lxvp and stxvp,
>>>   but internally swap the registers after load and before store.)
>>>   That is, movoo should load the lower-numbered register from the
>>>   lower address and the higher-numbered register from the higher
>>>   address, and likewise for stores.
>>>
>>
>> Would you mind elaborating the above.
> 
> I think movoo should use rs6000_split_multireg_move for all alternatives,
> like movxo does.  movoo should split into 2 V1TI loads/stores and movxo
> should split into 4 V1TI loads/stores.  lxvp and stxvp would be
> independent patterns of the form:
> 
>   (set ...
>(unspec [...] UNSPEC_FOO))
> 
> ---
> 

In load fusion pass I generate the above pattern for adjacent merge
pairs.

> rs6000_split_multireg_move has:
> 
>   /* The __vector_pair and __vector_quad modes are multi-register
>  modes, so if we have to load or store the registers, we have to be
>  careful to properly swap them if we're in little endian mode
>  below.  This means the last register gets the first memory
>  location.  We also need to be careful of using the right register
>  numbers if we are splitting XO to OO.  */
> 
> But I don't see how this can work reliably if we allow the kind of
> subregs that you want to create here.  The register order is the opposite
> from the one that GCC expects.
> 
> This is more a question for the PowerPC maintainers though.
>

I generated the above unspec pattern and modified the movoo pattern to
accept it; it then goes through rs6000_split_multireg_move, which splits
it into 2 V1TI loads and generates consecutive loads with sequential
registers. In the load-fusion pass I generate the subregs along with the
load results: subreg (reg OO R) 16 and subreg (reg OO R) 0.

But it doesn't generate the lxvp instruction. If I use the above unspec
instruction pattern and write a separate pattern in the md file to generate
lxvp instead of the normal movoo, then it won't go through
rs6000_split_multireg_move.

> And this is one of the (admittedly many) times when I wish GCC's
> subreg model was more like LLVM's. :)
> 
> Thanks,
> Richard

Thanks & Regards
Ajit


Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 11/06/24 8:59 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 11/06/24 7:07 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> Hello Richard:
>>>> On 11/06/24 6:12 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>> Hello Richard:
>>>>>>
>>>>>> On 11/06/24 5:15 pm, Richard Sandiford wrote:
>>>>>>> Ajit Agarwal  writes:
>>>>>>>> Hello Richard:
>>>>>>>> On 11/06/24 4:56 pm, Ajit Agarwal wrote:
>>>>>>>>> Hello Richard:
>>>>>>>>>
>>>>>>>>> On 11/06/24 4:36 pm, Richard Sandiford wrote:
>>>>>>>>>> Ajit Agarwal  writes:
>>>>>>>>>>>>>>> After LRA reload:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 
>>>>>>>>>>>>>>> vect__302.545 ] [240])
>>>>>>>>>>>>>>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] 
>>>>>>>>>>>>>>> [1285])
>>>>>>>>>>>>>>> (const_int 16 [0x10])) [1 MEM >>>>>>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4188]+16 S16 A64])) 
>>>>>>>>>>>>>>> "shell_lam.fppized.f":238:72 1190 {vsx_movv2df_64bit}
>>>>>>>>>>>>>>>  (nil))
>>>>>>>>>>>>>>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 
>>>>>>>>>>>>>>> vect__302.545 ] [240])
>>>>>>>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM >>>>>>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050]+16 ])
>>>>>>>>>>>>>>> (reg:V2DF 44 12 [3119])
>>>>>>>>>>>>>>> (neg:V2DF (reg:V2DF 51 19 [orig:240 
>>>>>>>>>>>>>>> vect__302.545 ] [240]) {*vsx_nfmsv2df4}
>>>>>>>>>>>>>>>  (nil))
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> (insn 2473 9311 9312 187 (set (reg:V2DF 38 6 [orig:905 
>>>>>>>>>>>>>>> vect__302.545 ] [905])
>>>>>>>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 44 12 [3119])
>>>>>>>>>>>>>>> (reg:V2DF 38 6 [orig:2561 MEM >>>>>>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ] [2561])
>>>>>>>>>>>>>>> (neg:V2DF (reg:V2DF 47 15 [5266]) 
>>>>>>>>>>>>>>> {*vsx_nfmsv2df4}
>>>>>>>>>>>>>>>  (nil))
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> In the above allocated code it assign registers 51 and 47 and 
>>>>>>>>>>>>>>> they are not sequential.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> The reload for 2412 looks valid.  What was the original 
>>>>>>>>>>>>>> pre-reload
>>>>>>>>>>>>>> version of insn 2473?  Also, what happened to insn 2472?  Was it 
>>>>>>>>>>>>>> deleted?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> This is preload version of 2473:
>>>>>>>>>>>>>
>>>>>>>>>>>>> (insn 2473 2396 2478 161 (set (reg:V2DF 905 [ vect__302.545 ])
>>>>>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 4283 [3119])
>>>>>>>>>>>>> (subreg:V2DF (reg:OO 2561 [ MEM >>>>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ]) 0)
>>>>>>>>>>>>> (neg:V2DF (subreg:V2DF (reg:OO 2572 [ 
>>>>>>>>>>>

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 11/06/24 7:07 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>> On 11/06/24 6:12 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> Hello Richard:
>>>>
>>>> On 11/06/24 5:15 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>> Hello Richard:
>>>>>> On 11/06/24 4:56 pm, Ajit Agarwal wrote:
>>>>>>> Hello Richard:
>>>>>>>
>>>>>>> On 11/06/24 4:36 pm, Richard Sandiford wrote:
>>>>>>>> Ajit Agarwal  writes:
>>>>>>>>>>>>> After LRA reload:
>>>>>>>>>>>>>
>>>>>>>>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 
>>>>>>>>>>>>> vect__302.545 ] [240])
>>>>>>>>>>>>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] 
>>>>>>>>>>>>> [1285])
>>>>>>>>>>>>> (const_int 16 [0x10])) [1 MEM >>>>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4188]+16 S16 A64])) 
>>>>>>>>>>>>> "shell_lam.fppized.f":238:72 1190 {vsx_movv2df_64bit}
>>>>>>>>>>>>>  (nil))
>>>>>>>>>>>>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 
>>>>>>>>>>>>> vect__302.545 ] [240])
>>>>>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM >>>>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050]+16 ])
>>>>>>>>>>>>> (reg:V2DF 44 12 [3119])
>>>>>>>>>>>>> (neg:V2DF (reg:V2DF 51 19 [orig:240 vect__302.545 
>>>>>>>>>>>>> ] [240]) {*vsx_nfmsv2df4}
>>>>>>>>>>>>>  (nil))
>>>>>>>>>>>>>
>>>>>>>>>>>>> (insn 2473 9311 9312 187 (set (reg:V2DF 38 6 [orig:905 
>>>>>>>>>>>>> vect__302.545 ] [905])
>>>>>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 44 12 [3119])
>>>>>>>>>>>>> (reg:V2DF 38 6 [orig:2561 MEM >>>>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ] [2561])
>>>>>>>>>>>>> (neg:V2DF (reg:V2DF 47 15 [5266]) 
>>>>>>>>>>>>> {*vsx_nfmsv2df4}
>>>>>>>>>>>>>  (nil))
>>>>>>>>>>>>>
>>>>>>>>>>>>> In the above allocated code it assign registers 51 and 47 and 
>>>>>>>>>>>>> they are not sequential.
>>>>>>>>>>>>
>>>>>>>>>>>> The reload for 2412 looks valid.  What was the original pre-reload
>>>>>>>>>>>> version of insn 2473?  Also, what happened to insn 2472?  Was it 
>>>>>>>>>>>> deleted?
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> This is preload version of 2473:
>>>>>>>>>>>
>>>>>>>>>>> (insn 2473 2396 2478 161 (set (reg:V2DF 905 [ vect__302.545 ])
>>>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 4283 [3119])
>>>>>>>>>>> (subreg:V2DF (reg:OO 2561 [ MEM >>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ]) 0)
>>>>>>>>>>> (neg:V2DF (subreg:V2DF (reg:OO 2572 [ 
>>>>>>>>>>> vect__300.543_236 ]) 0) {*vsx_nfmsv2df4}
>>>>>>>>>>>  (expr_list:REG_DEAD (reg:OO 2572 [ vect__300.543_236 ])
>>>>>>>>>>> (expr_list:REG_DEAD (reg:OO 2561 [ MEM >>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ])
>>>>>>>>>>> (nil
>>>>>>>>>>>
>>>>>>>>>>> insn 2472 is replaced with 9299 after reload.
>>>>>>>>>>
>>>>>&g

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 11/06/24 6:12 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> On 11/06/24 5:15 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> Hello Richard:
>>>> On 11/06/24 4:56 pm, Ajit Agarwal wrote:
>>>>> Hello Richard:
>>>>>
>>>>> On 11/06/24 4:36 pm, Richard Sandiford wrote:
>>>>>> Ajit Agarwal  writes:
>>>>>>>>>>> After LRA reload:
>>>>>>>>>>>
>>>>>>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 
>>>>>>>>>>> vect__302.545 ] [240])
>>>>>>>>>>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] 
>>>>>>>>>>> [1285])
>>>>>>>>>>> (const_int 16 [0x10])) [1 MEM >>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4188]+16 S16 A64])) 
>>>>>>>>>>> "shell_lam.fppized.f":238:72 1190 {vsx_movv2df_64bit}
>>>>>>>>>>>  (nil))
>>>>>>>>>>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 
>>>>>>>>>>> vect__302.545 ] [240])
>>>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM >>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050]+16 ])
>>>>>>>>>>> (reg:V2DF 44 12 [3119])
>>>>>>>>>>> (neg:V2DF (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>>>>>>> [240]) {*vsx_nfmsv2df4}
>>>>>>>>>>>  (nil))
>>>>>>>>>>>
>>>>>>>>>>> (insn 2473 9311 9312 187 (set (reg:V2DF 38 6 [orig:905 
>>>>>>>>>>> vect__302.545 ] [905])
>>>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 44 12 [3119])
>>>>>>>>>>> (reg:V2DF 38 6 [orig:2561 MEM >>>>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ] [2561])
>>>>>>>>>>> (neg:V2DF (reg:V2DF 47 15 [5266]) 
>>>>>>>>>>> {*vsx_nfmsv2df4}
>>>>>>>>>>>  (nil))
>>>>>>>>>>>
>>>>>>>>>>> In the above allocated code it assign registers 51 and 47 and they 
>>>>>>>>>>> are not sequential.
>>>>>>>>>>
>>>>>>>>>> The reload for 2412 looks valid.  What was the original pre-reload
>>>>>>>>>> version of insn 2473?  Also, what happened to insn 2472?  Was it 
>>>>>>>>>> deleted?
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> This is preload version of 2473:
>>>>>>>>>
>>>>>>>>> (insn 2473 2396 2478 161 (set (reg:V2DF 905 [ vect__302.545 ])
>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 4283 [3119])
>>>>>>>>> (subreg:V2DF (reg:OO 2561 [ MEM >>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ]) 0)
>>>>>>>>> (neg:V2DF (subreg:V2DF (reg:OO 2572 [ 
>>>>>>>>> vect__300.543_236 ]) 0) {*vsx_nfmsv2df4}
>>>>>>>>>  (expr_list:REG_DEAD (reg:OO 2572 [ vect__300.543_236 ])
>>>>>>>>> (expr_list:REG_DEAD (reg:OO 2561 [ MEM >>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ])
>>>>>>>>> (nil
>>>>>>>>>
>>>>>>>>> insn 2472 is replaced with 9299 after reload.
>>>>>>>>
>>>>>>>> You'd have to check the dumps to be sure, but I think 9299 is instead
>>>>>>>> generated as an input reload of 2412, rather than being a replacement
>>>>>>>> of insn 2472.  T
>>>>>>>
>>>>>>> Yes it is generated for 2412. The predecessor of 2412 is load from
>>>>>>> plus offset as in 2412 we have subreg:V2DF (reg OO 2572) 16).
>>>>>>>
>>>>>>> This is not correct as we are not generating lxvp and it is 
>>>>>>>

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 11/06/24 4:56 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 11/06/24 4:36 pm, Richard Sandiford wrote:
>> Ajit Agarwal  writes:
>>>>>>> After LRA reload:
>>>>>>>
>>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>>> [240])
>>>>>>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] [1285])
>>>>>>> (const_int 16 [0x10])) [1 MEM  
>>>>>>> [(real(kind=8) *)_4188]+16 S16 A64])) "shell_lam.fppized.f":238:72 1190 
>>>>>>> {vsx_movv2df_64bit}
>>>>>>>  (nil))
>>>>>>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>>> [240])
>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM >>>>>> real(kind=8)> [(real(kind=8) *)_4050]+16 ])
>>>>>>> (reg:V2DF 44 12 [3119])
>>>>>>> (neg:V2DF (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>>> [240]) {*vsx_nfmsv2df4}
>>>>>>>  (nil))
>>>>>>>
>>>>>>> (insn 2473 9311 9312 187 (set (reg:V2DF 38 6 [orig:905 vect__302.545 ] 
>>>>>>> [905])
>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 44 12 [3119])
>>>>>>> (reg:V2DF 38 6 [orig:2561 MEM  
>>>>>>> [(real(kind=8) *)_4050] ] [2561])
>>>>>>> (neg:V2DF (reg:V2DF 47 15 [5266]) {*vsx_nfmsv2df4}
>>>>>>>  (nil))
>>>>>>>
>>>>>>> In the above allocated code it assign registers 51 and 47 and they are 
>>>>>>> not sequential.
>>>>>>
>>>>>> The reload for 2412 looks valid.  What was the original pre-reload
>>>>>> version of insn 2473?  Also, what happened to insn 2472?  Was it deleted?
>>>>>>
>>>>>
>>>>> This is preload version of 2473:
>>>>>
>>>>> (insn 2473 2396 2478 161 (set (reg:V2DF 905 [ vect__302.545 ])
>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 4283 [3119])
>>>>> (subreg:V2DF (reg:OO 2561 [ MEM  
>>>>> [(real(kind=8) *)_4050] ]) 0)
>>>>> (neg:V2DF (subreg:V2DF (reg:OO 2572 [ vect__300.543_236 
>>>>> ]) 0) {*vsx_nfmsv2df4}
>>>>>  (expr_list:REG_DEAD (reg:OO 2572 [ vect__300.543_236 ])
>>>>> (expr_list:REG_DEAD (reg:OO 2561 [ MEM  
>>>>> [(real(kind=8) *)_4050] ])
>>>>> (nil
>>>>>
>>>>> insn 2472 is replaced with 9299 after reload.
>>>>
>>>> You'd have to check the dumps to be sure, but I think 9299 is instead
>>>> generated as an input reload of 2412, rather than being a replacement
>>>> of insn 2472.  T
>>>
>>> Yes it is generated for 2412. The predecessor of 2412 is load from
>>> plus offset as in 2412 we have subreg:V2DF (reg OO 2572) 16).
>>>
>>> This is not correct as we are not generating lxvp and it is 
>>> normal load lxv.
>>> As normal load is generated in predecessor insn of 2412 with
>>> plus constant offset it breaks the correctness.
>>
>> Not using lxvp is a deliberate choice though.
>>
>> If a (reg:OO R) has been spilled, there's no requirement for LRA
>> to load both halves of R when only one half is needed.  LRA just
>> loads what it needs into whichever registers happen to be free.
>>
>> If the reload of R instead used lxvp, LRA would be forced to free
>> up another register for the other half of R, even though that value
>> would never be used.
>>
> 
> If a (reg:OO R ) 16 is loaded when it is spilled then loaded value
> will be from plus offset 16 instead it should be loaded value 
> from zero offset. As in load fusion pass we are replacing
> (reg:V2DI R) with subreg (reg:OO R) 16 and hence loaded value
> is from plus 16 offsets and thats why its breaking the correctness.
> 
> Similarly we are replacing (reg:V2DI R) 16 with subreg (reg:OO R) 0
> and loaded value is from 16 offset instead its loading from zero
> offset and thats why we are breaking the correctness.
> 

If a (reg:OO R ) 16 is loaded when it is spilled then loaded value
will be from plus offset 16 instead it should be loaded value 
from zero offset. As in load fusion pass we are replacing
(reg:V2

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 11/06/24 4:56 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 11/06/24 4:36 pm, Richard Sandiford wrote:
>> Ajit Agarwal  writes:
>>>>>>> After LRA reload:
>>>>>>>
>>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>>> [240])
>>>>>>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] [1285])
>>>>>>> (const_int 16 [0x10])) [1 MEM  
>>>>>>> [(real(kind=8) *)_4188]+16 S16 A64])) "shell_lam.fppized.f":238:72 1190 
>>>>>>> {vsx_movv2df_64bit}
>>>>>>>  (nil))
>>>>>>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>>> [240])
>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM >>>>>> real(kind=8)> [(real(kind=8) *)_4050]+16 ])
>>>>>>> (reg:V2DF 44 12 [3119])
>>>>>>> (neg:V2DF (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>>> [240]) {*vsx_nfmsv2df4}
>>>>>>>  (nil))
>>>>>>>
>>>>>>> (insn 2473 9311 9312 187 (set (reg:V2DF 38 6 [orig:905 vect__302.545 ] 
>>>>>>> [905])
>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 44 12 [3119])
>>>>>>> (reg:V2DF 38 6 [orig:2561 MEM  
>>>>>>> [(real(kind=8) *)_4050] ] [2561])
>>>>>>> (neg:V2DF (reg:V2DF 47 15 [5266]) {*vsx_nfmsv2df4}
>>>>>>>  (nil))
>>>>>>>
>>>>>>> In the above allocated code it assign registers 51 and 47 and they are 
>>>>>>> not sequential.
>>>>>>
>>>>>> The reload for 2412 looks valid.  What was the original pre-reload
>>>>>> version of insn 2473?  Also, what happened to insn 2472?  Was it deleted?
>>>>>>
>>>>>
>>>>> This is preload version of 2473:
>>>>>
>>>>> (insn 2473 2396 2478 161 (set (reg:V2DF 905 [ vect__302.545 ])
>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 4283 [3119])
>>>>> (subreg:V2DF (reg:OO 2561 [ MEM  
>>>>> [(real(kind=8) *)_4050] ]) 0)
>>>>> (neg:V2DF (subreg:V2DF (reg:OO 2572 [ vect__300.543_236 
>>>>> ]) 0) {*vsx_nfmsv2df4}
>>>>>  (expr_list:REG_DEAD (reg:OO 2572 [ vect__300.543_236 ])
>>>>> (expr_list:REG_DEAD (reg:OO 2561 [ MEM  
>>>>> [(real(kind=8) *)_4050] ])
>>>>> (nil
>>>>>
>>>>> insn 2472 is replaced with 9299 after reload.
>>>>
>>>> You'd have to check the dumps to be sure, but I think 9299 is instead
>>>> generated as an input reload of 2412, rather than being a replacement
>>>> of insn 2472.  T
>>>
>>> Yes it is generated for 2412. The predecessor of 2412 is load from
>>> plus offset as in 2412 we have subreg:V2DF (reg OO 2572) 16).
>>>
>>> This is not correct as we are not generating lxvp and it is 
>>> normal load lxv.
>>> As normal load is generated in predecessor insn of 2412 with
>>> plus constant offset it breaks the correctness.
>>
>> Not using lxvp is a deliberate choice though.
>>
>> If a (reg:OO R) has been spilled, there's no requirement for LRA
>> to load both halves of R when only one half is needed.  LRA just
>> loads what it needs into whichever registers happen to be free.
>>
>> If the reload of R instead used lxvp, LRA would be forced to free
>> up another register for the other half of R, even though that value
>> would never be used.
>>
> 
> If a (reg:OO R ) 16 is loaded when it is spilled then loaded value
> will be from plus offset 16 instead it should be loaded value 
> from zero offset. As in load fusion pass we are replacing
> (reg:V2DI R) with subreg (reg:OO R) 16 and hence loaded value
> is from plus 16 offsets and thats why its breaking the correctness.
> 
> Similarly we are replacing (reg:V2DI R) 16 with subreg (reg:OO R) 0
> and loaded value is from 16 offset instead its loading from zero
> offset and thats why we are breaking the correctness.
> 
If a (reg:OO R ) 16 is loaded when it is spilled then loaded value
will be from plus offset 16 instead it should be loaded value 
from zero offset. As in load fusion pass we are replacing
(reg:V2D

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 11/06/24 5:15 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>> On 11/06/24 4:56 pm, Ajit Agarwal wrote:
>>> Hello Richard:
>>>
>>> On 11/06/24 4:36 pm, Richard Sandiford wrote:
>>>> Ajit Agarwal  writes:
>>>>>>>>> After LRA reload:
>>>>>>>>>
>>>>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 
>>>>>>>>> ] [240])
>>>>>>>>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] [1285])
>>>>>>>>> (const_int 16 [0x10])) [1 MEM >>>>>>>> real(kind=8)> [(real(kind=8) *)_4188]+16 S16 A64])) 
>>>>>>>>> "shell_lam.fppized.f":238:72 1190 {vsx_movv2df_64bit}
>>>>>>>>>  (nil))
>>>>>>>>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 
>>>>>>>>> ] [240])
>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM >>>>>>>> real(kind=8)> [(real(kind=8) *)_4050]+16 ])
>>>>>>>>> (reg:V2DF 44 12 [3119])
>>>>>>>>> (neg:V2DF (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>>>>> [240]) {*vsx_nfmsv2df4}
>>>>>>>>>  (nil))
>>>>>>>>>
>>>>>>>>> (insn 2473 9311 9312 187 (set (reg:V2DF 38 6 [orig:905 vect__302.545 
>>>>>>>>> ] [905])
>>>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 44 12 [3119])
>>>>>>>>> (reg:V2DF 38 6 [orig:2561 MEM >>>>>>>> real(kind=8)> [(real(kind=8) *)_4050] ] [2561])
>>>>>>>>> (neg:V2DF (reg:V2DF 47 15 [5266]) {*vsx_nfmsv2df4}
>>>>>>>>>  (nil))
>>>>>>>>>
>>>>>>>>> In the above allocated code it assign registers 51 and 47 and they 
>>>>>>>>> are not sequential.
>>>>>>>>
>>>>>>>> The reload for 2412 looks valid.  What was the original pre-reload
>>>>>>>> version of insn 2473?  Also, what happened to insn 2472?  Was it 
>>>>>>>> deleted?
>>>>>>>>
>>>>>>>
>>>>>>> This is preload version of 2473:
>>>>>>>
>>>>>>> (insn 2473 2396 2478 161 (set (reg:V2DF 905 [ vect__302.545 ])
>>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 4283 [3119])
>>>>>>> (subreg:V2DF (reg:OO 2561 [ MEM >>>>>> real(kind=8)> [(real(kind=8) *)_4050] ]) 0)
>>>>>>> (neg:V2DF (subreg:V2DF (reg:OO 2572 [ vect__300.543_236 
>>>>>>> ]) 0) {*vsx_nfmsv2df4}
>>>>>>>  (expr_list:REG_DEAD (reg:OO 2572 [ vect__300.543_236 ])
>>>>>>> (expr_list:REG_DEAD (reg:OO 2561 [ MEM  
>>>>>>> [(real(kind=8) *)_4050] ])
>>>>>>> (nil
>>>>>>>
>>>>>>> insn 2472 is replaced with 9299 after reload.
>>>>>>
>>>>>> You'd have to check the dumps to be sure, but I think 9299 is instead
>>>>>> generated as an input reload of 2412, rather than being a replacement
>>>>>> of insn 2472.  T
>>>>>
>>>>> Yes it is generated for 2412. The predecessor of 2412 is load from
>>>>> plus offset as in 2412 we have subreg:V2DF (reg OO 2572) 16).
>>>>>
>>>>> This is not correct as we are not generating lxvp and it is 
>>>>> normal load lxv.
>>>>> As normal load is generated in predecessor insn of 2412 with
>>>>> plus constant offset it breaks the correctness.
>>>>
>>>> Not using lxvp is a deliberate choice though.
>>>>
>>>> If a (reg:OO R) has been spilled, there's no requirement for LRA
>>>> to load both halves of R when only one half is needed.  LRA just
>>>> loads what it needs into whichever registers happen to be free.
>>>>
>>>> If the reload of R instead used lxvp, LRA would be forced to free
>>>> up another register for the other half of R, even though that value
>>>> would never be used.
>>&

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 11/06/24 4:36 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>>>>>> After LRA reload:
>>>>>>
>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>> [240])
>>>>>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] [1285])
>>>>>> (const_int 16 [0x10])) [1 MEM  
>>>>>> [(real(kind=8) *)_4188]+16 S16 A64])) "shell_lam.fppized.f":238:72 1190 
>>>>>> {vsx_movv2df_64bit}
>>>>>>  (nil))
>>>>>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>> [240])
>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM >>>>> real(kind=8)> [(real(kind=8) *)_4050]+16 ])
>>>>>> (reg:V2DF 44 12 [3119])
>>>>>> (neg:V2DF (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>> [240]) {*vsx_nfmsv2df4}
>>>>>>  (nil))
>>>>>>
>>>>>> (insn 2473 9311 9312 187 (set (reg:V2DF 38 6 [orig:905 vect__302.545 ] 
>>>>>> [905])
>>>>>> (neg:V2DF (fma:V2DF (reg:V2DF 44 12 [3119])
>>>>>> (reg:V2DF 38 6 [orig:2561 MEM  
>>>>>> [(real(kind=8) *)_4050] ] [2561])
>>>>>> (neg:V2DF (reg:V2DF 47 15 [5266]) {*vsx_nfmsv2df4}
>>>>>>  (nil))
>>>>>>
>>>>>> In the above allocated code it assign registers 51 and 47 and they are 
>>>>>> not sequential.
>>>>>
>>>>> The reload for 2412 looks valid.  What was the original pre-reload
>>>>> version of insn 2473?  Also, what happened to insn 2472?  Was it deleted?
>>>>>
>>>>
>>>> This is preload version of 2473:
>>>>
>>>> (insn 2473 2396 2478 161 (set (reg:V2DF 905 [ vect__302.545 ])
>>>> (neg:V2DF (fma:V2DF (reg:V2DF 4283 [3119])
>>>> (subreg:V2DF (reg:OO 2561 [ MEM  
>>>> [(real(kind=8) *)_4050] ]) 0)
>>>> (neg:V2DF (subreg:V2DF (reg:OO 2572 [ vect__300.543_236 ]) 
>>>> 0) {*vsx_nfmsv2df4}
>>>>  (expr_list:REG_DEAD (reg:OO 2572 [ vect__300.543_236 ])
>>>> (expr_list:REG_DEAD (reg:OO 2561 [ MEM  
>>>> [(real(kind=8) *)_4050] ])
>>>> (nil
>>>>
>>>> insn 2472 is replaced with 9299 after reload.
>>>
>>> You'd have to check the dumps to be sure, but I think 9299 is instead
>>> generated as an input reload of 2412, rather than being a replacement
>>> of insn 2472.  T
>>
>> Yes it is generated for 2412. The predecessor of 2412 is load from
>> plus offset as in 2412 we have subreg:V2DF (reg OO 2572) 16).
>>
>> This is not correct as we are not generating lxvp and it is 
>> normal load lxv.
>> As normal load is generated in predecessor insn of 2412 with
>> plus constant offset it breaks the correctness.
> 
> Not using lxvp is a deliberate choice though.
> 
> If a (reg:OO R) has been spilled, there's no requirement for LRA
> to load both halves of R when only one half is needed.  LRA just
> loads what it needs into whichever registers happen to be free.
> 
> If the reload of R instead used lxvp, LRA would be forced to free
> up another register for the other half of R, even though that value
> would never be used.
> 

If a (reg:OO R) 16 is loaded when it is spilled, then the loaded value
will be from plus offset 16, whereas it should be loaded
from zero offset. Since in the load fusion pass we are replacing
(reg:V2DI R) with subreg (reg:OO R) 16, the loaded value
is from the plus 16 offset, and that's why it's breaking the correctness.

Similarly, we are replacing (reg:V2DI R) 16 with subreg (reg:OO R) 0,
and the loaded value is then from offset 16 instead of loading from zero
offset; that's why we are breaking the correctness.

These are the semantics of the replacement done in the load fusion pass in order to generate lxvp. 
>>> That is, LRA needs to reload (subreg:V2DF (reg:OO 2572) 16)
>>> from memory for insn 2412.  It can use the destination of insn 2412 (r51)
>>> as a temporary to do that.  It doesn't need to load the other half of
>>> reg:OO 2572 for this instruction.  That in itself looks ok.
>>>
>>> So it looks like the problem is specific to insn 2473.  Perhaps LRA
>>> thinks that r47 already contains the low half of (reg:OO 2572),
>>> left behind by some p

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-11 Thread Ajit Agarwal
Hello Richard:

On 10/06/24 3:58 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 10/06/24 3:20 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> On 10/06/24 2:52 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>> On 10/06/24 2:12 pm, Richard Sandiford wrote:
>>>>>>> Ajit Agarwal  writes:
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +rtx set = single_set (insn);
>>>>>>>>>>>>>>>> +if (set == NULL_RTX)
>>>>>>>>>>>>>>>> +  return false;
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +rtx op0 = SET_SRC (set);
>>>>>>>>>>>>>>>> +rtx_code code = GET_CODE (op0);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +// This check is added as register pairs are not 
>>>>>>>>>>>>>>>> generated
>>>>>>>>>>>>>>>> +// by RA for neg:V2DF (fma: V2DF (reg1)
>>>>>>>>>>>>>>>> +//  (reg2)
>>>>>>>>>>>>>>>> +//  (neg:V2DF (reg3)))
>>>>>>>>>>>>>>>> +if (GET_RTX_CLASS (code) == RTX_UNARY)
>>>>>>>>>>>>>>>> +  return false;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> What's special about (neg (fma ...))?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I am not sure why register allocator fails allocating register 
>>>>>>>>>>>>>> pairs with
>>>>>>>>>>>>>> NEG Unary operation with fma operand. I have not debugged 
>>>>>>>>>>>>>> register allocator why the NEG
>>>>>>>>>>>>>> Unary operation with fma operand. 
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> For neg (fma ...) cases because of subreg 128 bits from OOmode 256 
>>>>>>>>>>>> bits are
>>>>>>>>>>>> set correctly. 
>>>>>>>>>>>> IRA marked them spill candidates as spill priority is zero.
>>>>>>>>>>>>
>>>>>>>>>>>> Due to this LRA reload pass couldn't allocate register pairs.
>>>>>>>>>>>
>>>>>>>>>>> I think this is just restating the symptom though.  I suppose the 
>>>>>>>>>>> same
>>>>>>>>>>> kind of questions apply here too: what was the instruction before 
>>>>>>>>>>> the
>>>>>>>>>>> pass runs, what was the instruction after the pass runs, and why is
>>>>>>>>>>> the rtl change incorrect (by the meaning above)?
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Original case where we dont do load fusion, spill happens, in that
>>>>>>>>>> case we dont require sequential register pairs to be generated for 2 
>>>>>>>>>> loads
>>>>>>>>>> for. Hence it worked.
>>>>>>>>>>
>>>>>>>>>> rtl change is correct and there is no error.
>>>>>>>>>>
>>>>>>>>>> for load fusion spill happens and we dont generate sequential 
>>>>>>>>>> register pairs
>>>>>>>>>> because pf spill candidate and lxvp gives incorrect results as 
>>>>>>>>>> sequential register
>>>>>>>>>> pairs are required for lxvp.
>>>>>>>>>
>>>>>>>>> Can you go into more detail?  How is the lxvp represented?  And how do
>>>>>>>>> we end up not getting a sequential register pair? 

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-10 Thread Ajit Agarwal
Hello Richard:

On 10/06/24 3:20 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> On 10/06/24 2:52 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> On 10/06/24 2:12 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +  rtx set = single_set (insn);
>>>>>>>>>>>>>> +  if (set == NULL_RTX)
>>>>>>>>>>>>>> +return false;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +  rtx op0 = SET_SRC (set);
>>>>>>>>>>>>>> +  rtx_code code = GET_CODE (op0);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +  // This check is added as register pairs are not 
>>>>>>>>>>>>>> generated
>>>>>>>>>>>>>> +  // by RA for neg:V2DF (fma: V2DF (reg1)
>>>>>>>>>>>>>> +  //  (reg2)
>>>>>>>>>>>>>> +  //  (neg:V2DF (reg3)))
>>>>>>>>>>>>>> +  if (GET_RTX_CLASS (code) == RTX_UNARY)
>>>>>>>>>>>>>> +return false;
>>>>>>>>>>>>>
>>>>>>>>>>>>> What's special about (neg (fma ...))?
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> I am not sure why register allocator fails allocating register 
>>>>>>>>>>>> pairs with
>>>>>>>>>>>> NEG Unary operation with fma operand. I have not debugged register 
>>>>>>>>>>>> allocator why the NEG
>>>>>>>>>>>> Unary operation with fma operand. 
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> For neg (fma ...) cases because of subreg 128 bits from OOmode 256 
>>>>>>>>>> bits are
>>>>>>>>>> set correctly. 
>>>>>>>>>> IRA marked them spill candidates as spill priority is zero.
>>>>>>>>>>
>>>>>>>>>> Due to this LRA reload pass couldn't allocate register pairs.
>>>>>>>>>
>>>>>>>>> I think this is just restating the symptom though.  I suppose the same
>>>>>>>>> kind of questions apply here too: what was the instruction before the
>>>>>>>>> pass runs, what was the instruction after the pass runs, and why is
>>>>>>>>> the rtl change incorrect (by the meaning above)?
>>>>>>>>>
>>>>>>>>
>>>>>>>> Original case where we dont do load fusion, spill happens, in that
>>>>>>>> case we dont require sequential register pairs to be generated for 2 
>>>>>>>> loads
>>>>>>>> for. Hence it worked.
>>>>>>>>
>>>>>>>> rtl change is correct and there is no error.
>>>>>>>>
>>>>>>>> for load fusion spill happens and we dont generate sequential register 
>>>>>>>> pairs
>>>>>>>> because pf spill candidate and lxvp gives incorrect results as 
>>>>>>>> sequential register
>>>>>>>> pairs are required for lxvp.
>>>>>>>
>>>>>>> Can you go into more detail?  How is the lxvp represented?  And how do
>>>>>>> we end up not getting a sequential register pair?  What does the rtl
>>>>>>> look like (before and after things have gone wrong)?
>>>>>>>
>>>>>>> It seems like either the rtl is not describing the result of the fusion
>>>>>>> correctly or there is some problem in the .md description of lxvp.
>>>>>>>
>>>>>>
>>>>>> After fusion pass:
>>>>>>
>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>> [240])
>>>&g

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-10 Thread Ajit Agarwal
Hello Richard:

On 10/06/24 3:20 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> On 10/06/24 2:52 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> On 10/06/24 2:12 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +  rtx set = single_set (insn);
>>>>>>>>>>>>>> +  if (set == NULL_RTX)
>>>>>>>>>>>>>> +return false;
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +  rtx op0 = SET_SRC (set);
>>>>>>>>>>>>>> +  rtx_code code = GET_CODE (op0);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +  // This check is added as register pairs are not 
>>>>>>>>>>>>>> generated
>>>>>>>>>>>>>> +  // by RA for neg:V2DF (fma: V2DF (reg1)
>>>>>>>>>>>>>> +  //  (reg2)
>>>>>>>>>>>>>> +  //  (neg:V2DF (reg3)))
>>>>>>>>>>>>>> +  if (GET_RTX_CLASS (code) == RTX_UNARY)
>>>>>>>>>>>>>> +return false;
>>>>>>>>>>>>>
>>>>>>>>>>>>> What's special about (neg (fma ...))?
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> I am not sure why register allocator fails allocating register 
>>>>>>>>>>>> pairs with
>>>>>>>>>>>> NEG Unary operation with fma operand. I have not debugged register 
>>>>>>>>>>>> allocator why the NEG
>>>>>>>>>>>> Unary operation with fma operand. 
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> For neg (fma ...) cases because of subreg 128 bits from OOmode 256 
>>>>>>>>>> bits are
>>>>>>>>>> set correctly. 
>>>>>>>>>> IRA marked them spill candidates as spill priority is zero.
>>>>>>>>>>
>>>>>>>>>> Due to this LRA reload pass couldn't allocate register pairs.
>>>>>>>>>
>>>>>>>>> I think this is just restating the symptom though.  I suppose the same
>>>>>>>>> kind of questions apply here too: what was the instruction before the
>>>>>>>>> pass runs, what was the instruction after the pass runs, and why is
>>>>>>>>> the rtl change incorrect (by the meaning above)?
>>>>>>>>>
>>>>>>>>
>>>>>>>> Original case where we dont do load fusion, spill happens, in that
>>>>>>>> case we dont require sequential register pairs to be generated for 2 
>>>>>>>> loads
>>>>>>>> for. Hence it worked.
>>>>>>>>
>>>>>>>> rtl change is correct and there is no error.
>>>>>>>>
>>>>>>>> for load fusion spill happens and we dont generate sequential register 
>>>>>>>> pairs
>>>>>>>> because pf spill candidate and lxvp gives incorrect results as 
>>>>>>>> sequential register
>>>>>>>> pairs are required for lxvp.
>>>>>>>
>>>>>>> Can you go into more detail?  How is the lxvp represented?  And how do
>>>>>>> we end up not getting a sequential register pair?  What does the rtl
>>>>>>> look like (before and after things have gone wrong)?
>>>>>>>
>>>>>>> It seems like either the rtl is not describing the result of the fusion
>>>>>>> correctly or there is some problem in the .md description of lxvp.
>>>>>>>
>>>>>>
>>>>>> After fusion pass:
>>>>>>
>>>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>>>> [240])
>>>&g

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-10 Thread Ajit Agarwal
Hello Richard:

On 10/06/24 2:52 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 10/06/24 2:12 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>>>>>>>>>> +
>>>>>>>>>>>> +rtx set = single_set (insn);
>>>>>>>>>>>> +if (set == NULL_RTX)
>>>>>>>>>>>> +  return false;
>>>>>>>>>>>> +
>>>>>>>>>>>> +rtx op0 = SET_SRC (set);
>>>>>>>>>>>> +rtx_code code = GET_CODE (op0);
>>>>>>>>>>>> +
>>>>>>>>>>>> +// This check is added as register pairs are not generated
>>>>>>>>>>>> +// by RA for neg:V2DF (fma: V2DF (reg1)
>>>>>>>>>>>> +//  (reg2)
>>>>>>>>>>>> +//  (neg:V2DF (reg3)))
>>>>>>>>>>>> +if (GET_RTX_CLASS (code) == RTX_UNARY)
>>>>>>>>>>>> +  return false;
>>>>>>>>>>>
>>>>>>>>>>> What's special about (neg (fma ...))?
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I am not sure why register allocator fails allocating register pairs 
>>>>>>>>>> with
>>>>>>>>>> NEG Unary operation with fma operand. I have not debugged register 
>>>>>>>>>> allocator why the NEG
>>>>>>>>>> Unary operation with fma operand. 
>>>>>>>>>
>>>>>>>>
>>>>>>>> For neg (fma ...) cases because of subreg 128 bits from OOmode 256 
>>>>>>>> bits are
>>>>>>>> set correctly. 
>>>>>>>> IRA marked them spill candidates as spill priority is zero.
>>>>>>>>
>>>>>>>> Due to this LRA reload pass couldn't allocate register pairs.
>>>>>>>
>>>>>>> I think this is just restating the symptom though.  I suppose the same
>>>>>>> kind of questions apply here too: what was the instruction before the
>>>>>>> pass runs, what was the instruction after the pass runs, and why is
>>>>>>> the rtl change incorrect (by the meaning above)?
>>>>>>>
>>>>>>
>>>>>> Original case where we dont do load fusion, spill happens, in that
>>>>>> case we dont require sequential register pairs to be generated for 2 
>>>>>> loads
>>>>>> for. Hence it worked.
>>>>>>
>>>>>> rtl change is correct and there is no error.
>>>>>>
>>>>>> for load fusion spill happens and we dont generate sequential register 
>>>>>> pairs
>>>>>> because pf spill candidate and lxvp gives incorrect results as 
>>>>>> sequential register
>>>>>> pairs are required for lxvp.
>>>>>
>>>>> Can you go into more detail?  How is the lxvp represented?  And how do
>>>>> we end up not getting a sequential register pair?  What does the rtl
>>>>> look like (before and after things have gone wrong)?
>>>>>
>>>>> It seems like either the rtl is not describing the result of the fusion
>>>>> correctly or there is some problem in the .md description of lxvp.
>>>>>
>>>>
>>>> After fusion pass:
>>>>
>>>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>> [240])
>>>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] [1285])
>>>> (const_int 16 [0x10])) [1 MEM  
>>>> [(real(kind=8) *)_4188]+16 S16 A64])) "shell_lam.fppized.f":238:72 1190 
>>>> {vsx_movv2df_64bit}
>>>>  (nil))
>>>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>> [240])
>>>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM  
>>>> [(real(kind=8) *)_4050]+16 ])
>>>> (reg:V2DF 44 12 [3119])
>>>> (neg:V2DF (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>>>> [240]) {*vsx_nfmsv2df4}
>>>>  (nil))
>>>>
>

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-10 Thread Ajit Agarwal
Hello Richard:

On 10/06/24 2:12 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>>>>>>>>>> +
>>>>>>>>>> +  rtx set = single_set (insn);
>>>>>>>>>> +  if (set == NULL_RTX)
>>>>>>>>>> +return false;
>>>>>>>>>> +
>>>>>>>>>> +  rtx op0 = SET_SRC (set);
>>>>>>>>>> +  rtx_code code = GET_CODE (op0);
>>>>>>>>>> +
>>>>>>>>>> +  // This check is added as register pairs are not generated
>>>>>>>>>> +  // by RA for neg:V2DF (fma: V2DF (reg1)
>>>>>>>>>> +  //  (reg2)
>>>>>>>>>> +  //  (neg:V2DF (reg3)))
>>>>>>>>>> +  if (GET_RTX_CLASS (code) == RTX_UNARY)
>>>>>>>>>> +return false;
>>>>>>>>>
>>>>>>>>> What's special about (neg (fma ...))?
>>>>>>>>>
>>>>>>>>
>>>>>>>> I am not sure why register allocator fails allocating register pairs 
>>>>>>>> with
>>>>>>>> NEG Unary operation with fma operand. I have not debugged register 
>>>>>>>> allocator why the NEG
>>>>>>>> Unary operation with fma operand. 
>>>>>>>
>>>>>>
>>>>>> For neg (fma ...) cases because of subreg 128 bits from OOmode 256 bits 
>>>>>> are
>>>>>> set correctly. 
>>>>>> IRA marked them spill candidates as spill priority is zero.
>>>>>>
>>>>>> Due to this LRA reload pass couldn't allocate register pairs.
>>>>>
>>>>> I think this is just restating the symptom though.  I suppose the same
>>>>> kind of questions apply here too: what was the instruction before the
>>>>> pass runs, what was the instruction after the pass runs, and why is
>>>>> the rtl change incorrect (by the meaning above)?
>>>>>
>>>>
>>>> Original case where we dont do load fusion, spill happens, in that
>>>> case we dont require sequential register pairs to be generated for 2 loads
>>>> for. Hence it worked.
>>>>
>>>> rtl change is correct and there is no error.
>>>>
>>>> for load fusion spill happens and we dont generate sequential register 
>>>> pairs
>>>> because pf spill candidate and lxvp gives incorrect results as sequential 
>>>> register
>>>> pairs are required for lxvp.
>>>
>>> Can you go into more detail?  How is the lxvp represented?  And how do
>>> we end up not getting a sequential register pair?  What does the rtl
>>> look like (before and after things have gone wrong)?
>>>
>>> It seems like either the rtl is not describing the result of the fusion
>>> correctly or there is some problem in the .md description of lxvp.
>>>
>>
>> After fusion pass:
>>
>> (insn 9299 2472 2412 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>> [240])
>> (mem:V2DF (plus:DI (reg:DI 8 8 [orig:1285 ivtmp.886 ] [1285])
>> (const_int 16 [0x10])) [1 MEM  
>> [(real(kind=8) *)_4188]+16 S16 A64])) "shell_lam.fppized.f":238:72 1190 
>> {vsx_movv2df_64bit}
>>  (nil))
>> (insn 2412 9299 2477 187 (set (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>> [240])
>> (neg:V2DF (fma:V2DF (reg:V2DF 39 7 [ MEM  
>> [(real(kind=8) *)_4050]+16 ])
>> (reg:V2DF 44 12 [3119])
>> (neg:V2DF (reg:V2DF 51 19 [orig:240 vect__302.545 ] 
>> [240]) {*vsx_nfmsv2df4}
>>  (nil))
>>
>> In LRA reload.
>>
>> (insn 2472 2461 2412 161 (set (reg:OO 2572 [ vect__300.543_236 ])
>> (mem:OO (reg:DI 4260 [orig:1285 ivtmp.886 ] [1285]) [1 MEM 
>>  [(real(kind=8) *)_4188]+0 S16 A64])) 
>> "shell_lam.fppized.f":238:72 2187 {*movoo}
>>  (expr_list:REG_EQUIV (mem:OO (reg:DI 4260 [orig:1285 ivtmp.886 ] 
>> [1285]) [1 MEM  [(real(kind=8) *)_4188]+0 S16 A64])
>> (nil)))
>> (insn 2412 2472 2477 161 (set (reg:V2DF 240 [ vect__302.545 ])
>> (neg:V2DF (fma:V2DF (subreg:V2DF (reg:OO 2561 [ MEM > real(kind=8)> [(real(kind=8) *)_4050] ]) 16)
>> (reg:V2DF 4283 [3119])
>

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-07 Thread Ajit Agarwal
Hello Richard:

On 07/06/24 1:52 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>>>>>>>> +
>>>>>>>> +  df_ref use;
>>>>>>>> +  df_insn_info *insn_info = DF_INSN_INFO_GET (info->rtl ());
>>>>>>>> +  FOR_EACH_INSN_INFO_DEF (use, insn_info)
>>>>>>>> +{
>>>>>>>> +  struct df_link *def_link = DF_REF_CHAIN (use);
>>>>>>>> +
>>>>>>>> +  if (!def_link || !def_link->ref
>>>>>>>> +|| DF_REF_IS_ARTIFICIAL (def_link->ref))
>>>>>>>> +  continue;
>>>>>>>> +
>>>>>>>> +  while (def_link && def_link->ref)
>>>>>>>> +  {
>>>>>>>> +rtx_insn *insn = DF_REF_INSN (def_link->ref);
>>>>>>>> +if (GET_CODE (PATTERN (insn)) == PARALLEL)
>>>>>>>> +  return false;
>>>>>>>
>>>>>>> Why do you need to skip PARALLELs?
>>>>>>>
>>>>>>
>>>>>> vec_select with parallel give failures final.cc "can't split-up with 
>>>>>> subreg 128 (reg OO"
>>>>>> Thats why I have added this.
>>>>>
>>>>> But in (vec_select ... (parallel ...)), the parallel won't be the 
>>>>> PATTERN (insn).  It'll instead be a suboperand of the vec_select.
>>>>>
>>>>> Here too it's important to understand why the final.cc failure occurs
>>>>> and what the correct fix is.
>>>>>
>>>>
>>>> subreg with vec_select operand already exists before fusion pass.
>>>> We overwrite them with subreg 128 bits from 256 OO mode operand.
>>>
>>> But why is that wrong?  What was the full rtl of the subreg before the
>>> pass runs, what did the subreg look like after the pass, and why is the
>>> change not correct?
>>>
>>> In general, there are two main ways that an rtl change can be incorrect:
>>>
>>> (1) The new rtl isn't well-formed (such as (subreg (subreg X A) B)).
>>> In this case, the new rtl makes no inherent sense when viewed
>>> in isolation: it isn't necessary to see the old rtl to tell that
>>> the new rtl is wrong.
>>>
>>> (2) The new rtl is well-formed (i.e. makes inherent sense when viewed in
>>> isolation) but it does not have the same semantics as the old rtl.
>>> In other words, the new rtl is describing a different operation
>>> from the old rtl.
>>>
>>> I think we need to talk about it in those terms, rather than where
>>> the eventual ICE occurs.
>>>
>> Before the fusion.
>> old rtl looks like this:
>>
>> (vec_select:HI (subreg:V8HI (reg:V16QI 125 [ vect__29.38 ]) 0)
>>
>> After the fusion
>> new rtl looks like this:
>>
>> (vec_select:HI (subreg:V16QI (reg:OO 125 [ vect__29.38 ]) 16)
>>
>> new rtl is not well formed.
>>
>> Thats why its failing.
>>
>> reg:v16QI 125 is the destination of the load that needs to be fused.
> 
> This indicates that there's a bug in the substitution code.
> 
> It's probably better to create a fresh OO register, rather than
> change an existing 128-bit register to 256 bits.  If we do that,
> and if reg:V16QI 125 is the destination of the second load
> (which I assume it is from the 16 offset in the subreg),
> then the new RTL should be:
> 
>   (vec_select:HI (subreg:V8HI (reg:OO NEW_REG) 16) ...)
> 
> It's possible to get this by using insn_propagation to replace
> (reg:V16QI 125) with (subreg:V16QI (reg:OO NEW_REG) 16).
> insn_propagation should then take care of the rest.
> 
> There are no existing rtl-ssa routines for handling new registers
> though.  (The idea was to add things as the need arose.)
> 

Sure I will do that. Thanks.

>>>> Due to this in final.cc we couldnt splt at line 2807 and bails
>>>> out fatal_insn.
>>>>
>>>> Currently we dont support already existing subreg vector operand
>>>> to generate register pairs.
>>>> We should bail out from fusion pass in this case.
>>>>>>>> +
>>>>>>>> +rtx set = single_set (insn);
>>>>>>>> +if (set == NULL_RTX)
>>>>>>>> +  return false;
>>>>>>>> +
>>>>>>>> +   

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-07 Thread Ajit Agarwal
Hello Richard:

On 07/06/24 4:24 am, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 06/06/24 8:03 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> On 06/06/24 2:28 pm, Richard Sandiford wrote:
>>>>> Hi,
>>>>>
>>>>> Just some comments on the fuseable_load_p part, since that's what
>>>>> we were discussing last time.
>>>>>
>>>>> It looks like this now relies on:
>>>>>
>>>>> Ajit Agarwal  writes:
>>>>>> +  /* We use DF data flow because we change location rtx
>>>>>> + which is easier to find and modify.
>>>>>> + We use mix of rtl-ssa def-use and DF data flow
>>>>>> + where it is easier.  */
>>>>>> +  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
>>>>>> +  df_analyze ();
>>>>>> +  df_set_flags (DF_DEFER_INSN_RESCAN);
>>>>>
>>>>> But please don't do this!  For one thing, building DU/UD chains
>>>>> as well as rtl-ssa is really expensive in terms of compile time.
>>>>> But more importantly, modifications need to happen via rtl-ssa
>>>>> to ensure that the IL is kept up-to-date.  If we don't do that,
>>>>> later fuse attempts will be based on stale data and so could
>>>>> generate incorrect code.
>>>>>
>>>>
>>>> Sure I have made changes to use only rtl-ssa and not to use
>>>> UD/DU chains. I will send the changes in separate subsequent
>>>> patch.
>>>
>>> Thanks.  Before you send the patch though:
>>>
>>>>>> +// Check whether load can be fusable or not.
>>>>>> +// Return true if fuseable otherwise false.
>>>>>> +bool
>>>>>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>>>>>> +{
>>>>>> +  for (auto def : info->defs())
>>>>>> +{
>>>>>> +  auto set = dyn_cast (def);
>>>>>> +  for (auto use1 : set->nondebug_insn_uses ())
>>>>>> +use1->set_is_live_out_use (true);
>>>>>> +}
>>>>>
>>>>> What was the reason for adding this loop?
>>>>>
>>>>
>>>> The purpose of adding is to avoid assert failure in 
>>>> gcc/rtl-ssa/changes.cc:252
>>>
>>> That assert is making sure that we don't delete a definition of a
>>> register (or memory) while a real insn still uses it.  If the assert
>>> is firing then something has gone wrong.
>>>
>>> Live-out uses are a particular kind of use that occur at the end of
>>> basic blocks.  It's incorrect to mark normal insn uses as live-out.
>>>
>>> When an assert fails, it's important to understand why the failure
>>> occurs, rather than brute-force the assert condition to true.
>>>
>>
>> The above assert failure occurs when there is a debug insn and its
>> use is not live-out.
> 
> Uses in debug insns are never live-out uses.
> 
> It sounds like the bug is that we're failing to update all debug uses of
> the original register.  We need to do that, or "reset" the debug insn if
> substitution fails for some reason.
> 
> See fixup_debug_uses for what the target-independent part of the pass
> does for debug insns that are affected by movement.  Hopefully the
> update needed here will be simpler than that.
> 

Sure. Thanks.

>>>>>> [...]
>>>>>> +
>>>>>> +  rtx addr = XEXP (SET_SRC (body), 0);
>>>>>> +
>>>>>> +  if (GET_CODE (addr) == PLUS
>>>>>> +  && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
>>>>>> +{
>>>>>> +  if (INTVAL (XEXP (addr, 1)) == -16)
>>>>>> +return false;
>>>>>> +  }
>>>>>
>>>>> What's special about -16?
>>>>>
>>>>
>>>> The tests like libgomp/for-8 fails with fused load with offset -16 and 0.
>>>> Thats why I have added this check.
>>>
>>> But why does it fail though?  It sounds like the testcase is pointing
>>> out a problem in the pass (or perhaps elsewhere).  It's important that
>>> we try to understand and fix the underlying problem.
>>>
>>
>> This check is not required anymore and will remove from subsequent patches.
> 
> OK, gr

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-06 Thread Ajit Agarwal
Hello Richard:

On 06/06/24 8:03 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 06/06/24 2:28 pm, Richard Sandiford wrote:
>>> Hi,
>>>
>>> Just some comments on the fuseable_load_p part, since that's what
>>> we were discussing last time.
>>>
>>> It looks like this now relies on:
>>>
>>> Ajit Agarwal  writes:
>>>> +  /* We use DF data flow because we change location rtx
>>>> +   which is easier to find and modify.
>>>> +   We use mix of rtl-ssa def-use and DF data flow
>>>> +   where it is easier.  */
>>>> +  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
>>>> +  df_analyze ();
>>>> +  df_set_flags (DF_DEFER_INSN_RESCAN);
>>>
>>> But please don't do this!  For one thing, building DU/UD chains
>>> as well as rtl-ssa is really expensive in terms of compile time.
>>> But more importantly, modifications need to happen via rtl-ssa
>>> to ensure that the IL is kept up-to-date.  If we don't do that,
>>> later fuse attempts will be based on stale data and so could
>>> generate incorrect code.
>>>
>>
>> Sure I have made changes to use only rtl-ssa and not to use
>> UD/DU chains. I will send the changes in separate subsequent
>> patch.
> 
> Thanks.  Before you send the patch though:
> 
>>>> +// Check whether load can be fusable or not.
>>>> +// Return true if fuseable otherwise false.
>>>> +bool
>>>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>>>> +{
>>>> +  for (auto def : info->defs())
>>>> +{
>>>> +  auto set = dyn_cast (def);
>>>> +  for (auto use1 : set->nondebug_insn_uses ())
>>>> +  use1->set_is_live_out_use (true);
>>>> +}
>>>
>>> What was the reason for adding this loop?
>>>
>>
>> The purpose of adding is to avoid assert failure in 
>> gcc/rtl-ssa/changes.cc:252
> 
> That assert is making sure that we don't delete a definition of a
> register (or memory) while a real insn still uses it.  If the assert
> is firing then something has gone wrong.
> 
> Live-out uses are a particular kind of use that occur at the end of
> basic blocks.  It's incorrect to mark normal insn uses as live-out.
> 
> When an assert fails, it's important to understand why the failure
> occurs, rather than brute-force the assert condition to true.
> 

The above assert failure occurs when there is a debug insn and its
use is not live-out.

>>>> [...]
>>>> +
>>>> +  rtx addr = XEXP (SET_SRC (body), 0);
>>>> +
>>>> +  if (GET_CODE (addr) == PLUS
>>>> +  && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
>>>> +{
>>>> +  if (INTVAL (XEXP (addr, 1)) == -16)
>>>> +  return false;
>>>> +  }
>>>
>>> What's special about -16?
>>>
>>
Tests like libgomp/for-8 fail with fused loads at offsets -16 and 0.
That's why I have added this check.
> 
> But why does it fail though?  It sounds like the testcase is pointing
> out a problem in the pass (or perhaps elsewhere).  It's important that
> we try to understand and fix the underlying problem.
> 

This check is not required anymore and will be removed in subsequent patches.
>>>> +
>>>> +  df_ref use;
>>>> +  df_insn_info *insn_info = DF_INSN_INFO_GET (info->rtl ());
>>>> +  FOR_EACH_INSN_INFO_DEF (use, insn_info)
>>>> +{
>>>> +  struct df_link *def_link = DF_REF_CHAIN (use);
>>>> +
>>>> +  if (!def_link || !def_link->ref
>>>> +|| DF_REF_IS_ARTIFICIAL (def_link->ref))
>>>> +  continue;
>>>> +
>>>> +  while (def_link && def_link->ref)
>>>> +  {
>>>> +rtx_insn *insn = DF_REF_INSN (def_link->ref);
>>>> +if (GET_CODE (PATTERN (insn)) == PARALLEL)
>>>> +  return false;
>>>
>>> Why do you need to skip PARALLELs?
>>>
>>
>> vec_select with parallel give failures final.cc "can't split-up with subreg 
>> 128 (reg OO"
>> Thats why I have added this.
> 
> But in (vec_select ... (parallel ...)), the parallel won't be the 
> PATTERN (insn).  It'll instead be a suboperand of the vec_select.
> 
> Here too it's important to understand why the final.cc failure occurs
> and what the correct fix is.
> 

subreg with vec_select operand already exists

Re: [patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-06 Thread Ajit Agarwal
Hello Richard:

On 06/06/24 2:28 pm, Richard Sandiford wrote:
> Hi,
> 
> Just some comments on the fuseable_load_p part, since that's what
> we were discussing last time.
> 
> It looks like this now relies on:
> 
> Ajit Agarwal  writes:
>> +  /* We use DF data flow because we change location rtx
>> + which is easier to find and modify.
>> + We use mix of rtl-ssa def-use and DF data flow
>> + where it is easier.  */
>> +  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
>> +  df_analyze ();
>> +  df_set_flags (DF_DEFER_INSN_RESCAN);
> 
> But please don't do this!  For one thing, building DU/UD chains
> as well as rtl-ssa is really expensive in terms of compile time.
> But more importantly, modifications need to happen via rtl-ssa
> to ensure that the IL is kept up-to-date.  If we don't do that,
> later fuse attempts will be based on stale data and so could
> generate incorrect code.
> 

Sure, I have made changes to use only rtl-ssa and not to use
UD/DU chains. I will send the changes in a separate subsequent
patch.

>> +// Check whether load can be fusable or not.
>> +// Return true if fuseable otherwise false.
>> +bool
>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>> +{
>> +  for (auto def : info->defs())
>> +{
>> +  auto set = dyn_cast (def);
>> +  for (auto use1 : set->nondebug_insn_uses ())
>> +use1->set_is_live_out_use (true);
>> +}
> 
> What was the reason for adding this loop?
>

The purpose of adding it is to avoid an assert failure in gcc/rtl-ssa/changes.cc:252.

 
>> +
>> +  rtx_insn *rtl_insn = info ->rtl ();
>> +  rtx body = PATTERN (rtl_insn);
>> +  rtx dest_exp = SET_DEST (body);
>> +
>> +  if (REG_P (dest_exp) &&
>> +  (DF_REG_DEF_COUNT (REGNO (dest_exp)) > 1
> 
> The rtl-ssa way of checking this is:
> 
>   crtl->ssa->is_single_dominating_def (...)
> 
>> +   || DF_REG_EQ_USE_COUNT (REGNO (dest_exp)) > 0))
>> +return  false;
> 
> Why are uses in notes a problem?  In the worst case, we should just be
> able to remove the note instead.
>

We can remove this; it's no longer required. I will make this
change in subsequent patches.
 
>> +
>> +  rtx addr = XEXP (SET_SRC (body), 0);
>> +
>> +  if (GET_CODE (addr) == PLUS
>> +  && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
>> +{
>> +  if (INTVAL (XEXP (addr, 1)) == -16)
>> +return false;
>> +  }
> 
> What's special about -16?
> 

Tests like libgomp/for-8 fail with fused loads at offsets -16 and 0.
That's why I have added this check.


>> +
>> +  df_ref use;
>> +  df_insn_info *insn_info = DF_INSN_INFO_GET (info->rtl ());
>> +  FOR_EACH_INSN_INFO_DEF (use, insn_info)
>> +{
>> +  struct df_link *def_link = DF_REF_CHAIN (use);
>> +
>> +  if (!def_link || !def_link->ref
>> +  || DF_REF_IS_ARTIFICIAL (def_link->ref))
>> +continue;
>> +
>> +  while (def_link && def_link->ref)
>> +{
>> +  rtx_insn *insn = DF_REF_INSN (def_link->ref);
>> +  if (GET_CODE (PATTERN (insn)) == PARALLEL)
>> +return false;
> 
> Why do you need to skip PARALLELs?
>

vec_select with a parallel operand gives failures in final.cc: "can't split-up with subreg 128
(reg OO".
That's why I have added this.

 
>> +
>> +  rtx set = single_set (insn);
>> +  if (set == NULL_RTX)
>> +return false;
>> +
>> +  rtx op0 = SET_SRC (set);
>> +  rtx_code code = GET_CODE (op0);
>> +
>> +  // This check is added as register pairs are not generated
>> +  // by RA for neg:V2DF (fma: V2DF (reg1)
>> +  //  (reg2)
>> +  //  (neg:V2DF (reg3)))
>> +  if (GET_RTX_CLASS (code) == RTX_UNARY)
>> +return false;
> 
> What's special about (neg (fma ...))?
>

I am not sure why the register allocator fails to allocate register pairs for a
NEG unary operation with an fma operand. I have not debugged the register
allocator to find out why the NEG unary operation with an fma operand causes
this failure.
 
>> +
>> +  def_link = def_link->next;
>> +}
>> + }
>> +  return true;
>> +}
> 
> Thanks,
> Richard

Thanks & Regards
Ajit


[patch, rs6000, middle-end 0/1] v1: Add implementation for different targets for pair mem fusion

2024-06-05 Thread Ajit Agarwal
Hello All:

All comments are addressed.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

Tested for powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implement virtual functions defined by generic code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-06-06  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use the additional virtual function
added for the rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/accesses.h: Moved set_is_live_out_use as public
from private.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/rs6000-mem-fusion.cc| 645 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  23 +-
 gcc/pair-fusion.h |  16 +
 gcc/rtl-ssa/accesses.h|   4 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 11 files changed, 731 insertions(+), 10 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index e500ba63e32..348308b2e93 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc 
b/gcc/config/rs6000/rs6000-mem-fusion.cc
new file mode 100644
index 000..bdc08062534
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-mem-fusion.cc
@@ -0,0 +1,645 @@
+/* Subroutines used to perform adjacent load/store into
+   paired memory accesses for TARGET_POWER10 and TARGET_VSX.
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "df.h"
+#include "rtl.h"
+#include 

Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-06-04 Thread Ajit Agarwal
Hello Richard:

On 03/06/24 9:28 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 03/06/24 8:24 pm, Richard Sandiford wrote:
>> Ajit Agarwal  writes:
>>> Hello Richard:
>>>
>>> On 03/06/24 7:47 pm, Richard Sandiford wrote:
>>>> Ajit Agarwal  writes:
>>>>> On 03/06/24 5:03 pm, Richard Sandiford wrote:
>>>>>> Ajit Agarwal  writes:
>>>>>>>> [...]
>>>>>>>> If it is intentional, what distinguishes things like vperm and 
>>>>>>>> xxinsertw
>>>>>>>> (and all other unspecs) from plain addition?
>>>>>>>>
>>>>>>>>   [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
>>>>>>>> (plus:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
>>>>>>>>(match_operand:VSX_F 2 "vsx_register_operand" 
>>>>>>>> "wa")))]
>>>>>>>>
>>>>>>>
>>>>>>> Plain addition are not supported currently.
>>>>>>> We have not seen many cases with plain addition and this patch
>>>>>>> will not accept plain addition.
>>>>>>>
>>>>>>>  
>>>>>>>> This is why the intention behind the patch is important.  As it stands,
>>>>>>>> it isn't clear what criteria the patch is using to distinguish "valid"
>>>>>>>> fuse candidates from "invalid" ones.
>>>>>>>>
>>>>>>>
>>>>>>> Intention behind this patch all variants of UNSPEC instructions are
>>>>>>> supported and uses without UNSPEC are not supported in this patch.
>>>>>>
>>>>>> But why make the distinction this way though?  UNSPEC is a very
>>>>>> GCC-specific concept.  Whether something is an UNSPEC or some other
>>>>>> RTL code depends largely on historical accident.  E.g. we have specific
>>>>>> codes for VEC_SELECT, VEC_MERGE, and VEC_DUPLICATE, but don't have one
>>>>>> for VEC_PERM (even though VEC_PERM_EXPR exists in gimple).
>>>>>>
>>>>>> It seems unlikely that GCC's choice about whether to represent something
>>>>>> as an UNSPEC or as another RTL code lines up neatly with the kind of
>>>>>> codegen decisions that a good assembly programmer would make.
>>>>>>
>>>>>> I suppose another way of asking is to turn this around and say: what
>>>>>> kind of uses are you trying to exclude?  Presumably things are worse
>>>>>> if you remove this function override.  But what makes them worse?
>>>>>> What kind of uses cause the regression?
>>>>>>
>>>>>
>>>>> Uses of fused load where load with low address uses are modified with 
>>>>> load with high address uses.
>>>>>
>>>>> Similarly load with high address uses are modified with load low address
>>>>> uses.
>>>>
>>>> It sounds like something is going wrong with the subreg updates.
>>>> Can you give an example of where this occurs?  For instance...
>>>>
>>>>> This is the semantics of lxvp instructions which can occur through
>>>>> UNSPEC uses otherwise it breaks the functionality and seen failure
>>>>> in almost all vect regressions and SPEC benchmarks.
>>>>
>>>> ...could you take one of the simpler vect regressions, show the before
>>>> and after RTL, and why the transformation is wrong?
>>>
>>> Before the change:
>>>
>>> (insn 32 30 103 5 (set (reg:V16QI 127 [ _32 ])
>>> (mem:V16QI (reg:DI 130 [ ivtmp.37 ]) [1 MEM >> unsigned int> [(short unsigned int *)_55]+0 S16 A128])) {vsx_movv16qi_64bit}
>>>  (nil))
>>> (insn 103 32 135 5 (set (reg:V16QI 173 [ _32 ])
>>> (mem:V16QI (plus:DI (reg:DI 130 [ ivtmp.37 ])
>>> (const_int 16 [0x10])) [1 MEM >> int> [(short unsigned int *)_55]+0 S16 A128])) {vsx_movv16qi_64bit}
>>>  (nil))
>>> (insn 135 103 34 5 (set (reg:DI 155)
>>> (plus:DI (reg:DI 130 [ ivtmp.37 ])
>>> (const_int 16 [0x10]))) 66 {*adddi3}
>>>  (nil))
>>> (insn 34 135 104 5 (set (reg:V16QI 143 [ _27 ])
>>> (unspec:V16QI [
>>&g

Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-06-03 Thread Ajit Agarwal
Hello Richard:

On 03/06/24 8:24 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>>
>> On 03/06/24 7:47 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> On 03/06/24 5:03 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>>> [...]
>>>>>>> If it is intentional, what distinguishes things like vperm and xxinsertw
>>>>>>> (and all other unspecs) from plain addition?
>>>>>>>
>>>>>>>   [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
>>>>>>> (plus:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
>>>>>>> (match_operand:VSX_F 2 "vsx_register_operand" 
>>>>>>> "wa")))]
>>>>>>>
>>>>>>
>>>>>> Plain addition are not supported currently.
>>>>>> We have not seen many cases with plain addition and this patch
>>>>>> will not accept plain addition.
>>>>>>
>>>>>>  
>>>>>>> This is why the intention behind the patch is important.  As it stands,
>>>>>>> it isn't clear what criteria the patch is using to distinguish "valid"
>>>>>>> fuse candidates from "invalid" ones.
>>>>>>>
>>>>>>
>>>>>> Intention behind this patch all variants of UNSPEC instructions are
>>>>>> supported and uses without UNSPEC are not supported in this patch.
>>>>>
>>>>> But why make the distinction this way though?  UNSPEC is a very
>>>>> GCC-specific concept.  Whether something is an UNSPEC or some other
>>>>> RTL code depends largely on historical accident.  E.g. we have specific
>>>>> codes for VEC_SELECT, VEC_MERGE, and VEC_DUPLICATE, but don't have one
>>>>> for VEC_PERM (even though VEC_PERM_EXPR exists in gimple).
>>>>>
>>>>> It seems unlikely that GCC's choice about whether to represent something
>>>>> as an UNSPEC or as another RTL code lines up neatly with the kind of
>>>>> codegen decisions that a good assembly programmer would make.
>>>>>
>>>>> I suppose another way of asking is to turn this around and say: what
>>>>> kind of uses are you trying to exclude?  Presumably things are worse
>>>>> if you remove this function override.  But what makes them worse?
>>>>> What kind of uses cause the regression?
>>>>>
>>>>
>>>> Uses of fused load where load with low address uses are modified with load 
>>>> with high address uses.
>>>>
>>>> Similarly load with high address uses are modified with load low address
>>>> uses.
>>>
>>> It sounds like something is going wrong with the subreg updates.
>>> Can you give an example of where this occurs?  For instance...
>>>
>>>> This is the semantics of lxvp instructions which can occur through
>>>> UNSPEC uses otherwise it breaks the functionality and seen failure
>>>> in almost all vect regressions and SPEC benchmarks.
>>>
>>> ...could you take one of the simpler vect regressions, show the before
>>> and after RTL, and why the transformation is wrong?
>>
>> Before the change:
>>
>> (insn 32 30 103 5 (set (reg:V16QI 127 [ _32 ])
>> (mem:V16QI (reg:DI 130 [ ivtmp.37 ]) [1 MEM > unsigned int> [(short unsigned int *)_55]+0 S16 A128])) {vsx_movv16qi_64bit}
>>  (nil))
>> (insn 103 32 135 5 (set (reg:V16QI 173 [ _32 ])
>> (mem:V16QI (plus:DI (reg:DI 130 [ ivtmp.37 ])
>> (const_int 16 [0x10])) [1 MEM  
>> [(short unsigned int *)_55]+0 S16 A128])) {vsx_movv16qi_64bit}
>>  (nil))
>> (insn 135 103 34 5 (set (reg:DI 155)
>> (plus:DI (reg:DI 130 [ ivtmp.37 ])
>> (const_int 16 [0x10]))) 66 {*adddi3}
>>  (nil))
>> (insn 34 135 104 5 (set (reg:V16QI 143 [ _27 ])
>> (unspec:V16QI [
>> (reg:V16QI 127 [ _32 ]) repeated x2
>> (reg:V16QI 152)
>> ] UNSPEC_VPERM))  {altivec_vperm_v16qi_direct}
>>  (expr_list:REG_DEAD (reg:V16QI 127 [ _32 ])
>> (nil)))
>> (insn 104 34 35 5 (set (reg:V16QI 174 [ _27 ])
>> (unspec:V16QI [
>> (reg:V16QI 173 [ _32 ]) repeated x2
>>   

Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-06-03 Thread Ajit Agarwal
Hello Richard:

On 03/06/24 7:47 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 03/06/24 5:03 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>>> [...]
>>>>> If it is intentional, what distinguishes things like vperm and xxinsertw
>>>>> (and all other unspecs) from plain addition?
>>>>>
>>>>>   [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
>>>>> (plus:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
>>>>>   (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
>>>>>
>>>>
>>>> Plain addition are not supported currently.
>>>> We have not seen many cases with plain addition and this patch
>>>> will not accept plain addition.
>>>>
>>>>  
>>>>> This is why the intention behind the patch is important.  As it stands,
>>>>> it isn't clear what criteria the patch is using to distinguish "valid"
>>>>> fuse candidates from "invalid" ones.
>>>>>
>>>>
>>>> Intention behind this patch all variants of UNSPEC instructions are
>>>> supported and uses without UNSPEC are not supported in this patch.
>>>
>>> But why make the distinction this way though?  UNSPEC is a very
>>> GCC-specific concept.  Whether something is an UNSPEC or some other
>>> RTL code depends largely on historical accident.  E.g. we have specific
>>> codes for VEC_SELECT, VEC_MERGE, and VEC_DUPLICATE, but don't have one
>>> for VEC_PERM (even though VEC_PERM_EXPR exists in gimple).
>>>
>>> It seems unlikely that GCC's choice about whether to represent something
>>> as an UNSPEC or as another RTL code lines up neatly with the kind of
>>> codegen decisions that a good assembly programmer would make.
>>>
>>> I suppose another way of asking is to turn this around and say: what
>>> kind of uses are you trying to exclude?  Presumably things are worse
>>> if you remove this function override.  But what makes them worse?
>>> What kind of uses cause the regression?
>>>
>>
>> Uses of fused load where load with low address uses are modified with load 
>> with high address uses.
>>
>> Similarly load with high address uses are modified with load low address
>> uses.
> 
> It sounds like something is going wrong with the subreg updates.
> Can you give an example of where this occurs?  For instance...
> 
>> This is the semantics of lxvp instructions which can occur through
>> UNSPEC uses otherwise it breaks the functionality and seen failure
>> in almost all vect regressions and SPEC benchmarks.
> 
> ...could you take one of the simpler vect regressions, show the before
> and after RTL, and why the transformation is wrong?
>

Before the change:

(insn 32 30 103 5 (set (reg:V16QI 127 [ _32 ])
(mem:V16QI (reg:DI 130 [ ivtmp.37 ]) [1 MEM  [(short unsigned int *)_55]+0 S16 A128])) {vsx_movv16qi_64bit}
 (nil))
(insn 103 32 135 5 (set (reg:V16QI 173 [ _32 ])
(mem:V16QI (plus:DI (reg:DI 130 [ ivtmp.37 ])
(const_int 16 [0x10])) [1 MEM  
[(short unsigned int *)_55]+0 S16 A128])) {vsx_movv16qi_64bit}
 (nil))
(insn 135 103 34 5 (set (reg:DI 155)
(plus:DI (reg:DI 130 [ ivtmp.37 ])
(const_int 16 [0x10]))) 66 {*adddi3}
 (nil))
(insn 34 135 104 5 (set (reg:V16QI 143 [ _27 ])
(unspec:V16QI [
(reg:V16QI 127 [ _32 ]) repeated x2
(reg:V16QI 152)
] UNSPEC_VPERM))  {altivec_vperm_v16qi_direct}
 (expr_list:REG_DEAD (reg:V16QI 127 [ _32 ])
(nil)))
(insn 104 34 35 5 (set (reg:V16QI 174 [ _27 ])
(unspec:V16QI [
(reg:V16QI 173 [ _32 ]) repeated x2
(reg:V16QI 152)
] UNSPEC_VPERM)) 
 {altivec_vperm_v16qi_direct}


After the change:

(insn 103 30 135 5 (set (reg:OO 127 [ _32 ])
(mem:OO (reg:DI 130 [ ivtmp.37 ]) [1 MEM  
[(short unsigned int *)_55]+0 S16 A128])) {*movoo}
 (nil))
(insn 135 103 34 5 (set (reg:DI 155)
(plus:DI (reg:DI 130 [ ivtmp.37 ])
(const_int 16 [0x10]))) 66 {*adddi3}
 (nil))
(insn 34 135 104 5 (set (reg:V16QI 143 [ _27 ])
(unspec:V16QI [
(subreg:V16QI (reg:OO 127 [ _32 ]) 16)
(subreg:V16QI (reg:OO 127 [ _32 ]) 16)
(reg:V16QI 152)
] UNSPEC_VPERM)) {altivec_vperm_v16qi_direct}
 (expr_list:REG_DEAD (reg:OO 127 [ _32 ])
(nil)))
(insn 104 34 35 5 (set (reg:V16QI 174 [ _27 ])
(unspec:V16QI [
(subreg:V16QI (reg:OO 127 [ _32 ]) 0)
(subreg:V16QI (reg:OO 127 [ _32 ]) 0)
(reg:V16QI 152)
] UNSPEC_VPERM))  {altivec_vperm_v16qi_direct}

After the change the tests pass.
 
> Thanks,
> Richard

Thanks & Regards
Ajit

Thanks & Regards
Ajit


Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-06-03 Thread Ajit Agarwal
Hello Richard:

On 03/06/24 5:03 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>>> [...]
>>> If it is intentional, what distinguishes things like vperm and xxinsertw
>>> (and all other unspecs) from plain addition?
>>>
>>>   [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
>>> (plus:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
>>> (match_operand:VSX_F 2 "vsx_register_operand" "wa")))]
>>>
>>
>> Plain addition are not supported currently.
>> We have not seen many cases with plain addition and this patch
>> will not accept plain addition.
>>
>>  
>>> This is why the intention behind the patch is important.  As it stands,
>>> it isn't clear what criteria the patch is using to distinguish "valid"
>>> fuse candidates from "invalid" ones.
>>>
>>
>> Intention behind this patch all variants of UNSPEC instructions are
>> supported and uses without UNSPEC are not supported in this patch.
> 
> But why make the distinction this way though?  UNSPEC is a very
> GCC-specific concept.  Whether something is an UNSPEC or some other
> RTL code depends largely on historical accident.  E.g. we have specific
> codes for VEC_SELECT, VEC_MERGE, and VEC_DUPLICATE, but don't have one
> for VEC_PERM (even though VEC_PERM_EXPR exists in gimple).
> 
> It seems unlikely that GCC's choice about whether to represent something
> as an UNSPEC or as another RTL code lines up neatly with the kind of
> codegen decisions that a good assembly programmer would make.
> 
> I suppose another way of asking is to turn this around and say: what
> kind of uses are you trying to exclude?  Presumably things are worse
> if you remove this function override.  But what makes them worse?
> What kind of uses cause the regression?
> 

Uses of fused load where load with low address uses are modified with load with 
high address uses.

Similarly load with high address uses are modified with load low address
uses.

This is the semantics of lxvp instructions which can occur through
UNSPEC uses otherwise it breaks the functionality and seen failure
in almost all vect regressions and SPEC benchmarks.


>>>>>>>> [...]
>>>>>>>> +  // Given insn_info pair I1 and I2, return true if offsets are in 
>>>>>>>> order.
>>>>>>>> +  virtual bool should_handle_unordered_insns (rtl_ssa::insn_info *i1,
>>>>>>>> +rtl_ssa::insn_info *i2) = 
>>>>>>>> 0;
>>>>>>>> +
>>>>>>>
>>>>>>> This name seems a bit misleading.  The function is used in:
>>>>>>>
>>>>>>> @@ -2401,6 +2405,9 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, 
>>>>>>> unsigned access_size,
>>>>>>>reversed = true;
>>>>>>>  }
>>>>>>>  
>>>>>>> +  if (!m_pass->should_handle_unordered_insns (i1, i2))
>>>>>>> +return false;
>>>>>>> +
>>>>>>>rtx cand_mems[2];
>>>>>>>rtx reg_ops[2];
>>>>>>>rtx pats[2];
>>>>>>>
>>>>>>> and so it acts as a general opt-out.  The insns aren't known to be 
>>>>>>> unordered.
>>>>>>>
>>>>>>> It looks like the rs6000 override requires the original insns to be
>>>>>>> in offset order.  Could you say why that's necessary?  (Both in email
>>>>>>> and as a comment in the code.)
>>>>>>>
>>>>>>
>>>>>> Yes rs6000 requires the original load insns to be in offset order.
>>>>>> Some regression tests like vect-outer-4f fails if we do load pair
>>>>>> fusion with load offsets are not in offset order as this breaks lxvp 
>>>>>> semantics.
>>>>>
>>>>> How does it break the semantics though?  In principle, the generic code
>>>>> only fuses if it has "proved" that the loads can happen in either order.
>>>>> So it shouldn't matter which order the hardware does things in.
>>>>>
>>>>> Could you give an example of the kind of situation that you want
>>>>> to avoid, and why it generates the wrong result?
>>>>>
>>>>
>>>> (insn 31 62 32 2 (set (reg:V1

Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-06-03 Thread Ajit Agarwal
Hello Richard:

On 03/06/24 2:07 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Richard:
>> On 31/05/24 8:08 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> On 31/05/24 3:23 pm, Richard Sandiford wrote:
>>>>> Ajit Agarwal  writes:
>>>>>> Hello All:
>>>>>>
>>>>>> Common infrastructure using generic code for pair mem fusion of different
>>>>>> targets.
>>>>>>
>>>>>> rs6000 target specific specific code implements virtual functions defined
>>>>>> by generic code.
>>>>>>
>>>>>> Code is implemented with pure virtual functions to interface with target
>>>>>> code.
>>>>>>
>>>>>> Target specific code are added in rs6000-mem-fusion.cc and additional 
>>>>>> virtual
>>>>>> function implementation required for rs6000 are added in 
>>>>>> aarch64-ldp-fusion.cc.
>>>>>>
>>>>>> Bootstrapped and regtested for aarch64-linux-gnu and powerpc64-linux-gnu.
>>>>>>
>>>>>> Thanks & Regards
>>>>>> Ajit
>>>>>>
>>>>>>
>>>>>> aarch64, rs6000, middle-end: Add implementation for different targets 
>>>>>> for pair mem fusion
>>>>>>
>>>>>> Common infrastructure using generic code for pair mem fusion of different
>>>>>> targets.
>>>>>>
>>>>>> rs6000 target specific specific code implements virtual functions defined
>>>>>> by generic code.
>>>>>>
>>>>>> Code is implemented with pure virtual functions to interface with target
>>>>>> code.
>>>>>>
>>>>>> Target specific code are added in rs6000-mem-fusion.cc and additional 
>>>>>> virtual
>>>>>> function implementation required for rs6000 are added in 
>>>>>> aarch64-ldp-fusion.cc.
>>>>>>
>>>>>> 2024-05-31  Ajit Kumar Agarwal  
>>>>>>
>>>>>> gcc/ChangeLog:
>>>>>>
>>>>>>  * config/aarch64/aarch64-ldp-fusion.cc: Add target specific
>>>>>>  implementation of additional virtual functions added in pair_fusion
>>>>>>  struct.
>>>>>>  * config/rs6000/rs6000-passes.def: New mem fusion pass
>>>>>>  before pass_early_remat.
>>>>>>  * config/rs6000/rs6000-mem-fusion.cc: Add new pass.
>>>>>>  Add target specific implementation using pure virtual
>>>>>>  functions.
>>>>>>  * config.gcc: Add new object file.
>>>>>>  * config/rs6000/rs6000-protos.h: Add new prototype for mem
>>>>>>  fusion pass.
>>>>>>  * config/rs6000/t-rs6000: Add new rule.
>>>>>>  * rtl-ssa/accesses.h: Moved set_is_live_out_use as public
>>>>>>  from private.
>>>>>>
>>>>>> gcc/testsuite/ChangeLog:
>>>>>>
>>>>>>  * g++.target/powerpc/mem-fusion.C: New test.
>>>>>>  * g++.target/powerpc/mem-fusion-1.C: New test.
>>>>>>  * gcc.target/powerpc/mma-builtin-1.c: Modify test.
>>>>>> ---
>>>>>
>>>>> This isn't a complete review, just some initial questions & comments
>>>>> about selected parts.
>>>>>
>>>>>> [...]
>>>>>> +/* Check whether load can be fusable or not.
>>>>>> +   Return true if dependent use is UNSPEC otherwise false.  */
>>>>>> +bool
>>>>>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>>>>>> +{
>>>>>> +  rtx_insn *insn = info->rtl ();
>>>>>> +
>>>>>> +  for (rtx note = REG_NOTES (insn); note; note = XEXP (note, 1))
>>>>>> +if (REG_NOTE_KIND (note) == REG_EQUAL
>>>>>> +|| REG_NOTE_KIND (note) == REG_EQUIV)
>>>>>> +  return false;
>>>>>
>>>>> It's unusual to punt on an optimisation because of a REG_EQUAL/EQUIV
>>>>> note.  What's the reason for doing this?  Are you trying to avoid
>>>>> fusing pairs before reload that are equivalent to a MEM (i.e. have
>>>>> a natural spill slot)?  I think Alex hit a similar sit

[Patch, aarch64 1/1] Additional interface function implementation

2024-06-02 Thread Ajit Agarwal
Hello All:

Common infrastructure using generic code for pair mem fusion of different
targets.

Implements additional interface virtual function implementation
required for rs6000 target.

Tested for aarch64-linux-gnu.

Thanks & Regards
Ajit

aarch64: Additional interface function implementation

Common infrastructure using generic code for pair mem fusion of different
targets.

Implements additional interface virtual function implementation
required for rs6000 target.

2024-06-02  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Add target specific
implementation of additional virtual functions added in pair_fusion
struct.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 0af927231d3..784cdc3937c 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -104,6 +104,29 @@ struct aarch64_pair_fusion : public pair_fusion
  bool load_p) override final;
 
   rtx destructure_pair (rtx regs[2], rtx pattern, bool load_p) override final;
+
+  bool should_handle_unordered_insns (rtl_ssa::insn_info *,
+ rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  bool fuseable_store_p (rtl_ssa::insn_info *,
+rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  bool fuseable_load_p (rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  void set_multiword_subreg (rtl_ssa::insn_info *, rtl_ssa::insn_info *,
+bool) override final
+  {
+return;
+  }
 };
 
 bool
-- 
2.43.0





Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-06-02 Thread Ajit Agarwal
Hello Richard:

On 31/05/24 3:23 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello All:
>>
>> Common infrastructure using generic code for pair mem fusion of different
>> targets.
>>
>> rs6000 target specific code implements virtual functions defined
>> by generic code.
>>
>> Code is implemented with pure virtual functions to interface with target
>> code.
>>
>> Target specific code are added in rs6000-mem-fusion.cc and additional virtual
>> function implementation required for rs6000 are added in 
>> aarch64-ldp-fusion.cc.
>>
>> Bootstrapped and regtested for aarch64-linux-gnu and powerpc64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>> aarch64, rs6000, middle-end: Add implementation for different targets for 
>> pair mem fusion
>>
>> Common infrastructure using generic code for pair mem fusion of different
>> targets.
>>
>> rs6000 target specific code implements virtual functions defined
>> by generic code.
>>
>> Code is implemented with pure virtual functions to interface with target
>> code.
>>
>> Target specific code are added in rs6000-mem-fusion.cc and additional virtual
>> function implementation required for rs6000 are added in 
>> aarch64-ldp-fusion.cc.
>>
>> 2024-05-31  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-ldp-fusion.cc: Add target specific
>>  implementation of additional virtual functions added in pair_fusion
>>  struct.
>>  * config/rs6000/rs6000-passes.def: New mem fusion pass
>>  before pass_early_remat.
>>  * config/rs6000/rs6000-mem-fusion.cc: Add new pass.
>>  Add target specific implementation using pure virtual
>>  functions.
>>  * config.gcc: Add new object file.
>>  * config/rs6000/rs6000-protos.h: Add new prototype for mem
>>  fusion pass.
>>  * config/rs6000/t-rs6000: Add new rule.
>>  * rtl-ssa/accesses.h: Moved set_is_live_out_use as public
>>  from private.
>>
>> gcc/testsuite/ChangeLog:
>>
>>  * g++.target/powerpc/mem-fusion.C: New test.
>>  * g++.target/powerpc/mem-fusion-1.C: New test.
>>  * gcc.target/powerpc/mma-builtin-1.c: Modify test.
>> ---
> 
> This isn't a complete review, just some initial questions & comments
> about selected parts.
> 
>> [...]
>> +/* Check whether load can be fusable or not.
>> +   Return true if dependent use is UNSPEC otherwise false.  */
>> +bool
>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>> +{
>> +  rtx_insn *insn = info->rtl ();
>> +
>> +  for (rtx note = REG_NOTES (insn); note; note = XEXP (note, 1))
>> +if (REG_NOTE_KIND (note) == REG_EQUAL
>> +|| REG_NOTE_KIND (note) == REG_EQUIV)
>> +  return false;
> 
> It's unusual to punt on an optimisation because of a REG_EQUAL/EQUIV
> note.  What's the reason for doing this?  Are you trying to avoid
> fusing pairs before reload that are equivalent to a MEM (i.e. have
> a natural spill slot)?  I think Alex hit a similar situation.
> 

Removed the above check and addressed in the new patch sent.
>> +
>> +  for (auto def : info->defs ())
>> +{
>> +  auto set = dyn_cast (def);
>> +  if (set && set->has_any_uses ())
>> +{
>> +  for (auto use : set->all_uses())
> 
> Nit: has_any_uses isn't necessary: the inner loop will simply do nothing
> in that case.  Also, we can/should restrict the scan to non-debug uses.
> 
> This can then be:
> 
>   for (auto def : info->defs ())
> if (auto set = dyn_cast (def))
>   for (auto use : set->nondebug_insn_uses())
> 
>> +{
>> +  if (use->insn ()->is_artificial ())
>> +return false;
>> +
>> +   insn_info *info = use->insn ();
>> +
>> +   if (info
>> +   && info->rtl ()
> 
> This test shouldn't be necessary.
> 
>> +   && info->is_real ())
>> +  {
>> +rtx_insn *rtl_insn = info->rtl ();
>> +rtx set = single_set (rtl_insn);
>> +
>> +if (set == NULL_RTX)
>> +  return false;
>> +
>> +rtx op0 = SET_SRC (set);
>> +if (GET_CODE (op0) != UNSPEC)
>> +  return false;
> 
> What's the motivation for rejecting unspecs?  It's unusual to treat
> all unspecs as a dist

[Patch, aarch64 1/2] aarch64: Additional interface function implementation

2024-06-02 Thread Ajit Agarwal
Hello All:

Common infrastructure using generic code for pair mem fusion of different
targets.

Implements additional interface virtual function implementation
required for rs6000 target.

Tested for aarch64-linux-gnu.

Thanks & Regards
Ajit

aarch64: Additional interface function implementation.

Common infrastructure using generic code for pair mem fusion of different
targets.

Implements additional interface virtual function implementation
required for rs6000 target.

2024-06-02  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Add target specific
implementation of additional virtual functions added in pair_fusion
struct.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 0af927231d3..784cdc3937c 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -104,6 +104,29 @@ struct aarch64_pair_fusion : public pair_fusion
  bool load_p) override final;
 
   rtx destructure_pair (rtx regs[2], rtx pattern, bool load_p) override final;
+
+  bool should_handle_unordered_insns (rtl_ssa::insn_info *,
+ rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  bool fuseable_store_p (rtl_ssa::insn_info *,
+rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  bool fuseable_load_p (rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  void set_multiword_subreg (rtl_ssa::insn_info *, rtl_ssa::insn_info *,
+bool) override final
+  {
+return;
+  }
 };
 
 bool
-- 
2.43.0



[Patch, rs6000, middle-end 0/1] Add implementation for different targets for pair mem fusion

2024-06-02 Thread Ajit Agarwal
Hello All:


All comments are addressed and patch is split into rs6000 and aarch64 target 
changes.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implements virtual functions defined by generic 
code.

Target specific code are added in rs6000-mem-fusion.cc.

Tested for powerpc64-linux-gnu.

Thanks & Regards
Ajit

rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implements virtual functions defined by generic 
code.

Target specific code are added in rs6000-mem-fusion.cc.

2024-06-02  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function addded for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/accesses.h: Moved set_is_live_out_use from private
to public.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/rs6000-mem-fusion.cc| 651 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  26 +-
 gcc/pair-fusion.h |  20 +
 gcc/rtl-ssa/accesses.h|   2 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 11 files changed, 743 insertions(+), 9 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index e500ba63e32..348308b2e93 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc 
b/gcc/config/rs6000/rs6000-mem-fusion.cc
new file mode 100644
index 000..45795cd48c4
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-mem-fusion.cc
@@ -0,0 +1,651 @@
+/* Subroutines used to perform adjacent load/store into
+   paired memory accesses for TARGET_POWER10 and TARGET_VSX.
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   .  */
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include 

Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-06-01 Thread Ajit Agarwal
Hello Richard:

On 31/05/24 10:29 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 31/05/24 8:08 pm, Richard Sandiford wrote:
>> Ajit Agarwal  writes:
>>> On 31/05/24 3:23 pm, Richard Sandiford wrote:
>>>> Ajit Agarwal  writes:
>>>>> Hello All:
>>>>>
>>>>> Common infrastructure using generic code for pair mem fusion of different
>>>>> targets.
>>>>>
>>>>> rs6000 target specific specific code implements virtual functions defined
>>>>> by generic code.
>>>>>
>>>>> Code is implemented with pure virtual functions to interface with target
>>>>> code.
>>>>>
>>>>> Target specific code are added in rs6000-mem-fusion.cc and additional 
>>>>> virtual
>>>>> function implementation required for rs6000 are added in 
>>>>> aarch64-ldp-fusion.cc.
>>>>>
>>>>> Bootstrapped and regtested for aarch64-linux-gnu and powerpc64-linux-gnu.
>>>>>
>>>>> Thanks & Regards
>>>>> Ajit
>>>>>
>>>>>
>>>>> aarch64, rs6000, middle-end: Add implementation for different targets for 
>>>>> pair mem fusion
>>>>>
>>>>> Common infrastructure using generic code for pair mem fusion of different
>>>>> targets.
>>>>>
>>>>> rs6000 target specific specific code implements virtual functions defined
>>>>> by generic code.
>>>>>
>>>>> Code is implemented with pure virtual functions to interface with target
>>>>> code.
>>>>>
>>>>> Target specific code are added in rs6000-mem-fusion.cc and additional 
>>>>> virtual
>>>>> function implementation required for rs6000 are added in 
>>>>> aarch64-ldp-fusion.cc.
>>>>>
>>>>> 2024-05-31  Ajit Kumar Agarwal  
>>>>>
>>>>> gcc/ChangeLog:
>>>>>
>>>>>   * config/aarch64/aarch64-ldp-fusion.cc: Add target specific
>>>>>   implementation of additional virtual functions added in pair_fusion
>>>>>   struct.
>>>>>   * config/rs6000/rs6000-passes.def: New mem fusion pass
>>>>>   before pass_early_remat.
>>>>>   * config/rs6000/rs6000-mem-fusion.cc: Add new pass.
>>>>>   Add target specific implementation using pure virtual
>>>>>   functions.
>>>>>   * config.gcc: Add new object file.
>>>>>   * config/rs6000/rs6000-protos.h: Add new prototype for mem
>>>>>   fusion pass.
>>>>>   * config/rs6000/t-rs6000: Add new rule.
>>>>>   * rtl-ssa/accesses.h: Moved set_is_live_out_use as public
>>>>>   from private.
>>>>>
>>>>> gcc/testsuite/ChangeLog:
>>>>>
>>>>>   * g++.target/powerpc/me-fusion.C: New test.
>>>>>   * g++.target/powerpc/mem-fusion-1.C: New test.
>>>>>   * gcc.target/powerpc/mma-builtin-1.c: Modify test.
>>>>> ---
>>>>
>>>> This isn't a complete review, just some initial questions & comments
>>>> about selected parts.
>>>>
>>>>> [...]
>>>>> +/* Check whether load can be fusable or not.
>>>>> +   Return true if dependent use is UNSPEC otherwise false.  */
>>>>> +bool
>>>>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>>>>> +{
>>>>> +  rtx_insn *insn = info->rtl ();
>>>>> +
>>>>> +  for (rtx note = REG_NOTES (insn); note; note = XEXP (note, 1))
>>>>> +if (REG_NOTE_KIND (note) == REG_EQUAL
>>>>> + || REG_NOTE_KIND (note) == REG_EQUIV)
>>>>> +  return false;
>>>>
>>>> It's unusual to punt on an optimisation because of a REG_EQUAL/EQUIV
>>>> note.  What's the reason for doing this?  Are you trying to avoid
>>>> fusing pairs before reload that are equivalent to a MEM (i.e. have
>>>> a natural spill slot)?  I think Alex hit a similar situation.
>>>>
>>>
>>> We have used the above check because of some SPEC benchmarks failing with
>>> with MEM pairs having REG_EQUAL/EQUIV notes.
>>>
>>> By adding the checks the benchmarks passes and also it improves the
>>> performance.
>>>
>>> This checks were added during initial implementation of pair fusion
>>

Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-05-31 Thread Ajit Agarwal
Hello Richard:

On 31/05/24 8:08 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> On 31/05/24 3:23 pm, Richard Sandiford wrote:
>>> Ajit Agarwal  writes:
>>>> Hello All:
>>>>
>>>> Common infrastructure using generic code for pair mem fusion of different
>>>> targets.
>>>>
>>>> rs6000 target specific specific code implements virtual functions defined
>>>> by generic code.
>>>>
>>>> Code is implemented with pure virtual functions to interface with target
>>>> code.
>>>>
>>>> Target specific code are added in rs6000-mem-fusion.cc and additional 
>>>> virtual
>>>> function implementation required for rs6000 are added in 
>>>> aarch64-ldp-fusion.cc.
>>>>
>>>> Bootstrapped and regtested for aarch64-linux-gnu and powerpc64-linux-gnu.
>>>>
>>>> Thanks & Regards
>>>> Ajit
>>>>
>>>>
>>>> aarch64, rs6000, middle-end: Add implementation for different targets for 
>>>> pair mem fusion
>>>>
>>>> Common infrastructure using generic code for pair mem fusion of different
>>>> targets.
>>>>
>>>> rs6000 target specific specific code implements virtual functions defined
>>>> by generic code.
>>>>
>>>> Code is implemented with pure virtual functions to interface with target
>>>> code.
>>>>
>>>> Target specific code are added in rs6000-mem-fusion.cc and additional 
>>>> virtual
>>>> function implementation required for rs6000 are added in 
>>>> aarch64-ldp-fusion.cc.
>>>>
>>>> 2024-05-31  Ajit Kumar Agarwal  
>>>>
>>>> gcc/ChangeLog:
>>>>
>>>>* config/aarch64/aarch64-ldp-fusion.cc: Add target specific
>>>>implementation of additional virtual functions added in pair_fusion
>>>>struct.
>>>>* config/rs6000/rs6000-passes.def: New mem fusion pass
>>>>before pass_early_remat.
>>>>* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
>>>>Add target specific implementation using pure virtual
>>>>functions.
>>>>* config.gcc: Add new object file.
>>>>* config/rs6000/rs6000-protos.h: Add new prototype for mem
>>>>fusion pass.
>>>>* config/rs6000/t-rs6000: Add new rule.
>>>>* rtl-ssa/accesses.h: Moved set_is_live_out_use as public
>>>>from private.
>>>>
>>>> gcc/testsuite/ChangeLog:
>>>>
>>>>* g++.target/powerpc/me-fusion.C: New test.
>>>>* g++.target/powerpc/mem-fusion-1.C: New test.
>>>>* gcc.target/powerpc/mma-builtin-1.c: Modify test.
>>>> ---
>>>
>>> This isn't a complete review, just some initial questions & comments
>>> about selected parts.
>>>
>>>> [...]
>>>> +/* Check whether load can be fusable or not.
>>>> +   Return true if dependent use is UNSPEC otherwise false.  */
>>>> +bool
>>>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>>>> +{
>>>> +  rtx_insn *insn = info->rtl ();
>>>> +
>>>> +  for (rtx note = REG_NOTES (insn); note; note = XEXP (note, 1))
>>>> +if (REG_NOTE_KIND (note) == REG_EQUAL
>>>> +  || REG_NOTE_KIND (note) == REG_EQUIV)
>>>> +  return false;
>>>
>>> It's unusual to punt on an optimisation because of a REG_EQUAL/EQUIV
>>> note.  What's the reason for doing this?  Are you trying to avoid
>>> fusing pairs before reload that are equivalent to a MEM (i.e. have
>>> a natural spill slot)?  I think Alex hit a similar situation.
>>>
>>
>> We have used the above check because of some SPEC benchmarks failing with
>> with MEM pairs having REG_EQUAL/EQUIV notes.
>>
>> By adding the checks the benchmarks passes and also it improves the
>> performance.
>>
>> This checks were added during initial implementation of pair fusion
>> pass.
>>
>> I will investigate further if this check is still required or not.
> 
> Thanks.  If it does affect SPEC results, it would be good to look
> at the underlying reason, as a justification for the check.
> 
> AIUI, the case Alex hit was due to the way that the RA recognises:
> 
>   (set (reg R) (mem address-of-a-stack-variable))
> REG_EQUIV: (mem address-of-a-stack-variable)
> 
> where the REG_

Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-05-31 Thread Ajit Agarwal
Hello Richard:

On 31/05/24 3:23 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello All:
>>
>> Common infrastructure using generic code for pair mem fusion of different
>> targets.
>>
>> rs6000 target specific specific code implements virtual functions defined
>> by generic code.
>>
>> Code is implemented with pure virtual functions to interface with target
>> code.
>>
>> Target specific code are added in rs6000-mem-fusion.cc and additional virtual
>> function implementation required for rs6000 are added in 
>> aarch64-ldp-fusion.cc.
>>
>> Bootstrapped and regtested for aarch64-linux-gnu and powerpc64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>> aarch64, rs6000, middle-end: Add implementation for different targets for 
>> pair mem fusion
>>
>> Common infrastructure using generic code for pair mem fusion of different
>> targets.
>>
>> rs6000 target specific specific code implements virtual functions defined
>> by generic code.
>>
>> Code is implemented with pure virtual functions to interface with target
>> code.
>>
>> Target specific code are added in rs6000-mem-fusion.cc and additional virtual
>> function implementation required for rs6000 are added in 
>> aarch64-ldp-fusion.cc.
>>
>> 2024-05-31  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-ldp-fusion.cc: Add target specific
>>  implementation of additional virtual functions added in pair_fusion
>>  struct.
>>  * config/rs6000/rs6000-passes.def: New mem fusion pass
>>  before pass_early_remat.
>>  * config/rs6000/rs6000-mem-fusion.cc: Add new pass.
>>  Add target specific implementation using pure virtual
>>  functions.
>>  * config.gcc: Add new object file.
>>  * config/rs6000/rs6000-protos.h: Add new prototype for mem
>>  fusion pass.
>>  * config/rs6000/t-rs6000: Add new rule.
>>  * rtl-ssa/accesses.h: Moved set_is_live_out_use as public
>>  from private.
>>
>> gcc/testsuite/ChangeLog:
>>
>>  * g++.target/powerpc/me-fusion.C: New test.
>>  * g++.target/powerpc/mem-fusion-1.C: New test.
>>  * gcc.target/powerpc/mma-builtin-1.c: Modify test.
>> ---
> 
> This isn't a complete review, just some initial questions & comments
> about selected parts.
> 
>> [...]
>> +/* Check whether load can be fusable or not.
>> +   Return true if dependent use is UNSPEC otherwise false.  */
>> +bool
>> +rs6000_pair_fusion::fuseable_load_p (insn_info *info)
>> +{
>> +  rtx_insn *insn = info->rtl ();
>> +
>> +  for (rtx note = REG_NOTES (insn); note; note = XEXP (note, 1))
>> +if (REG_NOTE_KIND (note) == REG_EQUAL
>> +|| REG_NOTE_KIND (note) == REG_EQUIV)
>> +  return false;
> 
> It's unusual to punt on an optimisation because of a REG_EQUAL/EQUIV
> note.  What's the reason for doing this?  Are you trying to avoid
> fusing pairs before reload that are equivalent to a MEM (i.e. have
> a natural spill slot)?  I think Alex hit a similar situation.
> 

We have used the above check because of some SPEC benchmarks failing
with MEM pairs having REG_EQUAL/EQUIV notes.

By adding the checks the benchmarks passes and also it improves the
performance.

These checks were added during the initial implementation of the pair fusion
pass.

I will investigate further if this check is still required or not.

Sorry for the inconvenience caused.

>> +
>> +  for (auto def : info->defs ())
>> +{
>> +  auto set = dyn_cast (def);
>> +  if (set && set->has_any_uses ())
>> +{
>> +  for (auto use : set->all_uses())
> 
> Nit: has_any_uses isn't necessary: the inner loop will simply do nothing
> in that case.  Also, we can/should restrict the scan to non-debug uses.
> 
> This can then be:
> 
>   for (auto def : info->defs ())
> if (auto set = dyn_cast (def))
>   for (auto use : set->nondebug_insn_uses())
> 

Sure. I will change as above.

>> +{
>> +  if (use->insn ()->is_artificial ())
>> +return false;
>> +
>> +   insn_info *info = use->insn ();
>> +
>> +   if (info
>> +   && info->rtl ()
> 
> This test shouldn't be necessary.
> 

Sure I will remove this check.

>> +   && info->is_real ())
>> +  {
>> +rtx_insn *rtl_insn = info->rtl ();
>> +rtx set = single_set 

[Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-05-30 Thread Ajit Agarwal
Hello All:

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implements virtual functions defined
by generic code.

Code is implemented with pure virtual functions to interface with target
code.

Target specific code are added in rs6000-mem-fusion.cc and additional virtual
function implementation required for rs6000 are added in aarch64-ldp-fusion.cc.

Bootstrapped and regtested for aarch64-linux-gnu and powerpc64-linux-gnu.

Thanks & Regards
Ajit


aarch64, rs6000, middle-end: Add implementation for different targets for pair 
mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implements virtual functions defined
by generic code.

Code is implemented with pure virtual functions to interface with target
code.

Target specific code are added in rs6000-mem-fusion.cc and additional virtual
function implementation required for rs6000 are added in aarch64-ldp-fusion.cc.

2024-05-31  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Add target specific
implementation of additional virtual functions added in pair_fusion
struct.
* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation using pure virtual
functions.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/accesses.h: Moved set_is_live_out_use as public
from private.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/me-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/aarch64/aarch64-ldp-fusion.cc  |  23 +
 gcc/config/rs6000/rs6000-mem-fusion.cc| 629 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  18 +-
 gcc/pair-fusion.h |  20 +
 gcc/rtl-ssa/accesses.h|   2 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 12 files changed, 740 insertions(+), 5 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index a37113bd00a..1beabc35d52 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -524,6 +524,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -560,6 +561,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 0af927231d3..784cdc3937c 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -104,6 +104,29 @@ struct aarch64_pair_fusion : public pair_fusion
  bool load_p) override final;
 
   rtx destructure_pair (rtx regs[2], rtx pattern, bool load_p) override final;
+
+  bool should_handle_unordered_insns (rtl_ssa::insn_info *,
+ rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  bool fuseable_store_p (rtl_ssa::insn_info *,
+rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  bool fuseable_load_p (rtl_ssa::insn_info *) override final
+  {
+return true;
+  }
+
+  void set_multiword_subreg (rtl_ssa::insn_info *, rtl_ssa::insn_info *,
+bool) 

Re: [Patch, aarch64, middle-end\ v4: Move pair_fusion pass from aarch64 to middle-end

2024-05-30 Thread Ajit Agarwal
Hello Richard:

On 30/05/24 4:44 pm, Richard Sandiford wrote:
> Thanks for the update.  Some comments below, but looks very close
> to ready.
> 

Thanks a lot.

> Ajit Agarwal  writes:
>> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
>> new file mode 100644
>> index 000..060fd95
>> --- /dev/null
>> +++ b/gcc/pair-fusion.cc
>> @@ -0,0 +1,3012 @@
>> +// Pass to fuse adjacent loads/stores into paired memory accesses.
>> +// Copyright (C) 2024 Free Software Foundation, Inc.
> 
> This should probably be 2023-2024, since it's based on code
> contributed in 2023.
> 

Addressed in v5 of the patch.

>> +//
>> +// This file is part of GCC.
>> +//
>> +// GCC is free software; you can redistribute it and/or modify it
>> +// under the terms of the GNU General Public License as published by
>> +// the Free Software Foundation; either version 3, or (at your option)
>> +// any later version.
>> +//
>> +// GCC is distributed in the hope that it will be useful, but
>> +// WITHOUT ANY WARRANTY; without even the implied warranty of
>> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +// General Public License for more details.
>> +//
>> +// You should have received a copy of the GNU General Public License
>> +// along with GCC; see the file COPYING3.  If not see
>> +// <http://www.gnu.org/licenses/>.
>> +
>> +#define INCLUDE_ALGORITHM
>> +#define INCLUDE_FUNCTIONAL
>> +#define INCLUDE_LIST
>> +#define INCLUDE_TYPE_TRAITS
>> +#include "config.h"
>> +#include "system.h"
>> +#include "coretypes.h"
>> +#include "backend.h"
>> +#include "rtl.h"
>> +#include "df.h"
>> +#include "rtl-iter.h"
>> +#include "rtl-ssa.h"
>> +#include "cfgcleanup.h"
>> +#include "tree-pass.h"
>> +#include "ordered-hash-map.h"
>> +#include "tree-dfa.h"
>> +#include "fold-const.h"
>> +#include "tree-hash-traits.h"
>> +#include "print-tree.h"
>> +#include "pair-fusion.h"
>> +
>> +using namespace rtl_ssa;
>> +
>> +// We pack these fields (load_p, fpsimd_p, and size) into an integer
>> +// (LFS) which we use as part of the key into the main hash tables.
>> +//
>> +// The idea is that we group candidates together only if they agree on
>> +// the fields below.  Candidates that disagree on any of these
>> +// properties shouldn't be merged together.
>> +struct lfs_fields
>> +{
>> +  bool load_p;
>> +  bool fpsimd_p;
>> +  unsigned size;
>> +};
>> +
>> +using insn_list_t = std::list;
>> +
>> +// Information about the accesses at a given offset from a particular
>> +// base.  Stored in an access_group, see below.
>> +struct access_record
>> +{
>> +  poly_int64 offset;
>> +  std::list cand_insns;
>> +  std::list::iterator place;
>> +
>> +  access_record (poly_int64 off) : offset (off) {}
>> +};
>> +
>> +// A group of accesses where adjacent accesses could be ldp/stp
>> +// candidates.  The splay tree supports efficient insertion,
>> +// while the list supports efficient iteration.
>> +struct access_group
>> +{
>> +  splay_tree tree;
>> +  std::list list;
>> +
>> +  template
>> +  inline void track (Alloc node_alloc, poly_int64 offset, insn_info *insn);
>> +};
>> +
>> +// Test if this base candidate is viable according to HAZARDS.
>> +bool base_cand::viable () const
> 
> Formating nit, should be:
> 
> bool
> base_cand::viable () const
>

Addressed in v5 of the patch.

 
>> +{
>> +  return !hazards[0] || !hazards[1] || (*hazards[0] > *hazards[1]);
>> +}
>> [...]
>> +void
>> +pair_fusion_bb_info::transform ()
>> +{
>> +  traverse_base_map (expr_map);
>> +  traverse_base_map (def_map);
>> +}
>> +
>> +// the base register which we can fold in to make this pair use
>> +// a writeback addressing mode.
> 
> The first line of this comment is missing.  It should be:
> 
> // Given an existing pair insn INSN, look for a trailing update of
> 

Addressed in v5 of the patch.

>> [...]
>> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
>> new file mode 100644
>> index 000..f295fdbdb8f
>> --- /dev/null
>> +++ b/gcc/pair-fusion.h
>> @@ -0,0 +1,195 @@
>> +// Pass to fuse adjacent loads/stores into paired memory accesses.
>> +//
>> +// This file contains th

Re: [Patch, aarch64, middle-end] v3: Move pair_fusion pass from aarch64 to middle-end

2024-05-24 Thread Ajit Agarwal
Hello Alex:

On 22/05/24 3:30 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> You need to remove the header dependencies that are no longer required
> for aarch64-ldp-fusion.o in t-aarch64 (not forgetting to update the
> ChangeLog).  A few other minor nits below.
> 
> LGTM with those changes, but you'll need Richard S to approve.
> 
> Thanks a lot for doing this.
> 
> On 22/05/2024 00:16, Ajit Agarwal wrote:
>> Hello Alex/Richard:
>>
>> All comments are addressed.
>>
>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>> to support multiple targets.
>>
>> Common infrastructure of load store pair fusion is divided into
>> target independent and target dependent code.
>>
>> Target independent code is structured in the following files.
>> gcc/pair-fusion.h
>> gcc/pair-fusion.cc
>>
>> Target independent code is the Generic code with pure virtual
>> function to interface between target independent and dependent
>> code.
>>
>> Bootstrapped and regtested on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>>
>> aarch64, middle-end: Move pair_fusion pass from aarch64 to middle-end
>>
>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>> to support multiple targets.
>>
>> Common infrastructure of load store pair fusion is divided into
>> target independent and target dependent code.
>>
>> Target independent code is structured in the following files.
>> gcc/pair-fusion.h
>> gcc/pair-fusion.cc
>>
>> Target independent code is the Generic code with pure virtual
>> function to interface between target independent and dependent
>> code.
>>
>> 2024-05-22  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * pair-fusion.h: Generic header code for load store pair fusion
>>  that can be shared across different architectures.
>>  * pair-fusion.cc: Generic source code implementation for
>>  load store pair fusion that can be shared across different 
>> architectures.
>>  * Makefile.in: Add new object file pair-fusion.o.
>>  * config/aarch64/aarch64-ldp-fusion.cc: Delete generic code and move it
>>  to pair-fusion.cc in the middle-end.
>>  * config/aarch64/t-aarch64: Add header file dependency on pair-fusion.h.
>> ---
>>  gcc/Makefile.in  |1 +
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 3298 +-
>>  gcc/config/aarch64/t-aarch64 |2 +-
>>  gcc/pair-fusion.cc   | 3013 
>>  gcc/pair-fusion.h|  193 ++
>>  5 files changed, 3286 insertions(+), 3221 deletions(-)
>>  create mode 100644 gcc/pair-fusion.cc
>>  create mode 100644 gcc/pair-fusion.h
>>
>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
>> index a7f15694c34..643342f623d 100644
>> --- a/gcc/Makefile.in
>> +++ b/gcc/Makefile.in
>> @@ -1563,6 +1563,7 @@ OBJS = \
>>  ipa-strub.o \
>>  ipa.o \
>>  ira.o \
>> +pair-fusion.o \
>>  ira-build.o \
>>  ira-costs.o \
>>  ira-conflicts.o \
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 085366cdf68..0af927231d3 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> 
>> diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
>> index 78713558e7d..bdada08be70 100644
>> --- a/gcc/config/aarch64/t-aarch64
>> +++ b/gcc/config/aarch64/t-aarch64
>> @@ -203,7 +203,7 @@ aarch64-early-ra.o: 
>> $(srcdir)/config/aarch64/aarch64-early-ra.cc \
>>  aarch64-ldp-fusion.o: $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc \
>>  $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \
>>  $(RTL_SSA_H) cfgcleanup.h tree-pass.h ordered-hash-map.h tree-dfa.h \
>> -fold-const.h tree-hash-traits.h print-tree.h
>> +fold-const.h tree-hash-traits.h print-tree.h pair-fusion.h
> 
> So now you also need to remove the deps on the includes removed in the latest
> version of the patch.
>

Addressed in v4 of the patch.
 
>>  $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
>>  $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc
>>  
>> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
>> new file mode 100644
>> index 000..827b88cf2fc
>> --- /dev/null
>> +++ b/gcc/pair-fusion.cc
>> @@ -0,0 +1,3013 @@
>> +/

Re: [Patch, aarch64, middle-end] v2: Move pair_fusion pass from aarch64 to middle-end

2024-05-21 Thread Ajit Agarwal
Hello Alex:

On 21/05/24 10:22 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> I've left some more comments below.  It's getting there now, thanks for
> your patience.
> 
> On 21/05/2024 20:32, Ajit Agarwal wrote:
>> Hello Alex/Richard:
>>
>> All comments are addressed.
>>
>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>> to support multiple targets.
>>
>> Common infrastructure of load store pair fusion is divided into
>> target independent and target dependent code.
>>
>> Target independent code is structured in the following files.
>> gcc/pair-fusion.h
>> gcc/pair-fusion.cc
>>
>> Target independent code is the Generic code with pure virtual
>> function to interface between target independent and dependent
>> code.
>>
>> Bootstrapped and regtested on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>> aarch64, middle-end: Move pair_fusion pass from aarch64 to middle-end
>>
>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>> to support multiple targets.
>>
>> Common infrastructure of load store pair fusion is divided into
>> target independent and target dependent code.
>>
>> Target independent code is structured in the following files.
>> gcc/pair-fusion.h
>> gcc/pair-fusion.cc
>>
>> Target independent code is the Generic code with pure virtual
>> function to interface between target independent and dependent
>> code.
>>
>> 2024-05-21  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * pair-fusion.h: Generic header code for load store pair fusion
>>  that can be shared across different architectures.
>>  * pair-fusion.cc: Generic source code implementation for
>>  load store pair fusion that can be shared across different 
>> architectures.
>>  * Makefile.in: Add new object file pair-fusion.o.
>>  * config/aarch64/aarch64-ldp-fusion.cc: Delete generic code and move it
>>  to pair-fusion.cc in the middle-end.
>>  * config/aarch64/t-aarch64: Add header file dependency pair-fusion.h.
> 
> insert "on" after dependency.
> 

Addressed in v3 of the patch.
>> ---
>>  gcc/Makefile.in  |1 +
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 3282 +-
>>  gcc/config/aarch64/t-aarch64 |2 +-
>>  gcc/pair-fusion.cc   | 3013 
>>  gcc/pair-fusion.h|  189 ++
>>  5 files changed, 3280 insertions(+), 3207 deletions(-)
>>  create mode 100644 gcc/pair-fusion.cc
>>  create mode 100644 gcc/pair-fusion.h
>>
>> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
>> index a7f15694c34..643342f623d 100644
>> --- a/gcc/Makefile.in
>> +++ b/gcc/Makefile.in
>> @@ -1563,6 +1563,7 @@ OBJS = \
>>  ipa-strub.o \
>>  ipa.o \
>>  ira.o \
>> +pair-fusion.o \
>>  ira-build.o \
>>  ira-costs.o \
>>  ira-conflicts.o \
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 085366cdf68..612f62060bc 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -40,262 +40,13 @@
>>  
>>  using namespace rtl_ssa;
> 
> I think we should drop this, since the public interface and remaining
> backend code in this file is independent of RTL-SSA.  I think you should
> also drop the inlcude of "rtl-ssa.h" from this file.   These two
> changes will force you to get the header file (pair-fusion.h) right.
> 
> With these changes we can also significantly thin out the include list
> in this file.  The current set of includes is:
> 
> #define INCLUDE_ALGORITHM
> #define INCLUDE_FUNCTIONAL
> #define INCLUDE_LIST
> #define INCLUDE_TYPE_TRAITS
> #include "config.h"
> #include "system.h"
> #include "coretypes.h"
> #include "backend.h"
> #include "rtl.h"
> #include "df.h"
> #include "rtl-iter.h"
> #include "rtl-ssa.h"
> #include "cfgcleanup.h"
> #include "tree-pass.h"
> #include "ordered-hash-map.h"
> #include "tree-dfa.h"
> #include "fold-const.h"
> #include "tree-hash-traits.h"
> #include "print-tree.h"
> #include "insn-attr.h"
> 
> I think instead the following should be enough for this file:
> 
> #include "config.h"
> #include "system.

Re: [Patch, aarch64, middle-end] Move pair_fusion pass from aarch64 to middle-end

2024-05-21 Thread Ajit Agarwal
Hello Alex:

On 21/05/24 6:50 pm, Alex Coplan wrote:
> On 20/05/2024 21:50, Ajit Agarwal wrote:
>> Hello Alex/Richard:
>>
>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>> to support multiple targets.
>>
>> Common infrastructure of load store pair fusion is divided into
>> target independent and target dependent code.
>>
>> Target independent code is structured in the following files.
>> gcc/pair-fusion.h
>> gcc/pair-fusion.cc
>>
>> Target independent code is the Generic code with pure virtual
>> function to interface between target independent and dependent
>> code.
>>
>> Bootstrapped and regtested on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>> aarch64, middle-end: Move pair_fusion pass from aarch64 to middle-end
>>
>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>> to support multiple targets.
>>
>> Common infrastructure of load store pair fusion is divided into
>> target independent and target dependent code.
>>
>> Target independent code is structured in the following files.
>> gcc/pair-fusion.h
>> gcc/pair-fusion.cc
>>
>> Target independent code is the Generic code with pure virtual
>> function to interface between target independent and dependent
>> code.
>>
>> 2024-05-20  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * pair-fusion.h: Generic header code for load store fusion
>>  that can be shared across different architectures.
>>  * pair-fusion.cc: Generic source code implementation for
>>  load store fusion that can be shared across different architectures.
>>  * Makefile.in: Add new executable pair-fusion.o
>>  * config/aarch64/aarch64-ldp-fusion.cc: Target specific
>>  code for load store fusion of aarch64.
> 
> Apologies for missing this in the last review but you'll also need to
> update gcc/config/aarch64/t-aarch64 to add a dependency on pair-fusion.h
> for aarch64-ldp-fusion.o.
> 

Addressed in v2 of the patch.
> Thanks,
> Alex

Thanks & Regards
Ajit
> 
>> ---
>>  gcc/Makefile.in  |1 +
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 3303 +-
>>  gcc/pair-fusion.cc   | 2852 +++
>>  gcc/pair-fusion.h|  340 +++
>>  4 files changed, 3268 insertions(+), 3228 deletions(-)
>>  create mode 100644 gcc/pair-fusion.cc
>>  create mode 100644 gcc/pair-fusion.h
> 


Re: [Patch, aarch64, middle-end] Move pair_fusion pass from aarch64 to middle-end

2024-05-21 Thread Ajit Agarwal
Hello Alex:

On 21/05/24 6:02 pm, Alex Coplan wrote:
> On 21/05/2024 16:02, Ajit Agarwal wrote:
>> Hello Alex:
>>
>> On 21/05/24 1:16 am, Alex Coplan wrote:
>>> On 20/05/2024 18:44, Alex Coplan wrote:
>>>> Hi Ajit,
>>>>
>>>> On 20/05/2024 21:50, Ajit Agarwal wrote:
>>>>> Hello Alex/Richard:
>>>>>
>>>>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>>>>> to support multiple targets.
>>>>>
>>>>> Common infrastructure of load store pair fusion is divided into
>>>>> target independent and target dependent code.
>>>>>
>>>>> Target independent code is structured in the following files.
>>>>> gcc/pair-fusion.h
>>>>> gcc/pair-fusion.cc
>>>>>
>>>>> Target independent code is the Generic code with pure virtual
>>>>> function to interface between target independent and dependent
>>>>> code.
>>>>>
>>>>> Bootstrapped and regtested on aarch64-linux-gnu.
>>>>>
>>>>> Thanks & Regards
>>>>> Ajit
>>>>>
>>>>> aarch64, middle-end: Move pair_fusion pass from aarch64 to middle-end
>>>>>
>>>>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>>>>> to support multiple targets.
>>>>>
>>>>> Common infrastructure of load store pair fusion is divided into
>>>>> target independent and target dependent code.
>>>>>
>>>>> Target independent code is structured in the following files.
>>>>> gcc/pair-fusion.h
>>>>> gcc/pair-fusion.cc
>>>>>
>>>>> Target independent code is the Generic code with pure virtual
>>>>> function to interface between target independent and dependent
>>>>> code.
>>>>>
>>>>> 2024-05-20  Ajit Kumar Agarwal  
>>>>>
>>>>> gcc/ChangeLog:
>>>>>
>>>>>   * pair-fusion.h: Generic header code for load store fusion
>>>>
>>>> Insert "pair" before fusion?
>>
>> Addressed in v1 of the patch.
>>>>
>>>>>   that can be shared across different architectures.
>>>>>   * pair-fusion.cc: Generic source code implementation for
>>>>>   load store fusion that can be shared across different architectures.
>>>>
>>>> Likewise.
>> Addressed in v1 of the patch.
>>>>
>>>>>   * Makefile.in: Add new executable pair-fusion.o
>>>>
>>>> It's not an executable but an object file.
>>>>
>>>>>   * config/aarch64/aarch64-ldp-fusion.cc: Target specific
>>>>>   code for load store fusion of aarch64.
>>>>
>>>> I guess this should say something like: "Delete generic code and move it
>>>> to pair-fusion.cc in the middle-end."
>>>>
>>>> I've left some comments below on the header file.  The rest of the patch
>>>> looks pretty good to me.  I tried diffing the original contents of
>>>> aarch64-ldp-fusion.cc with pair-fusion.cc, and that looks as expected.
>>>>
>>>
>>> 
>>>
>>>>> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
>>>>> new file mode 100644
>>>>> index 000..00f6d3e149a
>>>>> --- /dev/null
>>>>> +++ b/gcc/pair-fusion.h
>>>>> @@ -0,0 +1,340 @@
>>>>> +// Pair Mem fusion generic header file.
>>>>> +// Copyright (C) 2024 Free Software Foundation, Inc.
>>>>> +//
>>>>> +// This file is part of GCC.
>>>>> +//
>>>>> +// GCC is free software; you can redistribute it and/or modify it
>>>>> +// under the terms of the GNU General Public License as published by
>>>>> +// the Free Software Foundation; either version 3, or (at your option)
>>>>> +// any later version.
>>>>> +//
>>>>> +// GCC is distributed in the hope that it will be useful, but
>>>>> +// WITHOUT ANY WARRANTY; without even the implied warranty of
>>>>> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>>>> +// General Public License for more details.
>>>>> +//
>>>>> +// You should have received a copy of the GNU General Public License
>>>>> +// along 

Re: [Patch, aarch64, middle-end] Move pair_fusion pass from aarch64 to middle-end

2024-05-21 Thread Ajit Agarwal
Hello Alex:

On 21/05/24 1:16 am, Alex Coplan wrote:
> On 20/05/2024 18:44, Alex Coplan wrote:
>> Hi Ajit,
>>
>> On 20/05/2024 21:50, Ajit Agarwal wrote:
>>> Hello Alex/Richard:
>>>
>>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>>> to support multiple targets.
>>>
>>> Common infrastructure of load store pair fusion is divided into
>>> target independent and target dependent code.
>>>
>>> Target independent code is structured in the following files.
>>> gcc/pair-fusion.h
>>> gcc/pair-fusion.cc
>>>
>>> Target independent code is the Generic code with pure virtual
>>> function to interface between target independent and dependent
>>> code.
>>>
>>> Bootstrapped and regtested on aarch64-linux-gnu.
>>>
>>> Thanks & Regards
>>> Ajit
>>>
>>> aarch64, middle-end: Move pair_fusion pass from aarch64 to middle-end
>>>
>>> Move pair fusion pass from aarch64-ldp-fusion.cc to middle-end
>>> to support multiple targets.
>>>
>>> Common infrastructure of load store pair fusion is divided into
>>> target independent and target dependent code.
>>>
>>> Target independent code is structured in the following files.
>>> gcc/pair-fusion.h
>>> gcc/pair-fusion.cc
>>>
>>> Target independent code is the Generic code with pure virtual
>>> function to interface between target independent and dependent
>>> code.
>>>
>>> 2024-05-20  Ajit Kumar Agarwal  
>>>
>>> gcc/ChangeLog:
>>>
>>> * pair-fusion.h: Generic header code for load store fusion
>>
>> Insert "pair" before fusion?

Addressed in v1 of the patch.
>>
>>> that can be shared across different architectures.
>>> * pair-fusion.cc: Generic source code implementation for
>>> load store fusion that can be shared across different architectures.
>>
>> Likewise.
Addressed in v1 of the patch.
>>
>>> * Makefile.in: Add new executable pair-fusion.o
>>
>> It's not an executable but an object file.
>>
>>> * config/aarch64/aarch64-ldp-fusion.cc: Target specific
>>> code for load store fusion of aarch64.
>>
>> I guess this should say something like: "Delete generic code and move it
>> to pair-fusion.cc in the middle-end."
>>
>> I've left some comments below on the header file.  The rest of the patch
>> looks pretty good to me.  I tried diffing the original contents of
>> aarch64-ldp-fusion.cc with pair-fusion.cc, and that looks as expected.
>>
> 
> 
> 
>>> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
>>> new file mode 100644
>>> index 000..00f6d3e149a
>>> --- /dev/null
>>> +++ b/gcc/pair-fusion.h
>>> @@ -0,0 +1,340 @@
>>> +// Pair Mem fusion generic header file.
>>> +// Copyright (C) 2024 Free Software Foundation, Inc.
>>> +//
>>> +// This file is part of GCC.
>>> +//
>>> +// GCC is free software; you can redistribute it and/or modify it
>>> +// under the terms of the GNU General Public License as published by
>>> +// the Free Software Foundation; either version 3, or (at your option)
>>> +// any later version.
>>> +//
>>> +// GCC is distributed in the hope that it will be useful, but
>>> +// WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +// General Public License for more details.
>>> +//
>>> +// You should have received a copy of the GNU General Public License
>>> +// along with GCC; see the file COPYING3.  If not see
>>> +// <http://www.gnu.org/licenses/>.
>>> +
>>> +#define INCLUDE_ALGORITHM
>>> +#define INCLUDE_FUNCTIONAL
>>> +#define INCLUDE_LIST
>>> +#define INCLUDE_TYPE_TRAITS
>>> +#include "config.h"
>>> +#include "system.h"
>>> +#include "coretypes.h"
>>> +#include "backend.h"
>>> +#include "rtl.h"
>>> +#include "df.h"
>>> +#include "rtl-iter.h"
>>> +#include "rtl-ssa.h"
>>
>> I'm not sure how desirable this is, but you might be able to
>> forward-declare RTL-SSA types like this:
>>
>> class def_info;
>> class insn_info;
>> class insn_range_info;
>>
>> thus removing the need to includ

[Patch, aarch64] Further renaming of generic code

2024-05-20 Thread Ajit Agarwal
Hello Alex/Richard:

Renaming of generic code is done to make target independent
and target dependent code to support multiple targets.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped and regtested on aarch64-linux-gnu.

Thanks & Regards
Ajit

aarch64: Further renaming of generic code

Renaming of generic code is done to make target independent
and target dependent code to support multiple targets.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-20  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Renaming of generic code
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 55 
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 6b2a44f101b..6924e48fe7e 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -368,7 +368,7 @@ struct aarch64_pair_fusion : public pair_fusion
 };
 
 // State used by the pass for a given basic block.
-struct ldp_bb_info
+struct pair_fusion_bb_info
 {
   using def_hash = nofree_ptr_hash;
   using expr_key_t = pair_hash>;
@@ -389,14 +389,14 @@ struct ldp_bb_info
 
   static const size_t obstack_alignment = sizeof (void *);
 
-  ldp_bb_info (bb_info *bb, pair_fusion *d)
+  pair_fusion_bb_info (bb_info *bb, pair_fusion *d)
 : m_bb (bb), m_pass (d), m_emitted_tombstone (false)
   {
 obstack_specify_allocation (_obstack, OBSTACK_CHUNK_SIZE,
obstack_alignment, obstack_chunk_alloc,
obstack_chunk_free);
   }
-  ~ldp_bb_info ()
+  ~pair_fusion_bb_info ()
   {
 obstack_free (_obstack, nullptr);
 
@@ -484,7 +484,7 @@ aarch64_pair_fusion::gen_pair (rtx *pats, rtx writeback, 
bool load_p)
 }
 
 splay_tree_node *
-ldp_bb_info::node_alloc (access_record *access)
+pair_fusion_bb_info::node_alloc (access_record *access)
 {
   using T = splay_tree_node;
   void *addr = obstack_alloc (_obstack, sizeof (T));
@@ -532,7 +532,7 @@ drop_writeback (rtx mem)
 // RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
 // MEM so that we know the mode of the access.
 static rtx
-ldp_strip_offset (rtx mem, poly_int64 *offset)
+pair_mem_strip_offset (rtx mem, poly_int64 *offset)
 {
   rtx addr = XEXP (mem, 0);
 
@@ -658,7 +658,8 @@ access_group::track (Alloc alloc_node, poly_int64 offset, 
insn_info *insn)
 // MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
 // LFS is used as part of the key to the hash table, see track_access.
 bool
-ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
+pair_fusion_bb_info::track_via_mem_expr (insn_info *insn, rtx mem,
+lfs_fields lfs)
 {
   if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
 return false;
@@ -706,7 +707,7 @@ ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, 
lfs_fields lfs)
 // this basic block.  LOAD_P is true if the access is a load, and MEM
 // is the mem rtx that occurs in INSN.
 void
-ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
+pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
 {
   // We can't combine volatile MEMs, so punt on these.
   if (MEM_VOLATILE_P (mem))
@@ -739,7 +740,7 @@ ldp_bb_info::track_access (insn_info *insn, bool load_p, 
rtx mem)
   poly_int64 mem_off;
   rtx addr = XEXP (mem, 0);
   const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
-  rtx base = ldp_strip_offset (mem, _off);
+  rtx base = pair_mem_strip_offset (mem, _off);
   if (!REG_P (base))
 return;
 
@@ -1099,7 +1100,7 @@ def_upwards_move_range (def_info *def)
 // Class that implements a state machine for building the changes needed to 
form
 // a store pair instruction.  This allows us to easily build the changes in
 // program order, as required by rtl-ssa.
-struct stp_change_builder
+struct store_change_builder
 {
   enum class state
   {
@@ -1126,7 +1127,7 @@ struct stp_change_builder
 
   bool done () const { return m_state == state::DONE; }
 
-  stp_change_builder (insn_info *insns[2],
+  store_change_builder (insn_info *insns[2],
  insn_info *repurpose,
  insn_info *dest)
 : m_state (state::FIRST), m_insns { insns[0], insns[1] },
@@ -1402,7 +1403,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
   const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
 
   poly_int64 offset;
- 

Re: [Patch, aarch64] v6: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-18 Thread Ajit Agarwal
Hello Alex:

On 16/05/24 10:21 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> Thanks a lot for working through the review feedback.
> 
> The patch LGTM with the two minor suggested changes below.  I can't
> approve the patch, though, so you'll need an OK from Richard S.
> 
> Also, I'm not sure if it makes sense to apply the patch in isolation, it
> might make more sense to only apply it in series with follow-up patches to:
>  - Finish renaming any bits of the generic code that need renaming (I
>guess we'll want to rename at least ldp_bb_info to something else,
>probably there are other bits too).
>  - Move the generic parts out of gcc/config/aarch64 to a .cc file in the
>middle-end.
> 
> I'll let Richard S make the final judgement on that.  I don't really
> mind either way.
> 
> On 15/05/2024 15:06, Ajit Agarwal wrote:
>> Hello Alex/Richard:
>>
>> All review comments are addressed.
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> Bootstrapped and regtested on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>> aarch64: Preparatory patch to place target independent and
>> dependent changed code in one file
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> 2024-05-15  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-ldp-fusion.cc: Place target
>>  independent and dependent changed code.
>> ---
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 533 +++
>>  1 file changed, 357 insertions(+), 176 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 1d9caeab05d..429e532ea3b 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -138,6 +138,225 @@ struct alt_base
>>poly_int64 offset;
>>  };
>>  
>> +// Virtual base class for load/store walkers used in alias analysis.
>> +struct alias_walker
>> +{
>> +  virtual bool conflict_p (int ) const = 0;
>> +  virtual insn_info *insn () const = 0;
>> +  virtual bool valid () const = 0;
>> +  virtual void advance () = 0;
>> +};
>> +
>> +// When querying handle_writeback_opportunities, this enum is used to
>> +// qualify which opportunities we are asking about.
>> +enum class writeback {
>> +  // Only those writeback opportunities that arise from existing
>> +  // auto-increment accesses.
>> +  EXISTING,
> 
> Very minor nit: I think an extra blank line here would be nice for readability
> now that the enumerators have comments above.
> 
>> +  // All writeback opportunities including those that involve folding
>> +  // base register updates into a non-writeback pair.
>> +  ALL
>> +};
>> +
> 
> Can we have a block comment here which describes the purpose of the
> class and how it fits together with the target?  Something like the
> following would do:
> 
> // This class can be overriden by targets to give a pass that fuses
> // adjacent loads and stores into load/store pair instructions.
> //
> // The target can override the various virtual functions to customize
> // the behaviour of the pass as appropriate for the target.
> 

Addressed in v7 of the patch.
>> +struct pair_fusion {
>> +  pair_fusion ()
>> +  {
>> +calculate_dominance_info (CDI_DOMINATORS);
>> +df_analyze ();
>> +crtl->ssa = new rtl_ssa::function_info (cfun);
>> +  };
>> +
>> +  // Given:
>> +  // - an rtx REG_OP, the non-memory operand in a load/store insn,
>> +  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
>> +  // - a boolean LOAD_P (true iff the insn is a load), then:
>> +  // return true if the access should be considered an FP/SIMD access.
>> +  // Such accesses are segregat

Re: [Patch, aarch64] v6: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-18 Thread Ajit Agarwal
Hello Richard:

On 17/05/24 11:07 pm, Richard Sandiford wrote:
> Ajit Agarwal  writes:
>> Hello Alex/Richard:
>>
>> All review comments are addressed.
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> Bootstrapped and regtested on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
> 
> Thanks for the patch and thanks to Alex for the reviews.  The patch
> looks good to me apart from the minor nits below and the comments that
> Alex had.  Please post the updated patch for a final ok though.
> 
>> aarch64: Preparatory patch to place target independent and
>> dependent changed code in one file
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> 2024-05-15  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-ldp-fusion.cc: Place target
>>  independent and dependent changed code.
> 
> Not sure this is a complete sentence.  Maybe:
> 
>   * config/aarch64/aarch64-ldp-fusion.cc: Factor out a
>   target-independent interface and move it to the head of the file.
> 
> That technically isn't detailed enough for a changelog entry,
> but IMO we should use it anyway.  It's pointless to write the usual
> amount of detail when the code is going to move soon.
> 

Addressed in v7 of the patch.
>> ---
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 533 +++
>>  1 file changed, 357 insertions(+), 176 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 1d9caeab05d..429e532ea3b 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -138,6 +138,225 @@ struct alt_base
>>poly_int64 offset;
>>  };
>>  
>> +// Virtual base class for load/store walkers used in alias analysis.
>> +struct alias_walker
>> +{
>> +  virtual bool conflict_p (int ) const = 0;
>> +  virtual insn_info *insn () const = 0;
>> +  virtual bool valid () const = 0;
>> +  virtual void advance () = 0;
>> +};
>> +
>> +// When querying handle_writeback_opportunities, this enum is used to
>> +// qualify which opportunities we are asking about.
>> +enum class writeback {
>> +  // Only those writeback opportunities that arise from existing
>> +  // auto-increment accesses.
>> +  EXISTING,
>> +  // All writeback opportunities including those that involve folding
> 
> There should be a comma after "opportunities"
> 
>> +  // base register updates into a non-writeback pair.
>> +  ALL
>> +};
>> +
>> +struct pair_fusion {
>> +  pair_fusion ()
>> +  {
>> +calculate_dominance_info (CDI_DOMINATORS);
>> +df_analyze ();
>> +crtl->ssa = new rtl_ssa::function_info (cfun);
>> +  };
> 
> Unnecessary trailing ";".  I think it'd be better to define this and
> the destructor out-of-line though.  For one thing, it'll reduce the number
> of header file dependencies, once the code is moved to its own header file.
> 

Addressed in v7 of the patch.
>> +
>> +  // Given:
>> +  // - an rtx REG_OP, the non-memory operand in a load/store insn,
>> +  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
>> +  // - a boolean LOAD_P (true iff the insn is a load), then:
>> +  // return true if the access should be considered an FP/SIMD access.
>> +  // Such accesses are segregated from GPR accesses, since we only want
>> +  // to form pairs for accesses that use the same register file.
>> +  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
>> +  {
>> +return false;
>> +  }
>> +
>> +  // Return true if we should consider forming pairs from memory
>> +  // accesses with operand mode MODE at this stage in compilation.
>> +  virtual b

[Patch, aarch64] v7: Preparatory patch to place target independent and dependent changed code in one file

2024-05-18 Thread Ajit Agarwal
Hello Alex/Richard:

All comments are addressed.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped and regtested on aarch64-linux-gnu.

Thanks & Regards
Ajit


aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-18  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Factor out a
target-independent interface and move it to the head of the file.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 555 +++
 1 file changed, 373 insertions(+), 182 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 1d9caeab05d..e4e55b84f8b 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,235 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+// When querying should_handle_writeback, this enum is used to
+// qualify which opportunities we are asking about.
+enum class writeback {
+  // Only those writeback opportunities that arise from existing
+  // auto-increment accesses.
+  EXISTING,
+
+  // All writeback opportunities including those that involve folding
+  // base register updates into a non-writeback pair.
+  ALL
+};
+
+// This class can be overriden by targets to give a pass that fuses
+// adjacent loads and stores into load/store pair instructions.
+//
+// The target can override the various virtual functions to customize
+// the behaviour of the pass as appropriate for the target.
+struct pair_fusion {
+  pair_fusion ();
+
+  // Given:
+  // - an rtx REG_OP, the non-memory operand in a load/store insn,
+  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
+  // - a boolean LOAD_P (true iff the insn is a load), then:
+  // return true if the access should be considered an FP/SIMD access.
+  // Such accesses are segregated from GPR accesses, since we only want
+  // to form pairs for accesses that use the same register file.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+
+  // Return true if we should consider forming pairs from memory
+  // accesses with operand mode MODE at this stage in compilation.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+
+  // Return true iff REG_OP is a suitable register operand for a paired
+  // memory access, where LOAD_P is true if we're asking about loads and
+  // false for stores.  MODE gives the mode of the operand.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mode) = 0;
+
+  // Return alias check limit.
+  // This is needed to avoid unbounded quadratic behaviour when
+  // performing alias analysis.
+  virtual int pair_mem_alias_check_limit () = 0;
+
+  // Return true if we should try to handle writeback opportunities.
+  // WHICH determines the kinds of writeback opportunities the caller
+  // is asking about.
+  virtual bool should_handle_writeback (enum writeback which) = 0;
+
+  // Given BASE_MEM, the mem from the lower candidate access for a pair,
+  // and LOAD_P (true if the access is a load), check if we should proceed
+  // to form the pair given the target's code generation policy on
+  // paired accesses.
+  virtual bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) = 0;
+
+  // Generate the pattern for a paired access.  PATS gives the patterns
+  // for the individual memory accesses (which by this point must share a
+  // common base register).  If WRITEBACK is non-NULL, then this rtx
+  // describes the update to the base register that should be performed by
+  // the resulting insn.  LOAD_P is true iff the accesses are loads.
+  virtual rtx gen_pair (rtx *pats, rtx writeback, bool load_p) = 0;
+
+  // Return true if INSN is a paired memory access.  If so, set LOAD_P to
+  // true iff INSN is a load pair.
+  virtual bool pair_mem_insn_p (rtx_insn *insn, bool _p) = 0;
+
+  // Return true if we should track loads.
+  virtual bool 

Re: [Patch, aarch64] v6: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-17 Thread Ajit Agarwal
Hello Alex:

On 17/05/24 6:22 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> On 17/05/2024 18:05, Ajit Agarwal wrote:
>> Hello Alex:
>>
>> On 16/05/24 10:21 pm, Alex Coplan wrote:
>>> Hi Ajit,
>>>
>>> Thanks a lot for working through the review feedback.
>>>
>>
>> Thanks a lot for reviewing the code and approving the patch.
> 
> To be clear, I didn't approve the patch because I can't, I just said
> that it looks good to me.  You need an AArch64 maintainer (probably
> Richard S) to approve it.
> 

That's what I meant. Sorry for the confusion.
>>
>>> The patch LGTM with the two minor suggested changes below.  I can't
>>> approve the patch, though, so you'll need an OK from Richard S.
>>>
>>> Also, I'm not sure if it makes sense to apply the patch in isolation, it
>>> might make more sense to only apply it in series with follow-up patches to:
>>>  - Finish renaming any bits of the generic code that need renaming (I
>>>guess we'll want to rename at least ldp_bb_info to something else,
>>>probably there are other bits too).
>>>  - Move the generic parts out of gcc/config/aarch64 to a .cc file in the
>>>middle-end.
>>>
>>
>> Addressed in separate patch sent.
> 
> Hmm, that doesn't look right.  You sent a single patch here:
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/652028.html
> which looks to squash the work you've done in this patch together with
> the move.
> 
> What I expect to see is a patch series, as follows:
> 
> [PATCH 1/3] aarch64: Split generic code from aarch64 code in ldp fusion
> [PATCH 2/3] aarch64: Further renaming of generic code
> [PATCH 3/3] aarch64, middle-end: Move pair_fusion pass from aarch64 to 
> middle-end
> 
> where 1/3 is exactly the patch that I reviewed above with the two
> (minor) requested changes (plus any changes requested by Richard), 2/3
> (optionally) does further renaming to use generic terminology in the
> generic code where needed/desired, and 3/3 does a straight cut/paste
> move of code into pair-fusion.h and pair-fusion.cc, with no other
> changes (save for perhaps a Makefile change and adding an include in
> aarch64-ldp-fusion.cc).
> 
> Arguably you could split this even further and do the move of the
> pair_fusion class to the new header in a separate patch prior to the
> final move.
> 
> N.B. (IMO) the patches should be presented like this both for review and
> (if approved) when committing.
> 
> Richard S may have further suggestions on how to split the patches /
> make them more tractable to review, I think this is the bare minimum
> that is needed though.
> 

Sure, I will make patches as per above.

> Hope that makes sense.
> 
> Thanks,
> Alex
>

Thanks & Regards
Ajit
 
>>  
>>> I'll let Richard S make the final judgement on that.  I don't really
>>> mind either way.
>>
>> Sure.
>>
>> Thanks & Regards
>> Ajit
>>>
>>> On 15/05/2024 15:06, Ajit Agarwal wrote:
>>>> Hello Alex/Richard:
>>>>
>>>> All review comments are addressed.
>>>>
>>>> Common infrastructure of load store pair fusion is divided into target
>>>> independent and target dependent changed code.
>>>>
>>>> Target independent code is the Generic code with pure virtual function
>>>> to interface between target independent and dependent code.
>>>>
>>>> Target dependent code is the implementation of pure virtual function for
>>>> aarch64 target and the call to target independent code.
>>>>
>>>> Bootstrapped and regtested on aarch64-linux-gnu.
>>>>
>>>> Thanks & Regards
>>>> Ajit
>>>>
>>>> aarch64: Preparatory patch to place target independent and
>>>> dependent changed code in one file
>>>>
>>>> Common infrastructure of load store pair fusion is divided into target
>>>> independent and target dependent changed code.
>>>>
>>>> Target independent code is the Generic code with pure virtual function
>>>> to interface between target independent and dependent code.
>>>>
>>>> Target dependent code is the implementation of pure virtual function for
>>>> aarch64 target and the call to target independent code.
>>>>
>>>> 2024-05-15  Ajit Kumar Agarwal  
>>>>
>>>> gcc/ChangeLog:
>>>>
>>>>* config/aarch64/aarch64-ldp-fusion.cc: Place target
>>>>independent and d

Re: [Patch, aarch64] v6: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-17 Thread Ajit Agarwal
Hello Alex:

On 16/05/24 10:21 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> Thanks a lot for working through the review feedback.
> 

Thanks a lot for reviewing the code and approving the patch.

> The patch LGTM with the two minor suggested changes below.  I can't
> approve the patch, though, so you'll need an OK from Richard S.
> 
> Also, I'm not sure if it makes sense to apply the patch in isolation, it
> might make more sense to only apply it in series with follow-up patches to:
>  - Finish renaming any bits of the generic code that need renaming (I
>guess we'll want to rename at least ldp_bb_info to something else,
>probably there are other bits too).
>  - Move the generic parts out of gcc/config/aarch64 to a .cc file in the
>middle-end.
>

Addressed in separate patch sent.
 
> I'll let Richard S make the final judgement on that.  I don't really
> mind either way.

Sure.

Thanks & Regards
Ajit
> 
> On 15/05/2024 15:06, Ajit Agarwal wrote:
>> Hello Alex/Richard:
>>
>> All review comments are addressed.
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> Bootstrapped and regtested on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>> aarch64: Preparatory patch to place target independent and
>> dependent changed code in one file
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> 2024-05-15  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-ldp-fusion.cc: Place target
>>  independent and dependent changed code.
>> ---
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 533 +++
>>  1 file changed, 357 insertions(+), 176 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 1d9caeab05d..429e532ea3b 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -138,6 +138,225 @@ struct alt_base
>>poly_int64 offset;
>>  };
>>  
>> +// Virtual base class for load/store walkers used in alias analysis.
>> +struct alias_walker
>> +{
>> +  virtual bool conflict_p (int ) const = 0;
>> +  virtual insn_info *insn () const = 0;
>> +  virtual bool valid () const = 0;
>> +  virtual void advance () = 0;
>> +};
>> +
>> +// When querying handle_writeback_opportunities, this enum is used to
>> +// qualify which opportunities we are asking about.
>> +enum class writeback {
>> +  // Only those writeback opportunities that arise from existing
>> +  // auto-increment accesses.
>> +  EXISTING,
> 
> Very minor nit: I think an extra blank line here would be nice for readability
> now that the enumerators have comments above.
> 
>> +  // All writeback opportunities including those that involve folding
>> +  // base register updates into a non-writeback pair.
>> +  ALL
>> +};
>> +
> 
> Can we have a block comment here which describes the purpose of the
> class and how it fits together with the target?  Something like the
> following would do:
> 
> // This class can be overridden by targets to give a pass that fuses
> // adjacent loads and stores into load/store pair instructions.
> //
> // The target can override the various virtual functions to customize
> // the behaviour of the pass as appropriate for the target.
> 
>> +struct pair_fusion {
>> +  pair_fusion ()
>> +  {
>> +calculate_dominance_info (CDI_DOMINATORS);
>> +df_analyze ();
>> +crtl->ssa = new rtl_ssa::function_info (cfun);
>> +  };
>> +
>> +  // Given:
>> +  // - an rtx REG_OP, the non-memory operand in a load/store insn,
>> +  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
>> +  // - a boolean LOAD_P (true iff the insn is a load), then:
>>

Re: [Patch, aarch64] v4: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-15 Thread Ajit Agarwal
Hello Alex:

On 14/05/24 11:53 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> Please can you pay careful attention to the review comments?
> 
> In particular, you have ignored my comment about changing the access of
> member functions in ldp_bb_info several times now (on at least three
> patch reviews).
> 
> Likewise on multiple occasions you've only partially implemented a piece
> of review feedback (e.g. applying the "override" keyword to virtual
> overrides).
> 
> That all makes it rather tiresome to review your patches.
> 
> Also, I realise I should have mentioned this on a previous revision of
> this patch, but I thought we previously agreed (with Richard S) to split
> out the renaming in existing code (e.g. ldp/stp -> "paired access" and
> so on) to a separate patch?  That would make this easier to review.
> 

Sorry for the inconvenience caused. Hopefully I have incorporated
all the comments in v6 version of the patch.

> On 14/05/2024 15:08, Ajit Agarwal wrote:
>> Hello Alex/Richard:
>>
>> All comments are addressed.
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> Bootstrapped on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>>
>> aarch64: Preparatory patch to place target independent and
>> dependent changed code in one file
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> 2024-05-14  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-ldp-fusion.cc: Place target
>>  independent and dependent changed code.
>> ---
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 526 +++
>>  1 file changed, 345 insertions(+), 181 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 1d9caeab05d..e6af4b0570a 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -138,6 +138,210 @@ struct alt_base
>>poly_int64 offset;
>>  };
>>  
>> +// Virtual base class for load/store walkers used in alias analysis.
>> +struct alias_walker
>> +{
>> +  virtual bool conflict_p (int ) const = 0;
>> +  virtual insn_info *insn () const = 0;
>> +  virtual bool valid () const = 0;
>> +  virtual void advance () = 0;
>> +};
>> +
>> +// This is used in handle_writeback_opportunities describing
>> +// ALL if aarch64_ldp_writeback > 1 otherwise check
>> +// EXISTING if aarch64_ldp_writeback.
> 
> Since this enum belongs to the generic interface, it's best if it is
> described in general terms, i.e. the comment shouldn't refer to the
> aarch64 param.
> 
> How about:
> 
> // When querying handle_writeback_opportunities, this enum is used to
> // qualify which opportunities we are asking about.
> 
> then above the EXISTING enumerator, you could say:
> 
>   // Only those writeback opportunities that arise from existing
>   // auto-increment accesses.
> 
> and for ALL, you could say:
> 
>   // All writeback opportunities including those that involve folding
>   // base register updates into a non-writeback pair.
>

Addressed in v6 of the patch.
 
>> +enum class writeback {
>> +  ALL,
>> +  EXISTING
>> +};
> 
> Also, sorry for the very minor nit, but I think it is more logical if we
> flip the order of the enumerators here, i.e. EXISTING should come first.
> 
>> +
>> +struct pair_fusion {
>> +  pair_fusion ()
>> +  {
>> +calculate_dominance_info (CDI_DOMINATORS);
>> +df_analyze ();
>> +crtl->ssa = new rtl_ssa::function_info (cfun);
>> +  };
>> +
>> +  // Given:
>> +  // - an rtx REG_OP, the non-memory operand in a load/store insn,
>> +  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
>> +  /

[Patch, aarch64] v6: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-15 Thread Ajit Agarwal
Hello Alex/Richard:

All review comments are addressed.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped and regtested on aarch64-linux-gnu.

Thanks & Regards
Ajit

aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-15  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 533 +++
 1 file changed, 357 insertions(+), 176 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 1d9caeab05d..429e532ea3b 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,225 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+// When querying handle_writeback_opportunities, this enum is used to
+// qualify which opportunities we are asking about.
+enum class writeback {
+  // Only those writeback opportunities that arise from existing
+  // auto-increment accesses.
+  EXISTING,
+  // All writeback opportunities including those that involve folding
+  // base register updates into a non-writeback pair.
+  ALL
+};
+
+struct pair_fusion {
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+
+  // Given:
+  // - an rtx REG_OP, the non-memory operand in a load/store insn,
+  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
+  // - a boolean LOAD_P (true iff the insn is a load), then:
+  // return true if the access should be considered an FP/SIMD access.
+  // Such accesses are segregated from GPR accesses, since we only want
+  // to form pairs for accesses that use the same register file.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+
+  // Return true if we should consider forming pairs from memory
+  // accesses with operand mode MODE at this stage in compilation.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+
+  // Return true iff REG_OP is a suitable register operand for a paired
+  // memory access, where LOAD_P is true if we're asking about loads and
+  // false for stores.  MODE gives the mode of the operand.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mode) = 0;
+
+  // Return alias check limit.
+  // This is needed to avoid unbounded quadratic behaviour when
+  // performing alias analysis.
+  virtual int pair_mem_alias_check_limit () = 0;
+
+  // Returns true if we should try to handle writeback opportunities.
+  // WHICH determines the kinds of writeback opportunities the caller
+  // is asking about.
+  virtual bool handle_writeback_opportunities (enum writeback which) = 0 ;
+
+  // Given BASE_MEM, the mem from the lower candidate access for a pair,
+  // and LOAD_P (true if the access is a load), check if we should proceed
+  // to form the pair given the target's code generation policy on
+  // paired accesses.
+  virtual bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) = 0;
+
+  // Generate the pattern for a paired access.  PATS gives the patterns
+  // for the individual memory accesses (which by this point must share a
+  // common base register).  If WRITEBACK is non-NULL, then this rtx
+  // describes the update to the base register that should be performed by
+  // the resulting insn.  LOAD_P is true iff the accesses are loads.
+  virtual rtx gen_pair (rtx *pats, rtx writeback, bool load_p) = 0;
+
+  // Return true if INSN is a paired memory access.  If so, set LOAD_P to
+  // true iff INSN is a load pair.
+  virtual bool pair_mem_insn_p (rtx_insn *insn, bool _p) = 0;
+
+  // Return true if we should track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+
+  // Return true if we should track stores.
+  virtual bool track_stores_p ()
+  {
+return 

Re: [Patch, aarch64] v3: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-14 Thread Ajit Agarwal
Hello Alex:

On 13/05/24 8:49 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> Why did you send three mails for this revision of the patch?  If you're
> going to send a new revision of the patch you should increment the
> version number and outline the changes / reasons for the new revision.
> 

There were issues sending the patch through thunderbird, hence multiple
patches. Sorry for the inconvenience caused.

> Mostly the comments below are just style nits and things you missed from
> the last review(s) (please try not to miss so many in the future).
>

Addressed.
 
> On 09/05/2024 17:06, Ajit Agarwal wrote:
>> Hello Alex/Richard:
>>
>> All review comments are addressed.
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> Bootstrapped on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>>
>> aarch64: Preparatory patch to place target independent and
>> dependent changed code in one file
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface between target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> 2024-05-09  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-ldp-fusion.cc: Place target
>>  independent and dependent changed code.
>> ---
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 542 +++
>>  1 file changed, 363 insertions(+), 179 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 1d9caeab05d..217790e111a 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -138,6 +138,224 @@ struct alt_base
>>poly_int64 offset;
>>  };
>>  
>> +// Virtual base class for load/store walkers used in alias analysis.
>> +struct alias_walker
>> +{
>> +  virtual bool conflict_p (int ) const = 0;
>> +  virtual insn_info *insn () const = 0;
>> +  virtual bool valid () const = 0;
>> +  virtual void advance () = 0;
>> +};
>> +
>> +enum class writeback{
> 
> You missed a nit here.  Space before '{'.
> 

Addressed.
>> +  ALL,
>> +  EXISTING
>> +};
> 
> You also missed adding comments for the enum, please see the review for v2:
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651074.html
> 

Addressed.
>> +
>> +struct pair_fusion {
>> +  pair_fusion ()
>> +  {
>> +calculate_dominance_info (CDI_DOMINATORS);
>> +df_analyze ();
>> +crtl->ssa = new rtl_ssa::function_info (cfun);
>> +  };
>> +
>> +  // Given:
>> +  // - an rtx REG_OP, the non-memory operand in a load/store insn,
>> +  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
>> +  // - a boolean LOAD_P (true iff the insn is a load), then:
>> +  // return true if the access should be considered an FP/SIMD access.
>> +  // Such accesses are segregated from GPR accesses, since we only want
>> +  // to form pairs for accesses that use the same register file.
>> +  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
>> +  {
>> +return false;
>> +  }
>> +
>> +  // Return true if we should consider forming ldp/stp insns from memory
>> +  // accesses with operand mode MODE at this stage in compilation.
>> +  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
>> +
>> +  // Return true iff REG_OP is a suitable register operand for a paired
>> +  // memory access, where LOAD_P is true if we're asking about loads and
>> +  // false for stores.  MEM_MODE gives the mode of the operand.
>> +  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
>> +  machine_mode mode) = 0;
> 
> The comment needs updating since we changed the name of the last param,
> i.e. s/MEM_MODE/MODE/.
> 
Addressed.
>> +
>> +  // Return alias check limit.
>> +  /

[Patch, aarch64] v5: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-14 Thread Ajit Agarwal
Hello Alex/Richard:

All review comments are incorporated.

Changes since v4:

 - changed prototype of destructure_pair from rti parameter to pattern 
parameter.


Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped on aarch64-linux-gnu.

Thanks & Regards
Ajit


aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-14  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 526 +++
 1 file changed, 345 insertions(+), 181 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 1d9caeab05d..3551767e29e 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,210 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+// This is used in handle_writeback_opportunities describing
+// ALL if aarch64_ldp_writeback > 1 otherwise check
+// EXISTING if aarch64_ldp_writeback.
+enum class writeback {
+  ALL,
+  EXISTING
+};
+
+struct pair_fusion {
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+
+  // Given:
+  // - an rtx REG_OP, the non-memory operand in a load/store insn,
+  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
+  // - a boolean LOAD_P (true iff the insn is a load), then:
+  // return true if the access should be considered an FP/SIMD access.
+  // Such accesses are segregated from GPR accesses, since we only want
+  // to form pairs for accesses that use the same register file.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+
+  // Return true if we should consider forming ldp/stp insns from memory
+  // accesses with operand mode MODE at this stage in compilation.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+
+  // Return true iff REG_OP is a suitable register operand for a paired
+  // memory access, where LOAD_P is true if we're asking about loads and
+  // false for stores.  MODE gives the mode of the operand.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mode) = 0;
+
+  // Return alias check limit.
+  // This is needed to avoid unbounded quadratic behaviour when
+  // performing alias analysis.
+  virtual int pair_mem_alias_check_limit () = 0;
+
+  // Returns true if we should try to handle writeback opportunities
+  // WHICH parameter decides ALL or EXISTING writeback pairs.
+  virtual bool handle_writeback_opportunities (enum writeback which) = 0 ;
+
+  // Given BASE_MEM, the mem from the lower candidate access for a pair,
+  // and LOAD_P (true if the access is a load), check if we should proceed
+  // to form the pair given the target's code generation policy on
+  // paired accesses.
+  virtual bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) = 0;
+
+  // Generate the pattern for a paired access. PATS gives the patterns
+  // for the individual memory accesses (which by this point must share a
+  // common base register).  If WRITEBACK is non-NULL, then this rtx
+  // describes the update to the base register that should be performed by
+  // the resulting insn.  LOAD_P is true iff the accesses are loads.
+  virtual rtx gen_pair (rtx *pats, rtx writeback, bool load_p) = 0;
+
+  // Return true if memory is paired access, given INSN and LOAD_P
+  // is true for load insn and false for store insn.
+  virtual bool pair_mem_insn_p (rtx_insn *, bool &) = 0;
+
+  // Return true if we should track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+
+  // Return true if we should track stores.
+  virtual bool track_stores_p ()
+  {
+return true;
+  }
+
+  // Return true if OFF  is in range.
+  virtual bool pair_mem_in_range_p (HOST_WIDE_INT off) = 0;
+
+  

[Patch, aarch64] v4: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-14 Thread Ajit Agarwal
Hello Alex/Richard:

All comments are addressed.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped on aarch64-linux-gnu.

Thanks & Regards
Ajit



aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-14  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 526 +++
 1 file changed, 345 insertions(+), 181 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 1d9caeab05d..e6af4b0570a 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,210 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+// This is used in handle_writeback_opportunities describing
+// ALL if aarch64_ldp_writeback > 1 otherwise check
+// EXISTING if aarch64_ldp_writeback.
+enum class writeback {
+  ALL,
+  EXISTING
+};
+
+struct pair_fusion {
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+
+  // Given:
+  // - an rtx REG_OP, the non-memory operand in a load/store insn,
+  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
+  // - a boolean LOAD_P (true iff the insn is a load), then:
+  // return true if the access should be considered an FP/SIMD access.
+  // Such accesses are segregated from GPR accesses, since we only want
+  // to form pairs for accesses that use the same register file.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+
+  // Return true if we should consider forming ldp/stp insns from memory
+  // accesses with operand mode MODE at this stage in compilation.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+
+  // Return true iff REG_OP is a suitable register operand for a paired
+  // memory access, where LOAD_P is true if we're asking about loads and
+  // false for stores.  MODE gives the mode of the operand.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mode) = 0;
+
+  // Return alias check limit.
+  // This is needed to avoid unbounded quadratic behaviour when
+  // performing alias analysis.
+  virtual int pair_mem_alias_check_limit () = 0;
+
+  // Returns true if we should try to handle writeback opportunities
+  // WHICH parameter decides ALL or EXISTING writeback pairs.
+  virtual bool handle_writeback_opportunities (enum writeback which) = 0 ;
+
+  // Given BASE_MEM, the mem from the lower candidate access for a pair,
+  // and LOAD_P (true if the access is a load), check if we should proceed
+  // to form the pair given the target's code generation policy on
+  // paired accesses.
+  virtual bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) = 0;
+
+  // Generate the pattern for a paired access. PATS gives the patterns
+  // for the individual memory accesses (which by this point must share a
+  // common base register).  If WRITEBACK is non-NULL, then this rtx
+  // describes the update to the base register that should be performed by
+  // the resulting insn.  LOAD_P is true iff the accesses are loads.
+  virtual rtx gen_pair (rtx *pats, rtx writeback, bool load_p) = 0;
+
+  // Return true if memory is paired access, given INSN and LOAD_P
+  // is true for load insn and false for store insn.
+  virtual bool pair_mem_insn_p (rtx_insn *, bool &) = 0;
+
+  // Return true if we should track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+
+  // Return true if we should track stores.
+  virtual bool track_stores_p ()
+  {
+return true;
+  }
+
+  // Return true if OFF  is in range.
+  virtual bool pair_mem_in_range_p (HOST_WIDE_INT off) = 0;
+
+  // Given a load/store pair insn in PATTERN, unpack the insn, storing
+  // the register operands in REGS, and 

[Patch, aarch64] v4: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-14 Thread Ajit Agarwal
Hello Alex/Richard:

All comments are addressed.

There were some issues in sending the patch sending it again.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped on aarch64-linux-gnu.

Thanks & Regards
Ajit



aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-14  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 526 +++
 1 file changed, 345 insertions(+), 181 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 1d9caeab05d..e6af4b0570a 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,210 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+// This is used in handle_writeback_opportunities describing
+// ALL if aarch64_ldp_writeback > 1 otherwise check
+// EXISTING if aarch64_ldp_writeback.
+enum class writeback {
+  ALL,
+  EXISTING
+};
+
+struct pair_fusion {
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+
+  // Given:
+  // - an rtx REG_OP, the non-memory operand in a load/store insn,
+  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
+  // - a boolean LOAD_P (true iff the insn is a load), then:
+  // return true if the access should be considered an FP/SIMD access.
+  // Such accesses are segregated from GPR accesses, since we only want
+  // to form pairs for accesses that use the same register file.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+
+  // Return true if we should consider forming ldp/stp insns from memory
+  // accesses with operand mode MODE at this stage in compilation.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+
+  // Return true iff REG_OP is a suitable register operand for a paired
+  // memory access, where LOAD_P is true if we're asking about loads and
+  // false for stores.  MODE gives the mode of the operand.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mode) = 0;
+
+  // Return alias check limit.
+  // This is needed to avoid unbounded quadratic behaviour when
+  // performing alias analysis.
+  virtual int pair_mem_alias_check_limit () = 0;
+
+  // Returns true if we should try to handle writeback opportunities
+  // WHICH parameter decides ALL or EXISTING writeback pairs.
+  virtual bool handle_writeback_opportunities (enum writeback which) = 0 ;
+
+  // Given BASE_MEM, the mem from the lower candidate access for a pair,
+  // and LOAD_P (true if the access is a load), check if we should proceed
+  // to form the pair given the target's code generation policy on
+  // paired accesses.
+  virtual bool pair_mem_ok_with_policy (rtx base_mem, bool load_p) = 0;
+
+  // Generate the pattern for a paired access. PATS gives the patterns
+  // for the individual memory accesses (which by this point must share a
+  // common base register).  If WRITEBACK is non-NULL, then this rtx
+  // describes the update to the base register that should be performed by
+  // the resulting insn.  LOAD_P is true iff the accesses are loads.
+  virtual rtx gen_pair (rtx *pats, rtx writeback, bool load_p) = 0;
+
+  // Return true if memory is paired access, given INSN and LOAD_P
+  // is true for load insn and false for store insn.
+  virtual bool pair_mem_insn_p (rtx_insn *, bool &) = 0;
+
+  // Return true if we should track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+
+  // Return true if we should track stores.
+  virtual bool track_stores_p ()
+  {
+return true;
+  }
+
+  // Return true if OFF  is in range.
+  virtual bool pair_mem_in_range_p (HOST_WIDE_INT off) = 0;
+
+  // Given a load/store pair insn in PATTERN, unpack 

test mail

2024-05-14 Thread Ajit Agarwal




Test mail

2024-05-14 Thread Ajit Agarwal


[Patch, aarch64] v3: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-10 Thread Ajit Agarwal
Hello Alex:

All comments are addressed.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped on aarch64-linux-gnu.

Thanks & Regards
Ajit



aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-09  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 542 +++
 1 file changed, 363 insertions(+), 179 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 1d9caeab05d..217790e111a 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,224 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+enum class writeback{
+  ALL,
+  EXISTING
+};
+
+struct pair_fusion {
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+
+  // Given:
+  // - an rtx REG_OP, the non-memory operand in a load/store insn,
+  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
+  // - a boolean LOAD_P (true iff the insn is a load), then:
+  // return true if the access should be considered an FP/SIMD access.
+  // Such accesses are segregated from GPR accesses, since we only want
+  // to form pairs for accesses that use the same register file.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+
+  // Return true if we should consider forming ldp/stp insns from memory
+  // accesses with operand mode MODE at this stage in compilation.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+
+  // Return true iff REG_OP is a suitable register operand for a paired
+  // memory access, where LOAD_P is true if we're asking about loads and
+  // false for stores.  MEM_MODE gives the mode of the operand.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mode) = 0;
+
+  // Return alias check limit.
+  // This is needed to avoid unbounded quadratic behaviour when
+  // performing alias analysis.
+  virtual int pair_mem_alias_check_limit () = 0;
+
+  // Returns true if we should try to handle writeback opportunities
+  // (not whether there are any).
+  virtual bool handle_writeback_opportunities (enum writeback which) = 0 ;
+
+  // Given BASE_MEM, the mem from the lower candidate access for a pair,
+  // and LOAD_P (true if the access is a load), check if we should proceed
+  // to form the pair given the target's code generation policy on
+  // paired accesses.
+  virtual bool pair_mem_ok_with_policy (rtx first_mem, bool load_p,
+   machine_mode mode) = 0;
+
+  // Generate the pattern for a paired access. PATS gives the patterns
+  // for the individual memory accesses (which by this point must share a
+  // common base register).  If WRITEBACK is non-NULL, then this rtx
+  // describes the update to the base register that should be performed by
+  // the resulting insn.  LOAD_P is true iff the accesses are loads.
+  virtual rtx gen_pair (rtx *pats, rtx writeback,
+   bool load_p) = 0;
+
+  // Return true if memory writeback can be promoted, given
+  // insn and load_p is true for load insn otherwise false.
+  virtual bool pair_mem_insn_p (rtx_insn *, bool &)
+  {
+ return false;
+  }
+
+  // if we _should_ track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+
+  // if we _should_ track stores.
+  virtual bool track_stores_p ()
+  {
+return true;
+  }
+
+  // Return true if offset is in of range.
+  virtual bool pair_mem_in_range_p (HOST_WIDE_INT off) = 0;
+
+  // Given a load/store pair insn in PATTERN, unpack the insn, storing
+  // the register operands in REGS, and returning the mem.
+  virtual rtx destructure_pair (rtx regs[2], rtx rti, bool load_p) = 0;
+

[Patch, aarch64] v3: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-10 Thread Ajit Agarwal
Hello Alex:

All comments are addressed.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped on aarch64-linux-gnu.

Thanks & Regards
Ajit


aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-09  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 542 +++
 1 file changed, 363 insertions(+), 179 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 1d9caeab05d..217790e111a 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,224 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+enum class writeback{
+  ALL,
+  EXISTING
+};
+
+struct pair_fusion {
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+
+  // Given:
+  // - an rtx REG_OP, the non-memory operand in a load/store insn,
+  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
+  // - a boolean LOAD_P (true iff the insn is a load), then:
+  // return true if the access should be considered an FP/SIMD access.
+  // Such accesses are segregated from GPR accesses, since we only want
+  // to form pairs for accesses that use the same register file.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+
+  // Return true if we should consider forming ldp/stp insns from memory
+  // accesses with operand mode MODE at this stage in compilation.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+
+  // Return true iff REG_OP is a suitable register operand for a paired
+  // memory access, where LOAD_P is true if we're asking about loads and
+  // false for stores.  MEM_MODE gives the mode of the operand.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mode) = 0;
+
+  // Return alias check limit.
+  // This is needed to avoid unbounded quadratic behaviour when
+  // performing alias analysis.
+  virtual int pair_mem_alias_check_limit () = 0;
+
+  // Returns true if we should try to handle writeback opportunities
+  // (not whether there are any).
+  virtual bool handle_writeback_opportunities (enum writeback which) = 0 ;
+
+  // Given BASE_MEM, the mem from the lower candidate access for a pair,
+  // and LOAD_P (true if the access is a load), check if we should proceed
+  // to form the pair given the target's code generation policy on
+  // paired accesses.
+  virtual bool pair_mem_ok_with_policy (rtx first_mem, bool load_p,
+   machine_mode mode) = 0;
+
+  // Generate the pattern for a paired access. PATS gives the patterns
+  // for the individual memory accesses (which by this point must share a
+  // common base register).  If WRITEBACK is non-NULL, then this rtx
+  // describes the update to the base register that should be performed by
+  // the resulting insn.  LOAD_P is true iff the accesses are loads.
+  virtual rtx gen_pair (rtx *pats, rtx writeback,
+   bool load_p) = 0;
+
+  // Return true if memory writeback can be promoted, given
+  // insn and load_p is true for load insn otherwise false.
+  virtual bool pair_mem_insn_p (rtx_insn *, bool &)
+  {
+ return false;
+  }
+
+  // if we _should_ track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+
+  // if we _should_ track stores.
+  virtual bool track_stores_p ()
+  {
+return true;
+  }
+
+  // Return true if offset is in of range.
+  virtual bool pair_mem_in_range_p (HOST_WIDE_INT off) = 0;
+
+  // Given a load/store pair insn in PATTERN, unpack the insn, storing
+  // the register operands in REGS, and returning the mem.
+  virtual rtx destructure_pair (rtx regs[2], rtx rti, bool load_p) = 0;
+
+ 

test mail

2024-05-10 Thread Ajit Agarwal


[Patch, aarch64] v3: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-09 Thread Ajit Agarwal
Hello Alex/Richard:

All review comments are addressed.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped on aarch64-linux-gnu.

Thanks & Regards
Ajit



aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-05-09  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 542 +++
 1 file changed, 363 insertions(+), 179 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 1d9caeab05d..217790e111a 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,224 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+enum class writeback{
+  ALL,
+  EXISTING
+};
+
+struct pair_fusion {
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+
+  // Given:
+  // - an rtx REG_OP, the non-memory operand in a load/store insn,
+  // - a machine_mode MEM_MODE, the mode of the MEM in that insn, and
+  // - a boolean LOAD_P (true iff the insn is a load), then:
+  // return true if the access should be considered an FP/SIMD access.
+  // Such accesses are segregated from GPR accesses, since we only want
+  // to form pairs for accesses that use the same register file.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+
+  // Return true if we should consider forming ldp/stp insns from memory
+  // accesses with operand mode MODE at this stage in compilation.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+
+  // Return true iff REG_OP is a suitable register operand for a paired
+  // memory access, where LOAD_P is true if we're asking about loads and
+  // false for stores.  MEM_MODE gives the mode of the operand.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mode) = 0;
+
+  // Return alias check limit.
+  // This is needed to avoid unbounded quadratic behaviour when
+  // performing alias analysis.
+  virtual int pair_mem_alias_check_limit () = 0;
+
+  // Returns true if we should try to handle writeback opportunities
+  // (not whether there are any).
+  virtual bool handle_writeback_opportunities (enum writeback which) = 0 ;
+
+  // Given BASE_MEM, the mem from the lower candidate access for a pair,
+  // and LOAD_P (true if the access is a load), check if we should proceed
+  // to form the pair given the target's code generation policy on
+  // paired accesses.
+  virtual bool pair_mem_ok_with_policy (rtx first_mem, bool load_p,
+   machine_mode mode) = 0;
+
+  // Generate the pattern for a paired access. PATS gives the patterns
+  // for the individual memory accesses (which by this point must share a
+  // common base register).  If WRITEBACK is non-NULL, then this rtx
+  // describes the update to the base register that should be performed by
+  // the resulting insn.  LOAD_P is true iff the accesses are loads.
+  virtual rtx gen_pair (rtx *pats, rtx writeback,
+   bool load_p) = 0;
+
+  // Return true if memory writeback can be promoted, given
+  // insn and load_p is true for load insn otherwise false.
+  virtual bool pair_mem_insn_p (rtx_insn *, bool &)
+  {
+ return false;
+  }
+
+  // if we _should_ track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+
+  // if we _should_ track stores.
+  virtual bool track_stores_p ()
+  {
+return true;
+  }
+
+  // Return true if offset is in of range.
+  virtual bool pair_mem_in_range_p (HOST_WIDE_INT off) = 0;
+
+  // Given a load/store pair insn in PATTERN, unpack the insn, storing
+  // the register operands in REGS, and returning the mem.
+  virtual rtx destructure_pair (rtx regs[2], rtx rti, bool 

test mail

2024-05-09 Thread Ajit Agarwal


Re: [PATCH, aarch64] v2: Preparatory patch to place target independent and,dependent changed code in one file

2024-05-09 Thread Ajit Agarwal
Hello Alex:

On 08/05/24 6:18 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> Sorry for the long delay in reviewing this.
> 
> This is really getting there now.  I've left a few more comments below.
> 
> Apart from minor style things, the main remaining issues are mostly
> around comments.  It's important to have good clear comments for
> functions with the parameters (and return value, if any) clearly
> described.  See https://www.gnu.org/prep/standards/standards.html#Comments
> 
> Note that this now needs a little rebasing, too.
> 

Done.

> On 21/04/2024 13:22, Ajit Agarwal wrote:
>> Hello Alex/Richard:
>>
>> All review comments are addressed and changes are made to transform_for_base
>> function as per consensus.
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface betwwen target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> Bootstrapped on aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>>
>> aarch64: Preparatory patch to place target independent and
>> dependent changed code in one file
>>
>> Common infrastructure of load store pair fusion is divided into target
>> independent and target dependent changed code.
>>
>> Target independent code is the Generic code with pure virtual function
>> to interface betwwen target independent and dependent code.
>>
>> Target dependent code is the implementation of pure virtual function for
>> aarch64 target and the call to target independent code.
>>
>> 2024-04-21  Ajit Kumar Agarwal  
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-ldp-fusion.cc: Place target
>>  independent and dependent changed code
>> ---
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 484 +++
>>  1 file changed, 325 insertions(+), 159 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 365dcf48b22..83a917e1d20 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -138,6 +138,189 @@ struct alt_base
>>poly_int64 offset;
>>  };
>>  
>> +// Virtual base class for load/store walkers used in alias analysis.
>> +struct alias_walker
>> +{
>> +  virtual bool conflict_p (int ) const = 0;
>> +  virtual insn_info *insn () const = 0;
>> +  virtual bool valid () const = 0;
>> +  virtual void advance () = 0;
>> +};
>> +
>> +// Forward declaration to be used inside the aarch64_pair_fusion class.
>> +bool ldp_operand_mode_ok_p (machine_mode mode);
>> +rtx aarch64_destructure_load_pair (rtx regs[2], rtx pattern);
>> +rtx aarch64_destructure_store_pair (rtx regs[2], rtx pattern);
>> +rtx aarch64_gen_writeback_pair (rtx wb_effect, rtx pair_mem, rtx regs[2],
>> +bool load_p);
> 
> I don't think we want to change the linkage of these, they should be kept
> static.
> 
>> +enum class writeback{
> 
> Nit: space before '{'
> 
>> +  WRITEBACK_PAIR_P,
>> +  WRITEBACK
>> +};
> 
> We're going to want some more descriptive names here.  How about
> EXISTING and ALL?  Note that the WRITEBACK_ prefix isn't necessary as
> you're using an enum class, so uses of the enumerators need to be
> prefixed with writeback:: anyway.  A comment describing the usage of the
> enum as well as comments above the enumerators describing their
> interpretation would be good.
> 

Done.
>> +
>> +struct pair_fusion {
>> +
> 
> Nit: excess blank line.
> 
>> +  pair_fusion ()
>> +  {
>> +calculate_dominance_info (CDI_DOMINATORS);
>> +df_analyze ();
>> +crtl->ssa = new rtl_ssa::function_info (cfun);
>> +  };
> 
> Can we have one blank line between the virtual functions, please?  I
> think that would be more readable now that there are comments above each
> of them.
> 

Done.

>> +  // Return true if GPR is FP or SIMD accesses, passed
>> +  // with GPR reg_op rtx, machine mode and load_p.
> 
> It's slightly awkward trying to document this without the parameter
> names, but I can see that you're omitting them to avoid unused parameter
> warnings.  One option would be to introduce names in the comment as you
> go.  How about this instead:
>

Re: [PATCH V4 1/3] aarch64: Place target independent and dependent changed code in one file

2024-04-22 Thread Ajit Agarwal
Hello Alex:

On 14/04/24 10:29 pm, Ajit Agarwal wrote:
> Hello Alex:
> 
> On 12/04/24 11:02 pm, Ajit Agarwal wrote:
>> Hello Alex:
>>
>> On 12/04/24 8:15 pm, Alex Coplan wrote:
>>> On 12/04/2024 20:02, Ajit Agarwal wrote:
>>>> Hello Alex:
>>>>
>>>> On 11/04/24 7:55 pm, Alex Coplan wrote:
>>>>> On 10/04/2024 23:48, Ajit Agarwal wrote:
>>>>>> Hello Alex:
>>>>>>
>>>>>> On 10/04/24 7:52 pm, Alex Coplan wrote:
>>>>>>> Hi Ajit,
>>>>>>>
>>>>>>> On 10/04/2024 15:31, Ajit Agarwal wrote:
>>>>>>>> Hello Alex:
>>>>>>>>
>>>>>>>> On 10/04/24 1:42 pm, Alex Coplan wrote:
>>>>>>>>> Hi Ajit,
>>>>>>>>>
>>>>>>>>> On 09/04/2024 20:59, Ajit Agarwal wrote:
>>>>>>>>>> Hello Alex:
>>>>>>>>>>
>>>>>>>>>> On 09/04/24 8:39 pm, Alex Coplan wrote:
>>>>>>>>>>> On 09/04/2024 20:01, Ajit Agarwal wrote:
>>>>>>>>>>>> Hello Alex:
>>>>>>>>>>>>
>>>>>>>>>>>> On 09/04/24 7:29 pm, Alex Coplan wrote:
>>>>>>>>>>>>> On 09/04/2024 17:30, Ajit Agarwal wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 05/04/24 10:03 pm, Alex Coplan wrote:
>>>>>>>>>>>>>>> On 05/04/2024 13:53, Ajit Agarwal wrote:
>>>>>>>>>>>>>>>> Hello Alex/Richard:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> All review comments are incorporated.
>>>>>>> 
>>>>>>>>>>>>>>>> @@ -2890,8 +3018,8 @@ ldp_bb_info::merge_pairs (insn_list_t 
>>>>>>>>>>>>>>>> _list,
>>>>>>>>>>>>>>>>  // of accesses.  If we find two sets of adjacent accesses, 
>>>>>>>>>>>>>>>> call
>>>>>>>>>>>>>>>>  // merge_pairs.
>>>>>>>>>>>>>>>>  void
>>>>>>>>>>>>>>>> -ldp_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>>>>>>>> -   access_group )
>>>>>>>>>>>>>>>> +pair_fusion_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>>>>>>>> +   access_group )
>>>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>>>const auto lfs = decode_lfs (encoded_lfs);
>>>>>>>>>>>>>>>>const unsigned access_size = lfs.size;
>>>>>>>>>>>>>>>> @@ -2909,7 +3037,7 @@ ldp_bb_info::transform_for_base (int 
>>>>>>>>>>>>>>>> encoded_lfs,
>>>>>>>>>>>>>>>>   access.cand_insns,
>>>>>>>>>>>>>>>>   lfs.load_p,
>>>>>>>>>>>>>>>>   access_size);
>>>>>>>>>>>>>>>> -skip_next = access.cand_insns.empty ();
>>>>>>>>>>>>>>>> +skip_next = bb_state->cand_insns_empty_p 
>>>>>>>>>>>>>>>> (access.cand_insns);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> As above, why is this needed?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> For rs6000 we want to return always true. as load store pair
>>>>>>>>>>>>>> that are to be merged with 8/16 16/32 32/64 is occuring for 
>>>>>>>>>>>>>> rs6000.
>>>>>>>>>>>>>> And we want load store pair to 8/16 32/64. Thats why we want
>>>>>>>>>>>>>> to generate always true for

[PATCH, aarch64] v2: Preparatory patch to place target independent and,dependent changed code in one file

2024-04-21 Thread Ajit Agarwal
Hello Alex/Richard:

All review comments are addressed and changes are made to transform_for_base
function as per consensus.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Bootstrapped on aarch64-linux-gnu.

Thanks & Regards
Ajit



aarch64: Preparatory patch to place target independent and
dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-04-21  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 484 +++
 1 file changed, 325 insertions(+), 159 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 365dcf48b22..83a917e1d20 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,189 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+// Forward declaration to be used inside the aarch64_pair_fusion class.
+bool ldp_operand_mode_ok_p (machine_mode mode);
+rtx aarch64_destructure_load_pair (rtx regs[2], rtx pattern);
+rtx aarch64_destructure_store_pair (rtx regs[2], rtx pattern);
+rtx aarch64_gen_writeback_pair (rtx wb_effect, rtx pair_mem, rtx regs[2],
+   bool load_p);
+enum class writeback{
+  WRITEBACK_PAIR_P,
+  WRITEBACK
+};
+
+struct pair_fusion {
+
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+  // Return true if GPR is FP or SIMD accesses, passed
+  // with GPR reg_op rtx, machine mode and load_p.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+  // Return true if pair operand mode is ok. Passed with
+  // machine mode.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+  // Return true if reg operand is ok, passed with load_p,
+  // reg_op rtx and machine mode.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mem_mode) = 0;
+  // Return alias check limit.
+  virtual int pair_mem_alias_check_limit () = 0;
+  // Return true if there is writeback opportunities. Passed
+  // with enum writeback.
+  virtual bool handle_writeback_opportunities (enum writeback wback) = 0 ;
+  // Return true if mem ok ldp stp policy model passed with
+  // rtx mem, load_p and machine mode.
+  virtual bool pair_mem_ok_with_policy (rtx first_mem, bool load_p,
+   machine_mode mode) = 0;
+  // Gen load store mem pair. Return load store rtx passed
+  // with arguments load store pattern, writeback rtx and
+  // load_p.
+  virtual rtx gen_mem_pair (rtx *pats, rtx writeback,
+   bool load_p) = 0;
+  // Return true if memory writeback can be promoted, passed
+  // with insn, rtx pattern and load_p. load_p is set by this
+  // hook.
+  virtual bool pair_mem_promote_writeback_p (insn_info *, rtx, bool &)
+  {
+ return false;
+  }
+  // Return true if we track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+  // Return true if we track stores.
+  virtual bool track_stores_p ()
+  {
+return true;
+  }
+  // Return true if offset is out of range.
+  virtual bool pair_mem_out_of_range_p (HOST_WIDE_INT off) = 0;
+  // Return destructure pair. Passed with rtx reg, insn pattern
+  // and load_p.
+  virtual rtx gen_destructure_pair (rtx regs[2], rtx rti, bool load_p) = 0;
+  // Return writeback pair. Passed with rtx writeback effect, mem rtx
+  // regs rtx and load_p.
+  virtual rtx gen_writeback_pair (rtx wb_effect, rtx mem,
+ rtx regs[2], bool load_p) = 0;
+  void ldp_fusion_bb (bb_info *bb);
+  insn_info * find_trailing_add (insn_info *insns[2],
+const insn_range_info _range,
+int initial_writeback,
+rtx *writeback_effect,
+def_info 

Re: [PATCH V4 1/3] aarch64: Place target independent and dependent changed code in one file

2024-04-14 Thread Ajit Agarwal
Hello Alex:

On 12/04/24 11:02 pm, Ajit Agarwal wrote:
> Hello Alex:
> 
> On 12/04/24 8:15 pm, Alex Coplan wrote:
>> On 12/04/2024 20:02, Ajit Agarwal wrote:
>>> Hello Alex:
>>>
>>> On 11/04/24 7:55 pm, Alex Coplan wrote:
>>>> On 10/04/2024 23:48, Ajit Agarwal wrote:
>>>>> Hello Alex:
>>>>>
>>>>> On 10/04/24 7:52 pm, Alex Coplan wrote:
>>>>>> Hi Ajit,
>>>>>>
>>>>>> On 10/04/2024 15:31, Ajit Agarwal wrote:
>>>>>>> Hello Alex:
>>>>>>>
>>>>>>> On 10/04/24 1:42 pm, Alex Coplan wrote:
>>>>>>>> Hi Ajit,
>>>>>>>>
>>>>>>>> On 09/04/2024 20:59, Ajit Agarwal wrote:
>>>>>>>>> Hello Alex:
>>>>>>>>>
>>>>>>>>> On 09/04/24 8:39 pm, Alex Coplan wrote:
>>>>>>>>>> On 09/04/2024 20:01, Ajit Agarwal wrote:
>>>>>>>>>>> Hello Alex:
>>>>>>>>>>>
>>>>>>>>>>> On 09/04/24 7:29 pm, Alex Coplan wrote:
>>>>>>>>>>>> On 09/04/2024 17:30, Ajit Agarwal wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 05/04/24 10:03 pm, Alex Coplan wrote:
>>>>>>>>>>>>>> On 05/04/2024 13:53, Ajit Agarwal wrote:
>>>>>>>>>>>>>>> Hello Alex/Richard:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> All review comments are incorporated.
>>>>>> 
>>>>>>>>>>>>>>> @@ -2890,8 +3018,8 @@ ldp_bb_info::merge_pairs (insn_list_t 
>>>>>>>>>>>>>>> _list,
>>>>>>>>>>>>>>>  // of accesses.  If we find two sets of adjacent accesses, call
>>>>>>>>>>>>>>>  // merge_pairs.
>>>>>>>>>>>>>>>  void
>>>>>>>>>>>>>>> -ldp_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>>>>>>> -access_group )
>>>>>>>>>>>>>>> +pair_fusion_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>>>>>>> +access_group )
>>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>>const auto lfs = decode_lfs (encoded_lfs);
>>>>>>>>>>>>>>>const unsigned access_size = lfs.size;
>>>>>>>>>>>>>>> @@ -2909,7 +3037,7 @@ ldp_bb_info::transform_for_base (int 
>>>>>>>>>>>>>>> encoded_lfs,
>>>>>>>>>>>>>>>access.cand_insns,
>>>>>>>>>>>>>>>lfs.load_p,
>>>>>>>>>>>>>>>access_size);
>>>>>>>>>>>>>>> - skip_next = access.cand_insns.empty ();
>>>>>>>>>>>>>>> + skip_next = bb_state->cand_insns_empty_p 
>>>>>>>>>>>>>>> (access.cand_insns);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> As above, why is this needed?
>>>>>>>>>>>>>
>>>>>>>>>>>>> For rs6000 we want to return always true. as load store pair
>>>>>>>>>>>>> that are to be merged with 8/16 16/32 32/64 is occurring for 
>>>>>>>>>>>>> rs6000.
>>>>>>>>>>>>> And we want load store pair to 8/16 32/64. That's why we want
>>>>>>>>>>>>> to generate always true for rs6000 to skip pairs as above.
>>>>>>>>>>>>
>>>>>>>>>>>> Hmm, sorry, I'm not sure I follow.  Are you saying that for rs6000 
>>>>>>>>>>>> you have
>>>>>>>>>>>> load/store pair instructions where the two arms of the access are 
>&

Re: [PATCH V4 1/3] aarch64: Place target independent and dependent changed code in one file

2024-04-12 Thread Ajit Agarwal
Hello Alex:

On 12/04/24 8:15 pm, Alex Coplan wrote:
> On 12/04/2024 20:02, Ajit Agarwal wrote:
>> Hello Alex:
>>
>> On 11/04/24 7:55 pm, Alex Coplan wrote:
>>> On 10/04/2024 23:48, Ajit Agarwal wrote:
>>>> Hello Alex:
>>>>
>>>> On 10/04/24 7:52 pm, Alex Coplan wrote:
>>>>> Hi Ajit,
>>>>>
>>>>> On 10/04/2024 15:31, Ajit Agarwal wrote:
>>>>>> Hello Alex:
>>>>>>
>>>>>> On 10/04/24 1:42 pm, Alex Coplan wrote:
>>>>>>> Hi Ajit,
>>>>>>>
>>>>>>> On 09/04/2024 20:59, Ajit Agarwal wrote:
>>>>>>>> Hello Alex:
>>>>>>>>
>>>>>>>> On 09/04/24 8:39 pm, Alex Coplan wrote:
>>>>>>>>> On 09/04/2024 20:01, Ajit Agarwal wrote:
>>>>>>>>>> Hello Alex:
>>>>>>>>>>
>>>>>>>>>> On 09/04/24 7:29 pm, Alex Coplan wrote:
>>>>>>>>>>> On 09/04/2024 17:30, Ajit Agarwal wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> On 05/04/24 10:03 pm, Alex Coplan wrote:
>>>>>>>>>>>>> On 05/04/2024 13:53, Ajit Agarwal wrote:
>>>>>>>>>>>>>> Hello Alex/Richard:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> All review comments are incorporated.
>>>>> 
>>>>>>>>>>>>>> @@ -2890,8 +3018,8 @@ ldp_bb_info::merge_pairs (insn_list_t 
>>>>>>>>>>>>>> _list,
>>>>>>>>>>>>>>  // of accesses.  If we find two sets of adjacent accesses, call
>>>>>>>>>>>>>>  // merge_pairs.
>>>>>>>>>>>>>>  void
>>>>>>>>>>>>>> -ldp_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>>>>>> - access_group )
>>>>>>>>>>>>>> +pair_fusion_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>>>>>> + access_group )
>>>>>>>>>>>>>>  {
>>>>>>>>>>>>>>const auto lfs = decode_lfs (encoded_lfs);
>>>>>>>>>>>>>>const unsigned access_size = lfs.size;
>>>>>>>>>>>>>> @@ -2909,7 +3037,7 @@ ldp_bb_info::transform_for_base (int 
>>>>>>>>>>>>>> encoded_lfs,
>>>>>>>>>>>>>> access.cand_insns,
>>>>>>>>>>>>>> lfs.load_p,
>>>>>>>>>>>>>> access_size);
>>>>>>>>>>>>>> -  skip_next = access.cand_insns.empty ();
>>>>>>>>>>>>>> +  skip_next = bb_state->cand_insns_empty_p 
>>>>>>>>>>>>>> (access.cand_insns);
>>>>>>>>>>>>>
>>>>>>>>>>>>> As above, why is this needed?
>>>>>>>>>>>>
>>>>>>>>>>>> For rs6000 we want to return always true. as load store pair
>>>>>>>>>>>> that are to be merged with 8/16 16/32 32/64 is occurring for rs6000.
>>>>>>>>>>>> And we want load store pair to 8/16 32/64. That's why we want
>>>>>>>>>>>> to generate always true for rs6000 to skip pairs as above.
>>>>>>>>>>>
>>>>>>>>>>> Hmm, sorry, I'm not sure I follow.  Are you saying that for rs6000 
>>>>>>>>>>> you have
>>>>>>>>>>> load/store pair instructions where the two arms of the access are 
>>>>>>>>>>> storing
>>>>>>>>>>> operands of different sizes?  Or something else?
>>>>>>>>>>>
>>>>>>>>>>> As it stands the logic is to skip the next iteration only if we
>>>>>>>>>>> exhausted all the candidate insns for 

Re: [PATCH V4 1/3] aarch64: Place target independent and dependent changed code in one file

2024-04-12 Thread Ajit Agarwal
Hello Alex:

On 11/04/24 7:55 pm, Alex Coplan wrote:
> On 10/04/2024 23:48, Ajit Agarwal wrote:
>> Hello Alex:
>>
>> On 10/04/24 7:52 pm, Alex Coplan wrote:
>>> Hi Ajit,
>>>
>>> On 10/04/2024 15:31, Ajit Agarwal wrote:
>>>> Hello Alex:
>>>>
>>>> On 10/04/24 1:42 pm, Alex Coplan wrote:
>>>>> Hi Ajit,
>>>>>
>>>>> On 09/04/2024 20:59, Ajit Agarwal wrote:
>>>>>> Hello Alex:
>>>>>>
>>>>>> On 09/04/24 8:39 pm, Alex Coplan wrote:
>>>>>>> On 09/04/2024 20:01, Ajit Agarwal wrote:
>>>>>>>> Hello Alex:
>>>>>>>>
>>>>>>>> On 09/04/24 7:29 pm, Alex Coplan wrote:
>>>>>>>>> On 09/04/2024 17:30, Ajit Agarwal wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 05/04/24 10:03 pm, Alex Coplan wrote:
>>>>>>>>>>> On 05/04/2024 13:53, Ajit Agarwal wrote:
>>>>>>>>>>>> Hello Alex/Richard:
>>>>>>>>>>>>
>>>>>>>>>>>> All review comments are incorporated.
>>> 
>>>>>>>>>>>> @@ -2890,8 +3018,8 @@ ldp_bb_info::merge_pairs (insn_list_t 
>>>>>>>>>>>> _list,
>>>>>>>>>>>>  // of accesses.  If we find two sets of adjacent accesses, call
>>>>>>>>>>>>  // merge_pairs.
>>>>>>>>>>>>  void
>>>>>>>>>>>> -ldp_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>>>> -   access_group )
>>>>>>>>>>>> +pair_fusion_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>>>> +   access_group )
>>>>>>>>>>>>  {
>>>>>>>>>>>>const auto lfs = decode_lfs (encoded_lfs);
>>>>>>>>>>>>const unsigned access_size = lfs.size;
>>>>>>>>>>>> @@ -2909,7 +3037,7 @@ ldp_bb_info::transform_for_base (int 
>>>>>>>>>>>> encoded_lfs,
>>>>>>>>>>>>   access.cand_insns,
>>>>>>>>>>>>   lfs.load_p,
>>>>>>>>>>>>   access_size);
>>>>>>>>>>>> -skip_next = access.cand_insns.empty ();
>>>>>>>>>>>> +skip_next = bb_state->cand_insns_empty_p (access.cand_insns);
>>>>>>>>>>>
>>>>>>>>>>> As above, why is this needed?
>>>>>>>>>>
>>>>>>>>>> For rs6000 we want to return always true. as load store pair
>>>>>>>>>> that are to be merged with 8/16 16/32 32/64 is occurring for rs6000.
>>>>>>>>>> And we want load store pair to 8/16 32/64. That's why we want
>>>>>>>>>> to generate always true for rs6000 to skip pairs as above.
>>>>>>>>>
>>>>>>>>> Hmm, sorry, I'm not sure I follow.  Are you saying that for rs6000 
>>>>>>>>> you have
>>>>>>>>> load/store pair instructions where the two arms of the access are 
>>>>>>>>> storing
>>>>>>>>> operands of different sizes?  Or something else?
>>>>>>>>>
>>>>>>>>> As it stands the logic is to skip the next iteration only if we
>>>>>>>>> exhausted all the candidate insns for the current access.  In the case
>>>>>>>>> that we didn't exhaust all such candidates, then the idea is that when
>>>>>>>>> access becomes prev_access, we can attempt to use those candidates as
>>>>>>>>> the "left-hand side" of a pair in the next iteration since we failed 
>>>>>>>>> to
>>>>>>>>> use them as the "right-hand side" of a pair in the current iteration.
>>>>>>>>> I don't see why you wouldn't want that behaviour.  Please can you
>>>>>>>>> explain?
>>>>>>>>>
>>>>>>>>

Re: [PATCH V4 1/3] aarch64: Place target independent and dependent changed code in one file

2024-04-10 Thread Ajit Agarwal
Hello Alex:

On 10/04/24 7:52 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> On 10/04/2024 15:31, Ajit Agarwal wrote:
>> Hello Alex:
>>
>> On 10/04/24 1:42 pm, Alex Coplan wrote:
>>> Hi Ajit,
>>>
>>> On 09/04/2024 20:59, Ajit Agarwal wrote:
>>>> Hello Alex:
>>>>
>>>> On 09/04/24 8:39 pm, Alex Coplan wrote:
>>>>> On 09/04/2024 20:01, Ajit Agarwal wrote:
>>>>>> Hello Alex:
>>>>>>
>>>>>> On 09/04/24 7:29 pm, Alex Coplan wrote:
>>>>>>> On 09/04/2024 17:30, Ajit Agarwal wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 05/04/24 10:03 pm, Alex Coplan wrote:
>>>>>>>>> On 05/04/2024 13:53, Ajit Agarwal wrote:
>>>>>>>>>> Hello Alex/Richard:
>>>>>>>>>>
>>>>>>>>>> All review comments are incorporated.
> 
>>>>>>>>>> @@ -2890,8 +3018,8 @@ ldp_bb_info::merge_pairs (insn_list_t 
>>>>>>>>>> _list,
>>>>>>>>>>  // of accesses.  If we find two sets of adjacent accesses, call
>>>>>>>>>>  // merge_pairs.
>>>>>>>>>>  void
>>>>>>>>>> -ldp_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>> - access_group )
>>>>>>>>>> +pair_fusion_bb_info::transform_for_base (int encoded_lfs,
>>>>>>>>>> + access_group )
>>>>>>>>>>  {
>>>>>>>>>>const auto lfs = decode_lfs (encoded_lfs);
>>>>>>>>>>const unsigned access_size = lfs.size;
>>>>>>>>>> @@ -2909,7 +3037,7 @@ ldp_bb_info::transform_for_base (int 
>>>>>>>>>> encoded_lfs,
>>>>>>>>>> access.cand_insns,
>>>>>>>>>> lfs.load_p,
>>>>>>>>>> access_size);
>>>>>>>>>> -  skip_next = access.cand_insns.empty ();
>>>>>>>>>> +  skip_next = bb_state->cand_insns_empty_p (access.cand_insns);
>>>>>>>>>
>>>>>>>>> As above, why is this needed?
>>>>>>>>
>>>>>>>> For rs6000 we want to return always true. as load store pair
>>>>>>>> that are to be merged with 8/16 16/32 32/64 is occurring for rs6000.
>>>>>>>> And we want load store pair to 8/16 32/64. That's why we want
>>>>>>>> to generate always true for rs6000 to skip pairs as above.
>>>>>>>
>>>>>>> Hmm, sorry, I'm not sure I follow.  Are you saying that for rs6000 you 
>>>>>>> have
>>>>>>> load/store pair instructions where the two arms of the access are 
>>>>>>> storing
>>>>>>> operands of different sizes?  Or something else?
>>>>>>>
>>>>>>> As it stands the logic is to skip the next iteration only if we
>>>>>>> exhausted all the candidate insns for the current access.  In the case
>>>>>>> that we didn't exhaust all such candidates, then the idea is that when
>>>>>>> access becomes prev_access, we can attempt to use those candidates as
>>>>>>> the "left-hand side" of a pair in the next iteration since we failed to
>>>>>>> use them as the "right-hand side" of a pair in the current iteration.
>>>>>>> I don't see why you wouldn't want that behaviour.  Please can you
>>>>>>> explain?
>>>>>>>
>>>>>>
>>>>>> In merge_pair we get the 2 load candidates one load from 0 offset and
>>>>>> other load is from 16th offset. Then in next iteration we get load
>>>>>> from 16th offset and other load from 32 offset. In next iteration
>>>>>> we get load from 32 offset and other load from 48 offset.
>>>>>>
>>>>>> For example:
>>>>>>
>>>>>> Currently we get the load candidates as follows.
>>>>>>
>>>>>> pairs:
>>>>>>
>>>>>> load from 0th offset.
>>>>>> load from 16th of

[PATCH v1] aarch64: Preparatory Patch to place target independent and dependent changed code in one file

2024-04-10 Thread Ajit Agarwal
Hello Alex/Richard:

All comments are addressed in this version-1 of the patch.

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

Thanks & Regards
Ajit


aarch64: Place target independent and dependent changed code in one file

Common infrastructure of load store pair fusion is divided into target
independent and target dependent changed code.

Target independent code is the Generic code with pure virtual function
to interface between target independent and dependent code.

Target dependent code is the implementation of pure virtual function for
aarch64 target and the call to target independent code.

2024-04-10  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/aarch64/aarch64-ldp-fusion.cc: Place target
independent and dependent changed code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 497 +++
 1 file changed, 337 insertions(+), 160 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 365dcf48b22..03e8572ebfd 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -138,6 +138,198 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int ) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
+// Forward declaration to be used inside the aarch64_pair_fusion class.
+bool ldp_operand_mode_ok_p (machine_mode mode);
+rtx aarch64_destructure_load_pair (rtx regs[2], rtx pattern);
+rtx aarch64_destructure_store_pair (rtx regs[2], rtx pattern);
+rtx aarch64_gen_writeback_pair (rtx wb_effect, rtx pair_mem, rtx regs[2],
+   bool load_p);
+enum class writeback{
+  WRITEBACK_PAIR_P,
+  WRITEBACK
+};
+
+struct pair_fusion {
+
+  pair_fusion ()
+  {
+calculate_dominance_info (CDI_DOMINATORS);
+df_analyze ();
+crtl->ssa = new rtl_ssa::function_info (cfun);
+  };
+  // Return true if GPR is FP or SIMD accesses, passed
+  // with GPR reg_op rtx, machine mode and load_p.
+  virtual bool fpsimd_op_p (rtx, machine_mode, bool)
+  {
+return false;
+  }
+  // Return true if pair operand mode is ok. Passed with
+  // machine mode.
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+  // Return true if reg operand is ok, passed with load_p,
+  // reg_op rtx and machine mode.
+  virtual bool pair_reg_operand_ok_p (bool load_p, rtx reg_op,
+ machine_mode mem_mode) = 0;
+  // Return alias check limit.
+  virtual int pair_mem_alias_check_limit () = 0;
+  // Return true if there is writeback opportunities. Passed
+  // with enum writeback.
+  virtual bool handle_writeback_opportunities (enum writeback wback) = 0 ;
+  // Return true if mem ok ldp stp policy model passed with
+  // rtx mem, load_p and machine mode.
+  virtual bool pair_mem_ok_with_policy (rtx first_mem, bool load_p,
+   machine_mode mode) = 0;
+  // Gen load store mem pair. Return load store rtx passed
+  // with arguments load store pattern, writeback rtx and
+  // load_p.
+  virtual rtx gen_mem_pair (rtx *pats, rtx writeback,
+   bool load_p) = 0;
+  // Return true if memory writeback can be promoted, passed
+  // with insn, rtx pattern and load_p. load_p is set by this
+  // hook.
+  virtual bool pair_mem_promote_writeback_p (insn_info *, rtx, bool &)
+  {
+ return false;
+  }
+  // Return true if we track loads.
+  virtual bool track_loads_p ()
+  {
+return true;
+  }
+  // Return true if we track stores.
+  virtual bool track_stores_p ()
+  {
+return true;
+  }
+  // Return true if offset is out of range.
+  virtual bool pair_mem_out_of_range_p (HOST_WIDE_INT off) = 0;
+  // Return destructure pair. Passed with rtx reg, insn pattern
+  // and load_p.
+  virtual rtx gen_destructure_pair (rtx regs[2], rtx rti, bool load_p) = 0;
+  // Return writeback pair. Passed with rtx writeback effect, mem rtx
+  // regs rtx and load_p.
+  virtual rtx gen_writeback_pair (rtx wb_effect, rtx mem,
+ rtx regs[2], bool load_p) = 0;
+  // Return true if offset is aligned and multiple of 32.
+  // Passed with offset and access_size to check multiple of 32.
+  virtual bool pair_offset_alignment_ok_p (poly_int64 offset,
+  unsigned access_size) = 0;
+  void ldp_fusion_bb (bb_info *bb);
+  insn_info * find_trailing_add (insn_info *insns[2],
+const insn_range_info _range,
+  

Re: [PATCH V4 1/3] aarch64: Place target independent and dependent changed code in one file

2024-04-10 Thread Ajit Agarwal
Hello Alex:

On 10/04/24 1:42 pm, Alex Coplan wrote:
> Hi Ajit,
> 
> On 09/04/2024 20:59, Ajit Agarwal wrote:
>> Hello Alex:
>>
>> On 09/04/24 8:39 pm, Alex Coplan wrote:
>>> On 09/04/2024 20:01, Ajit Agarwal wrote:
>>>> Hello Alex:
>>>>
>>>> On 09/04/24 7:29 pm, Alex Coplan wrote:
>>>>> On 09/04/2024 17:30, Ajit Agarwal wrote:
>>>>>>
>>>>>>
>>>>>> On 05/04/24 10:03 pm, Alex Coplan wrote:
>>>>>>> On 05/04/2024 13:53, Ajit Agarwal wrote:
>>>>>>>> Hello Alex/Richard:
>>>>>>>>
>>>>>>>> All review comments are incorporated.
>>>>>>>
>>>>>>> Thanks, I was kind-of expecting you to also send the renaming patch as a
>>>>>>> preparatory patch as we discussed.
>>>>>>>
>>>>>>> Sorry for another meta comment, but: I think the reason that the Linaro
>>>>>>> CI isn't running tests on your patches is actually because you're
>>>>>>> sending 1/3 of a series but not sending the rest of the series.
>>>>>>>
>>>>>>> So please can you either send this as an individual preparatory patch
>>>>>>> (not marked as a series) or if you're going to send a series (e.g. with
>>>>>>> a preparatory rename patch as 1/2 and this as 2/2) then send the entire
>>>>>>> series when you make updates.  That way the CI should test your patches,
>>>>>>> which would be helpful.
>>>>>>>
>>>>>>
>>>>>> Addressed.
>>>>>>  
>>>>>>>>
>>>>>>>> Common infrastructure of load store pair fusion is divided into target
>>>>>>>> independent and target dependent changed code.
>>>>>>>>
>>>>>>>> Target independent code is the Generic code with pure virtual function
>>>>>>>> to interface betwwen target independent and dependent code.
>>>>>>>>
>>>>>>>> Target dependent code is the implementation of pure virtual function 
>>>>>>>> for
>>>>>>>> aarch64 target and the call to target independent code.
>>>>>>>>
>>>>>>>> Thanks & Regards
>>>>>>>> Ajit
>>>>>>>>
>>>>>>>>
>>>>>>>> aarch64: Place target independent and dependent changed code in one 
>>>>>>>> file
>>>>>>>>
>>>>>>>> Common infrastructure of load store pair fusion is divided into target
>>>>>>>> independent and target dependent changed code.
>>>>>>>>
>>>>>>>> Target independent code is the Generic code with pure virtual function
>>>>>>>> to interface between target independent and dependent code.
>>>>>>>>
>>>>>>>> Target dependent code is the implementation of pure virtual function 
>>>>>>>> for
>>>>>>>> aarch64 target and the call to target independent code.
>>>>>>>>
>>>>>>>> 2024-04-06  Ajit Kumar Agarwal  
>>>>>>>>
>>>>>>>> gcc/ChangeLog:
>>>>>>>>
>>>>>>>>* config/aarch64/aarch64-ldp-fusion.cc: Place target
>>>>>>>>independent and dependent changed code.
>>>>>>>
>>>>>>> You're going to need a proper ChangeLog eventually, but I guess there's
>>>>>>> no need for that right now.
>>>>>>>
>>>>>>>> ---
>>>>>>>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 371 +++
>>>>>>>>  1 file changed, 249 insertions(+), 122 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>>>>>>>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>>>>>> index 22ed95eb743..cb21b514ef7 100644
>>>>>>>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>>>>>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>>>>>> @@ -138,8 +138,122 @@ struct alt_base

Re: [PATCH V4 1/3] aarch64: Place target independent and dependent changed code in one file

2024-04-09 Thread Ajit Agarwal
Hello Alex:

On 09/04/24 8:39 pm, Alex Coplan wrote:
> On 09/04/2024 20:01, Ajit Agarwal wrote:
>> Hello Alex:
>>
>> On 09/04/24 7:29 pm, Alex Coplan wrote:
>>> On 09/04/2024 17:30, Ajit Agarwal wrote:
>>>>
>>>>
>>>> On 05/04/24 10:03 pm, Alex Coplan wrote:
>>>>> On 05/04/2024 13:53, Ajit Agarwal wrote:
>>>>>> Hello Alex/Richard:
>>>>>>
>>>>>> All review comments are incorporated.
>>>>>
>>>>> Thanks, I was kind-of expecting you to also send the renaming patch as a
>>>>> preparatory patch as we discussed.
>>>>>
>>>>> Sorry for another meta comment, but: I think the reason that the Linaro
>>>>> CI isn't running tests on your patches is actually because you're
>>>>> sending 1/3 of a series but not sending the rest of the series.
>>>>>
>>>>> So please can you either send this as an individual preparatory patch
>>>>> (not marked as a series) or if you're going to send a series (e.g. with
>>>>> a preparatory rename patch as 1/2 and this as 2/2) then send the entire
>>>>> series when you make updates.  That way the CI should test your patches,
>>>>> which would be helpful.
>>>>>
>>>>
>>>> Addressed.
>>>>  
>>>>>>
>>>>>> Common infrastructure of load store pair fusion is divided into target
>>>>>> independent and target dependent changed code.
>>>>>>
>>>>>> Target independent code is the Generic code with pure virtual function
>>>>>> to interface between target independent and dependent code.
>>>>>>
>>>>>> Target dependent code is the implementation of pure virtual function for
>>>>>> aarch64 target and the call to target independent code.
>>>>>>
>>>>>> Thanks & Regards
>>>>>> Ajit
>>>>>>
>>>>>>
>>>>>> aarch64: Place target independent and dependent changed code in one file
>>>>>>
>>>>>> Common infrastructure of load store pair fusion is divided into target
>>>>>> independent and target dependent changed code.
>>>>>>
>>>>>> Target independent code is the Generic code with pure virtual function
>>>>>> to interface between target independent and dependent code.
>>>>>>
>>>>>> Target dependent code is the implementation of pure virtual function for
>>>>>> aarch64 target and the call to target independent code.
>>>>>>
>>>>>> 2024-04-06  Ajit Kumar Agarwal  
>>>>>>
>>>>>> gcc/ChangeLog:
>>>>>>
>>>>>>  * config/aarch64/aarch64-ldp-fusion.cc: Place target
>>>>>>  independent and dependent changed code.
>>>>>
>>>>> You're going to need a proper ChangeLog eventually, but I guess there's
>>>>> no need for that right now.
>>>>>
>>>>>> ---
>>>>>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 371 +++
>>>>>>  1 file changed, 249 insertions(+), 122 deletions(-)
>>>>>>
>>>>>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>>>>>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>>>> index 22ed95eb743..cb21b514ef7 100644
>>>>>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>>>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>>>> @@ -138,8 +138,122 @@ struct alt_base
>>>>>>poly_int64 offset;
>>>>>>  };
>>>>>>  
>>>>>> +// Virtual base class for load/store walkers used in alias analysis.
>>>>>> +struct alias_walker
>>>>>> +{
>>>>>> +  virtual bool conflict_p (int ) const = 0;
>>>>>> +  virtual insn_info *insn () const = 0;
>>>>>> +  virtual bool valid () const  = 0;
>>>>>
>>>>> Heh, looking at this made me realise there is a whitespace bug here in
>>>>> the existing code (double space after const).  Sorry about that!  I'll
>>>>> push an obvious fix for that.
>>>>>
>>>>>> +  virtual void advance () = 0;
>>>>>> +};
>>>>>> +
>>>

Re: [PATCH V4 1/3] aarch64: Place target independent and dependent changed code in one file

2024-04-09 Thread Ajit Agarwal
Hello Alex:

On 09/04/24 7:29 pm, Alex Coplan wrote:
> On 09/04/2024 17:30, Ajit Agarwal wrote:
>>
>>
>> On 05/04/24 10:03 pm, Alex Coplan wrote:
>>> On 05/04/2024 13:53, Ajit Agarwal wrote:
>>>> Hello Alex/Richard:
>>>>
>>>> All review comments are incorporated.
>>>
>>> Thanks, I was kind-of expecting you to also send the renaming patch as a
>>> preparatory patch as we discussed.
>>>
>>> Sorry for another meta comment, but: I think the reason that the Linaro
>>> CI isn't running tests on your patches is actually because you're
>>> sending 1/3 of a series but not sending the rest of the series.
>>>
>>> So please can you either send this as an individual preparatory patch
>>> (not marked as a series) or if you're going to send a series (e.g. with
>>> a preparatory rename patch as 1/2 and this as 2/2) then send the entire
>>> series when you make updates.  That way the CI should test your patches,
>>> which would be helpful.
>>>
>>
>> Addressed.
>>  
>>>>
>>>> Common infrastructure of load store pair fusion is divided into target
>>>> independent and target dependent changed code.
>>>>
>>>> Target independent code is the Generic code with pure virtual function
>>>> to interface between target independent and dependent code.
>>>>
>>>> Target dependent code is the implementation of pure virtual function for
>>>> aarch64 target and the call to target independent code.
>>>>
>>>> Thanks & Regards
>>>> Ajit
>>>>
>>>>
>>>> aarch64: Place target independent and dependent changed code in one file
>>>>
>>>> Common infrastructure of load store pair fusion is divided into target
>>>> independent and target dependent changed code.
>>>>
>>>> Target independent code is the Generic code with pure virtual function
>>>> to interface between target independent and dependent code.
>>>>
>>>> Target dependent code is the implementation of pure virtual function for
>>>> aarch64 target and the call to target independent code.
>>>>
>>>> 2024-04-06  Ajit Kumar Agarwal  
>>>>
>>>> gcc/ChangeLog:
>>>>
>>>>* config/aarch64/aarch64-ldp-fusion.cc: Place target
>>>>independent and dependent changed code.
>>>
>>> You're going to need a proper ChangeLog eventually, but I guess there's
>>> no need for that right now.
>>>
>>>> ---
>>>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 371 +++
>>>>  1 file changed, 249 insertions(+), 122 deletions(-)
>>>>
>>>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
>>>> b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>> index 22ed95eb743..cb21b514ef7 100644
>>>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>>>> @@ -138,8 +138,122 @@ struct alt_base
>>>>poly_int64 offset;
>>>>  };
>>>>  
>>>> +// Virtual base class for load/store walkers used in alias analysis.
>>>> +struct alias_walker
>>>> +{
>>>> +  virtual bool conflict_p (int ) const = 0;
>>>> +  virtual insn_info *insn () const = 0;
>>>> +  virtual bool valid () const  = 0;
>>>
>>> Heh, looking at this made me realise there is a whitespace bug here in
>>> the existing code (double space after const).  Sorry about that!  I'll
>>> push an obvious fix for that.
>>>
>>>> +  virtual void advance () = 0;
>>>> +};
>>>> +
>>>> +struct pair_fusion {
>>>> +
>>>> +  pair_fusion () {};
>>>
>>> This ctor looks pointless at the moment.  Perhaps instead we could put
>>> the contents of ldp_fusion_init in here and then delete that function?
>>>
>>
>> Addressed.
>>
>>>> +  virtual bool fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
>>>> + bool load_p) = 0;
>>>
>>> Please can we have comments above each of these virtual functions
>>> describing any parameters, what the purpose of the hook is, and the
>>> interpretation of the return value?  This will serve as the
>>> documentation for other targets that want to make use of the pass.
>>>
>>> It might make sense to ha

  1   2   3   4   >