Re: [vect] Re-analyze all modes for epilogues

2021-12-17 Thread Andre Vieira (lists) via Gcc-patches

Made the suggested changes.

Regarding the name change to partial vectors, I agree with the name change
since that is the terminology we are using in the loop_vinfo members
too, but is there an actual difference between predication/masking and
partial vectors that I am missing?


OK for trunk?

gcc/ChangeLog:

    * genopinit.c (main): Generate new function partial_vectors_supported_p.
    * tree-vect-loop.c (vect_better_loop_vinfo_p): Round factors up
    for epilogue costing.
    (vect_analyze_loop): Re-analyze all modes for epilogues, unless
    we are guaranteed that we can't have partial vectors.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/masked_epilogue.c: New test.
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index 
195ddf74fa2b7d89760622073dcec9d5d339a097..2bc7cdbf53337beae181afd7bb05b366ab068c6a
 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -321,6 +321,7 @@ main (int argc, const char **argv)
   "  bool supports_vec_scatter_store_cached;\n"
   "};\n"
   "extern void init_all_optabs (struct target_optabs *);\n"
+  "extern bool partial_vectors_supported_p (void);\n"
   "\n"
   "extern struct target_optabs default_target_optabs;\n"
   "extern struct target_optabs *this_fn_optabs;\n"
@@ -373,6 +374,33 @@ main (int argc, const char **argv)
 fprintf (s_file, "  ena[%u] = HAVE_%s;\n", i, p->name);
   fprintf (s_file, "}\n\n");
 
+  fprintf (s_file,
+  "/* Returns TRUE if the target supports any of the partial vector\n"
+  "   optabs: while_ult_optab, len_load_optab or len_store_optab,\n"
+  "   for any mode.  */\n"
+  "bool\npartial_vectors_supported_p (void)\n{\n");
+  bool any_match = false;
+  fprintf (s_file, "\treturn");
+  bool first = true;
+  for (i = 0; patterns.iterate (i, &p); ++i)
+{
+#define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N)))
+  if (CMP_NAME("while_ult") || CMP_NAME ("len_load")
+ || CMP_NAME ("len_store"))
+   {
+ if (first)
+   fprintf (s_file, " HAVE_%s", p->name);
+ else
+   fprintf (s_file, " || HAVE_%s", p->name);
+ first = false;
+ any_match = true;
+   }
+}
+  if (!any_match)
+fprintf (s_file, " false");
+  fprintf (s_file, ";\n}\n");
+
+
   /* Perform a binary search on a pre-encoded optab+mode*2.  */
   /* ??? Perhaps even better to generate a minimal perfect hash.
  Using gperf directly is awkward since it's so geared to working
diff --git a/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c 
b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
new file mode 100644
index 
..286a7be236f337fee4c4650f42da72000855c5e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details 
-march=armv8-a+sve -msve-vector-bits=scalable" } */
+
+void f(unsigned char y[restrict],
+   unsigned char x[restrict], int n) {
+  for (int i = 0; i < n; ++i)
+y[i] = (y[i] + x[i] + 1) >> 1;
+}
+
+/* { dg-final { scan-tree-dump {LOOP EPILOGUE VECTORIZED \(MODE=VNx} "vect" } 
} */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
a28bb6321d76b8222bc8cfdade151ca9b4dca406..5af98a36678ae61e99f93beb90920e2d0940c53a
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2824,11 +2824,13 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
{
  unsigned HOST_WIDE_INT main_vf_max
= estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
+ unsigned HOST_WIDE_INT old_vf_max
+   = estimated_poly_value (old_vf, POLY_VALUE_MAX);
+ unsigned HOST_WIDE_INT new_vf_max
+   = estimated_poly_value (new_vf, POLY_VALUE_MAX);
 
- old_factor = main_vf_max / estimated_poly_value (old_vf,
-  POLY_VALUE_MAX);
- new_factor = main_vf_max / estimated_poly_value (new_vf,
-  POLY_VALUE_MAX);
+ old_factor = CEIL (main_vf_max, old_vf_max);
+ new_factor = CEIL (main_vf_max, new_vf_max);
 
  /* If the loop is not using partial vectors then it will iterate one
 time less than one that does.  It is safe to subtract one here,
@@ -3069,8 +3071,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
   machine_mode autodetected_vector_mode = VOIDmode;
   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
   unsigned int mode_i = 0;
-  unsigned int first_loop_i = 0;
-  unsigned int first_loop_next_i = 0;
   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
 
   /* First determine the main loop vectorization mode, either the first
@@ -3079,7 +3079,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
  lowest cost if pick_lowest_cost_p.  */
   while (1)
 {
-  unsigned 

Re: [vect] Re-analyze all modes for epilogues

2021-12-13 Thread Andre Vieira (lists) via Gcc-patches

Hi,

Added an extra step to skip unusable epilogue modes when we know the 
target does not support predication. This uses a new function 
'support_predication_p' that is generated at build time and checks 
whether the target supports at least one optab that can be used for 
predicated code-generation.
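
For reference, a condensed sketch of what the generated function could look
like for a hypothetical target (the HAVE_* names below are made up; the real
ones come from the target's while_ult/len_load/len_store patterns):

#include <stdbool.h>

/* Stand-ins for the HAVE_* pattern macros the generator would reference.  */
#define HAVE_while_ultsivnx16bi 1
#define HAVE_len_loadv16qi 0

bool
support_predication_p (void)
{
  return HAVE_while_ultsivnx16bi || HAVE_len_loadv16qi;
}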


Bootstrapped and regression tested on aarch64-none-linux-gnu.

OK for trunk?

gcc/ChangeLog:

    * genopinit.c (main): Generate new function support_predication_p.
    * tree-vect-loop.c (vect_better_loop_vinfo_p): Round factors up
    for epilogue costing.
    (vect_analyze_loop): Re-analyze all modes for epilogues, unless
    we are guaranteed that no predication is possible.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/masked_epilogue.c: New test.
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index 
195ddf74fa2b7d89760622073dcec9d5d339a097..e0958bc6c849911395341611a53b0fcb69565827
 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -321,6 +321,7 @@ main (int argc, const char **argv)
   "  bool supports_vec_scatter_store_cached;\n"
   "};\n"
   "extern void init_all_optabs (struct target_optabs *);\n"
+  "extern bool support_predication_p (void);\n"
   "\n"
   "extern struct target_optabs default_target_optabs;\n"
   "extern struct target_optabs *this_fn_optabs;\n"
@@ -373,6 +374,33 @@ main (int argc, const char **argv)
 fprintf (s_file, "  ena[%u] = HAVE_%s;\n", i, p->name);
   fprintf (s_file, "}\n\n");
 
+  fprintf (s_file,
+  "/* Returns TRUE if the target supports any of the predication\n"
+  "   specific optabs: while_ult_optab, len_load_optab or 
len_store_optab,\n"
+  "   for any mode.  */\n"
+  "bool\nsupport_predication_p (void)\n{\n");
+  bool any_match = false;
+  fprintf (s_file, "\treturn");
+  bool first = true;
+  for (i = 0; patterns.iterate (i, &p); ++i)
+{
+#define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N)))
+  if (CMP_NAME("while_ult") || CMP_NAME ("len_load")
+ || CMP_NAME ("len_store"))
+   {
+ if (first)
+   fprintf (s_file, " HAVE_%s", p->name);
+ else
+   fprintf (s_file, " || HAVE_%s", p->name);
+ first = false;
+ any_match = true;
+   }
+}
+  if (!any_match)
+fprintf (s_file, " false");
+  fprintf (s_file, ";\n}\n");
+
+
   /* Perform a binary search on a pre-encoded optab+mode*2.  */
   /* ??? Perhaps even better to generate a minimal perfect hash.
  Using gperf directly is awkward since it's so geared to working
diff --git a/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c 
b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
new file mode 100644
index 
..286a7be236f337fee4c4650f42da72000855c5e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details 
-march=armv8-a+sve -msve-vector-bits=scalable" } */
+
+void f(unsigned char y[restrict],
+   unsigned char x[restrict], int n) {
+  for (int i = 0; i < n; ++i)
+y[i] = (y[i] + x[i] + 1) >> 1;
+}
+
+/* { dg-final { scan-tree-dump {LOOP EPILOGUE VECTORIZED \(MODE=VNx} "vect" } 
} */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
a28bb6321d76b8222bc8cfdade151ca9b4dca406..86e0cb47aef2919fdf7d87228f7f6a8378893e68
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2824,11 +2824,13 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
{
  unsigned HOST_WIDE_INT main_vf_max
= estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
+ unsigned HOST_WIDE_INT old_vf_max
+   = estimated_poly_value (old_vf, POLY_VALUE_MAX);
+ unsigned HOST_WIDE_INT new_vf_max
+   = estimated_poly_value (new_vf, POLY_VALUE_MAX);
 
- old_factor = main_vf_max / estimated_poly_value (old_vf,
-  POLY_VALUE_MAX);
- new_factor = main_vf_max / estimated_poly_value (new_vf,
-  POLY_VALUE_MAX);
+ old_factor = CEIL (main_vf_max, old_vf_max);
+ new_factor = CEIL (main_vf_max, new_vf_max);
 
  /* If the loop is not using partial vectors then it will iterate one
 time less than one that does.  It is safe to subtract one here,
@@ -3069,8 +3071,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
   machine_mode autodetected_vector_mode = VOIDmode;
   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
   unsigned int mode_i = 0;
-  unsigned int first_loop_i = 0;
-  unsigned int first_loop_next_i = 0;
   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
 
   /* First determine the main loop vectorization mode, either the first
@@ -3079,7 +3079,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
  lo

Re: [vect] Re-analyze all modes for epilogues

2021-12-07 Thread Andre Vieira (lists) via Gcc-patches



On 07/12/2021 11:45, Richard Biener wrote:

Can you check whether, given we know the main VF, the epilogue analysis
does not start with an autodetected vector mode that needs a too large VF?


Hmm, I am struggling to see how we could check this here. AFAIU, before we
analyze the loop for a given vector mode we won't know the VF. Are you
saying that we could reject an autodetected mode whose NUNITS > main VF
for !LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P? And by reject I mean we'd
start with mode_i = 1.


Or did I misunderstand something here?

FWIW this is just to prevent extra analysis, right? If the epilogue's VF
isn't appropriate it will be rejected later.




Re: [vect] Re-analyze all modes for epilogues

2021-12-07 Thread Andre Vieira (lists) via Gcc-patches

Hi,

Rebased on top of the epilogue mode patch.

OK for trunk?


gcc/ChangeLog:

    * tree-vect-loop.c (vect_estimate_min_profitable_iters): Pass new
    argument suggested_unroll_factor.
    (vect_analyze_loop_costing): Likewise.
    (_loop_vec_info::_loop_vec_info): Initialize new member
    suggested_unroll_factor.
    (vect_determine_partial_vectors_and_peeling): Make epilogue of
    unrolled main loop use partial vectors.
    (vect_analyze_loop_2): Pass and use new argument
    suggested_unroll_factor.
    (vect_analyze_loop_1): Likewise.
    (vect_analyze_loop): Change to initialize local
    suggested_unroll_factor and use it.
    (vectorizable_reduction): Don't use single_defuse_cycle when
    unrolling.
    * tree-vectorizer.h (_loop_vec_info::_loop_vec_info): Add new
    member suggested_unroll_factor.
    (vector_costs::vector_costs): Add new member
    m_suggested_unroll_factor.
    (vector_costs::suggested_unroll_factor): New getter function.
    (finish_cost): Set return argument suggested_unroll_factor.



Regards,
Andre

On 07/12/2021 11:27, Andre Vieira (lists) via Gcc-patches wrote:

Hi,

I've split this particular part off, since it's not only relevant to
unrolling. The new test shows how this is useful for existing
(non-unrolling) cases. I also had to fix the costing function: the
main_vf / epilogue_vf calculations for old and new didn't take into
consideration that the main_vf could be lower, nor that the two were
not necessarily always a multiple of each other.  So using CEIL here
is the correct approach.


Bootstrapped and regression tested on aarch64-none-linux-gnu.

OK for trunk?

gcc/ChangeLog:

    * tree-vect-loop.c (vect_better_loop_vinfo_p): Round factors up
    for epilogue costing.
    (vect_analyze_loop): Re-analyze all modes for epilogues.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/masked_epilogue.c: New test.

On 30/11/2021 13:56, Richard Biener wrote:

On Tue, 30 Nov 2021, Andre Vieira (lists) wrote:


On 25/11/2021 12:46, Richard Biener wrote:

Oops, my fault, yes, it does.  I would suggest to refactor things so
that the mode_i = first_loop_i case is there only once.  I also wonder
if all the argument about starting at 0 doesn't apply to the
not unrolled LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P as well?  So
what's the reason to differ here?  So in the end I'd just change
the existing

    if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
  {

to

    if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo)
    || first_loop_vinfo->suggested_unroll_factor > 1)
  {

and maybe revisit this when we have an actual testcase showing that
doing sth else has a positive effect?

Thanks,
Richard.
So I had a quick chat with Richard Sandiford and he is suggesting resetting
mode_i to 0 for all cases.

He pointed out that for some tunings the SVE mode might come after the NEON
mode, which means that even for not-unrolled loop_vinfos we could end up with
a suboptimal choice of mode for the epilogue. I.e. it could be that we pick
V16QI for main vectorization, but that's VNx16QI + 1 in the array, so we'd not
try VNx16QI for the epilogue.

This would simplify the mode selecting cases, by simply restarting at
mode_i in all epilogue cases. Is that something you'd be OK with?

Works for me with an updated comment.  Even better with showing a
testcase exercising such tuning.

Richard.
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
17b090170d4a5dad22097a727bc25a63e230e278..29cf14c83ac02e2f1372e56f7f96427dafcd4d11
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -153,7 +153,8 @@ along with GCC; see the file COPYING3.  If not see
http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 */
 
-static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
+   unsigned *);
 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
   bool *, bool *);
 
@@ -828,6 +829,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
 skip_main_loop_edge (nullptr),
 skip_this_loop_edge (nullptr),
 reusable_accumulators (),
+suggested_unroll_factor (1),
 max_vectorization_factor (0),
 mask_skip_niters (NULL_TREE),
 rgroup_compare_type (NULL_TREE),
@@ -1811,7 +1813,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info 
loop_vinfo)
definitely no, or -1 if it's worth retrying.  */
 
 static int
-vect_analyze_loop_costing (loop_vec_info loop_vinfo)
+vect_analyze_loop_costing (loop_vec_info loop_vinfo,
+  unsigned *suggested_unroll_factor)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsign

Re: [AArch64] Enable generation of FRINTNZ instructions

2021-12-07 Thread Andre Vieira (lists) via Gcc-patches

ping

On 25/11/2021 13:53, Andre Vieira (lists) via Gcc-patches wrote:


On 22/11/2021 11:41, Richard Biener wrote:



On 18/11/2021 11:05, Richard Biener wrote:
This is a good shout and made me think about something I hadn't 
before... I
thought I could handle the vector forms later, but the problem is 
if I add

support for the scalar, it will stop the vectorizer. It seems
vectorizable_call expects all arguments to have the same type, 
which doesn't

work with passing the integer type as an operand work around.

We already special case some IFNs there (masked load/store and gather)
to ignore some args, so that would just add to this set.

Richard.

Hi,

Reworked it to add support for the new IFN to the vectorizer. I was 
initially trying to make vectorizable_call and 
vectorizable_internal_function handle IFNs with different inputs more 
generically, using the information we have in the _direct structs 
regarding what operands to get the modes from. Unfortunately, that 
wasn't straightforward because of how vectorizable_call assumes 
operands have the same type and uses the type of the DEF_STMT_INFO of 
the non-constant operands (either output operand or non-constant 
inputs) to determine the type of constants. I assume there is some 
reason why we use the DEF_STMT_INFO and not always use 
get_vectype_for_scalar_type on the argument types. That is why I ended 
up with this sort of half-way mix of both, which still allows room to 
add more IFNs that don't take inputs of the same type, but require 
adding a bit of special casing similar to the IFN_FTRUNC_INT and 
masking ones.


Bootstrapped on aarch64-none-linux.

OK for trunk?

gcc/ChangeLog:

    * config/aarch64/aarch64.md (ftrunc2): New 
pattern.

    * config/aarch64/iterators.md (FRINTNZ): New iterator.
    (frintnz_mode): New int attribute.
    (VSFDF): Make iterator conditional.
    * internal-fn.def (FTRUNC_INT): New IFN.
    * internal-fn.c (ftrunc_int_direct): New define.
    (expand_ftrunc_int_optab_fn): New custom expander.
    (direct_ftrunc_int_optab_supported_p): New supported_p.
    * match.pd: Add to the existing TRUNC pattern match.
    * optabs.def (ftrunc_int): New entry.
    * stor-layout.h (element_precision): Moved from here...
    * tree.h (element_precision): ... to here.
    (element_type): New declaration.
    * tree.c (element_type): New function.
    (element_precision): Changed to use element_type.
    * tree-vect-stmts.c (vectorizable_internal_function): Add
    support for IFNs with different input types.
    (vectorizable_call): Teach to handle IFN_FTRUNC_INT.
    * doc/md.texi: New entry for ftrunc pattern name.
    * doc/sourcebuild.texi (aarch64_frintzx_ok): New target.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/merge_trunc1.c: Adapted to skip if 
frintNz instruction available.

    * lib/target-supports.exp: Added arm_v8_5a_frintnzx_ok target.
    * gcc.target/aarch64/frintnz.c: New test.
    * gcc.target/aarch64/frintnz_vec.c: New test.


[vect] Re-analyze all modes for epilogues

2021-12-07 Thread Andre Vieira (lists) via Gcc-patches

Hi,

I've split this particular part off, since it's not only relevant to
unrolling. The new test shows how this is useful for existing
(non-unrolling) cases. I also had to fix the costing function: the
main_vf / epilogue_vf calculations for old and new didn't take into
consideration that the main_vf could be lower, nor that the two were
not necessarily always a multiple of each other.  So using CEIL here
is the correct approach.
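
To make the CEIL point concrete, here is a tiny example with made-up VF
estimates (not values from the patch): with a main VF of 16 and a candidate
epilogue VF of 12, truncating division under-counts the epilogue factor,
while rounding up does not.

#include <stdio.h>

/* Local stand-in mirroring GCC's CEIL macro.  */
#define CEIL(x, y) (((x) + (y) - 1) / (y))

int main (void)
{
  unsigned main_vf_max = 16, epil_vf_max = 12;
  printf ("truncating: %u\n", main_vf_max / epil_vf_max);       /* 1 */
  printf ("rounded up: %u\n", CEIL (main_vf_max, epil_vf_max)); /* 2 */
  return 0;
}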


Bootstrapped and regression tested on aarch64-none-linux-gnu.

OK for trunk?

gcc/ChangeLog:

    * tree-vect-loop.c (vect_better_loop_vinfo_p): Round factors up
    for epilogue costing.
    (vect_analyze_loop): Re-analyze all modes for epilogues.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/masked_epilogue.c: New test.

On 30/11/2021 13:56, Richard Biener wrote:

On Tue, 30 Nov 2021, Andre Vieira (lists) wrote:


On 25/11/2021 12:46, Richard Biener wrote:

Oops, my fault, yes, it does.  I would suggest to refactor things so
that the mode_i = first_loop_i case is there only once.  I also wonder
if all the argument about starting at 0 doesn't apply to the
not unrolled LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P as well?  So
what's the reason to differ here?  So in the end I'd just change
the existing

if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
  {

to

if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo)
|| first_loop_vinfo->suggested_unroll_factor > 1)
  {

and maybe revisit this when we have an actual testcase showing that
doing sth else has a positive effect?

Thanks,
Richard.

So I had a quick chat with Richard Sandiford and he is suggesting resetting
mode_i to 0 for all cases.

He pointed out that for some tunings the SVE mode might come after the NEON
mode, which means that even for not-unrolled loop_vinfos we could end up with
a suboptimal choice of mode for the epilogue. I.e. it could be that we pick
V16QI for main vectorization, but that's VNx16QI + 1 in the array, so we'd not
try VNx16QI for the epilogue.

This would simplify the mode selecting cases, by simply restarting at
mode_i in all epilogue cases. Is that something you'd be OK with?

Works for me with an updated comment.  Even better with showing a
testcase exercising such tuning.

Richard.
diff --git a/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c 
b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
new file mode 100644
index 
..286a7be236f337fee4c4650f42da72000855c5e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details 
-march=armv8-a+sve -msve-vector-bits=scalable" } */
+
+void f(unsigned char y[restrict],
+   unsigned char x[restrict], int n) {
+  for (int i = 0; i < n; ++i)
+y[i] = (y[i] + x[i] + 1) >> 1;
+}
+
+/* { dg-final { scan-tree-dump {LOOP EPILOGUE VECTORIZED \(MODE=VNx} "vect" } 
} */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
a28bb6321d76b8222bc8cfdade151ca9b4dca406..17b090170d4a5dad22097a727bc25a63e230e278
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2824,11 +2824,13 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
{
  unsigned HOST_WIDE_INT main_vf_max
= estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
+ unsigned HOST_WIDE_INT old_vf_max
+   = estimated_poly_value (old_vf, POLY_VALUE_MAX);
+ unsigned HOST_WIDE_INT new_vf_max
+   = estimated_poly_value (new_vf, POLY_VALUE_MAX);
 
- old_factor = main_vf_max / estimated_poly_value (old_vf,
-  POLY_VALUE_MAX);
- new_factor = main_vf_max / estimated_poly_value (new_vf,
-  POLY_VALUE_MAX);
+ old_factor = CEIL (main_vf_max, old_vf_max);
+ new_factor = CEIL (main_vf_max, new_vf_max);
 
  /* If the loop is not using partial vectors then it will iterate one
 time less than one that does.  It is safe to subtract one here,
@@ -3069,8 +3071,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
   machine_mode autodetected_vector_mode = VOIDmode;
   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
   unsigned int mode_i = 0;
-  unsigned int first_loop_i = 0;
-  unsigned int first_loop_next_i = 0;
   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
 
   /* First determine the main loop vectorization mode, either the first
@@ -3079,7 +3079,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
  lowest cost if pick_lowest_cost_p.  */
   while (1)
 {
-  unsigned int loop_vinfo_i = mode_i;
   bool fatal;
   opt_loop_vec_info loop_vinfo
= vect_analyze_loo

Re: [PATCH 1v2/3][vect] Add main vectorized loop unrolling

2021-11-30 Thread Andre Vieira (lists) via Gcc-patches



On 25/11/2021 12:46, Richard Biener wrote:

Oops, my fault, yes, it does.  I would suggest to refactor things so
that the mode_i = first_loop_i case is there only once.  I also wonder
if all the argument about starting at 0 doesn't apply to the
not unrolled LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P as well?  So
what's the reason to differ here?  So in the end I'd just change
the existing

   if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
 {

to

   if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo)
   || first_loop_vinfo->suggested_unroll_factor > 1)
 {

and maybe revisit this when we have an actual testcase showing that
doing sth else has a positive effect?

Thanks,
Richard.


So I had a quick chat with Richard Sandiford and he is suggesting 
resetting mode_i to 0 for all cases.


He pointed out that for some tunings the SVE mode might come after the 
NEON mode, which means that even for not-unrolled loop_vinfos we could 
end up with a suboptimal choice of mode for the epilogue. I.e. it could 
be that we pick V16QI for main vectorization, but that's VNx16QI + 1 in 
the array, so we'd not try VNx16QI for the epilogue.


This would simplify the mode selecting cases, by simply restarting
at mode_i in all epilogue cases. Is that something you'd be OK with?


Regards,
Andre


Re: [AArch64] Enable generation of FRINTNZ instructions

2021-11-29 Thread Andre Vieira (lists) via Gcc-patches



On 18/11/2021 11:05, Richard Biener wrote:


+ (if (!flag_trapping_math
+ && direct_internal_fn_supported_p (IFN_TRUNC, type,
+OPTIMIZE_FOR_BOTH))
+  (IFN_TRUNC @0)
  #endif

does IFN_FTRUNC_INT preserve the same exceptions as doing
explicit intermediate float->int conversions?  I think I'd
prefer to have !flag_trapping_math on both cases.
I realized I never responded to this. The AArch64 instructions mimic the 
behaviour you'd see if you were doing explicit conversions, so I'll be 
defining the new IFN and optab to require the same, such that these can 
be used by the compiler even when flag_trapping_math is set. In the patch 
I sent last I added some lines to the md.texi description of the optab to 
that effect.


Re: [AArch64] Enable generation of FRINTNZ instructions

2021-11-25 Thread Andre Vieira (lists) via Gcc-patches


On 22/11/2021 11:41, Richard Biener wrote:



On 18/11/2021 11:05, Richard Biener wrote:

This is a good shout and made me think about something I hadn't before... I
thought I could handle the vector forms later, but the problem is if I add
support for the scalar, it will stop the vectorizer. It seems
vectorizable_call expects all arguments to have the same type, which doesn't
work with passing the integer type as an operand work around.

We already special case some IFNs there (masked load/store and gather)
to ignore some args, so that would just add to this set.

Richard.

Hi,

Reworked it to add support for the new IFN to the vectorizer. I was 
initially trying to make vectorizable_call and 
vectorizable_internal_function handle IFNs with different inputs more 
generically, using the information we have in the _direct structs 
regarding what operands to get the modes from. Unfortunately, that 
wasn't straightforward because of how vectorizable_call assumes operands 
have the same type and uses the type of the DEF_STMT_INFO of the 
non-constant operands (either output operand or non-constant inputs) to 
determine the type of constants. I assume there is some reason why we 
use the DEF_STMT_INFO and not always use get_vectype_for_scalar_type on 
the argument types. That is why I ended up with this sort of half-way 
mix of both, which still allows room to add more IFNs that don't take 
inputs of the same type, but require adding a bit of special casing 
similar to the IFN_FTRUNC_INT and masking ones.


Bootstrapped on aarch64-none-linux.

OK for trunk?

gcc/ChangeLog:

    * config/aarch64/aarch64.md (ftrunc2): New 
pattern.

    * config/aarch64/iterators.md (FRINTNZ): New iterator.
    (frintnz_mode): New int attribute.
    (VSFDF): Make iterator conditional.
    * internal-fn.def (FTRUNC_INT): New IFN.
    * internal-fn.c (ftrunc_int_direct): New define.
    (expand_ftrunc_int_optab_fn): New custom expander.
    (direct_ftrunc_int_optab_supported_p): New supported_p.
    * match.pd: Add to the existing TRUNC pattern match.
    * optabs.def (ftrunc_int): New entry.
    * stor-layout.h (element_precision): Moved from here...
    * tree.h (element_precision): ... to here.
    (element_type): New declaration.
    * tree.c (element_type): New function.
    (element_precision): Changed to use element_type.
    * tree-vect-stmts.c (vectorizable_internal_function): Add
    support for IFNs with different input types.
    (vectorizable_call): Teach to handle IFN_FTRUNC_INT.
    * doc/md.texi: New entry for ftrunc pattern name.
    * doc/sourcebuild.texi (aarch64_frintzx_ok): New target.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/merge_trunc1.c: Adapted to skip if frintNz 
instruction available.

    * lib/target-supports.exp: Added arm_v8_5a_frintnzx_ok target.
    * gcc.target/aarch64/frintnz.c: New test.
    * gcc.target/aarch64/frintnz_vec.c: New test.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
4035e061706793849c68ae09bcb2e4b9580ab7b6..c5c60e7a810e22b0ea9ed6bf056ddd6431d60269
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7345,12 +7345,18 @@ (define_insn "despeculate_simpleti"
(set_attr "speculation_barrier" "true")]
 )
 
+(define_expand "ftrunc2"
+  [(set (match_operand:VSFDF 0 "register_operand" "=w")
+(unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")]
+ FRINTNZ))]
+  "TARGET_FRINT"
+)
+
 (define_insn "aarch64_"
   [(set (match_operand:VSFDF 0 "register_operand" "=w")
(unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")]
  FRINTNZX))]
-  "TARGET_FRINT && TARGET_FLOAT
-   && !(VECTOR_MODE_P (mode) && !TARGET_SIMD)"
+  "TARGET_FRINT"
   "\\t%0, %1"
   [(set_attr "type" "f_rint")]
 )
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 
bdc8ba3576cf2c9b4ae96b45a382234e4e25b13f..51f00344b02d0d1d4adf97463f6a46f9fd0fb43f
 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -160,7 +160,11 @@ (define_mode_iterator VHSDF_HSDF [(V4HF 
"TARGET_SIMD_F16INST")
  SF DF])
 
 ;; Scalar and vetor modes for SF, DF.
-(define_mode_iterator VSFDF [V2SF V4SF V2DF DF SF])
+(define_mode_iterator VSFDF [ (V2SF "TARGET_SIMD")
+ (V4SF "TARGET_SIMD")
+ (V2DF "TARGET_SIMD")
+ (DF "TARGET_FLOAT")
+ (SF "TARGET_FLOAT")])
 
 ;; Advanced SIMD single Float modes.
 (define_mode_iterator VDQSF [V2SF V4SF])
@@ -3067,6 +3071,8 @@ (define_int_iterator FCMLA [UNSPEC_FCMLA
 (define_int_iterator FRINTNZX [UNSPEC_FRINT32Z UNSPEC_FRINT32X
   UNSPEC_FRINT64Z UNSPEC_FRINT64X])
 
+(define_int_iterator FRINTNZ [UNSPEC_FRINT32Z UNSPEC_FRINT64Z])
+
 (define_int_iter

Re: [PATCH 1v2/3][vect] Add main vectorized loop unrolling

2021-11-25 Thread Andre Vieira (lists) via Gcc-patches



On 24/11/2021 11:00, Richard Biener wrote:

On Wed, 24 Nov 2021, Andre Vieira (lists) wrote:


On 22/11/2021 12:39, Richard Biener wrote:

+  if (first_loop_vinfo->suggested_unroll_factor > 1)
+{
+  if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"* Re-trying analysis with first vector
mode"
+" %s for epilogue with partial vectors of"
+" unrolled first loop.\n",
+GET_MODE_NAME (vector_modes[0]));
+ mode_i = 0;

and the later done check for bigger VF than main loop - why would
we re-start at 0 rather than at the old mode?  Maybe we want to
remember the iterator value we started at when arriving at the
main loop mode?  So if we analyzed successfully with mode_i == 2,
then successfully at mode_i == 4 which suggested an unroll of 2,
re-start at the mode_i we continued after the mode_i == 2
successful analysis?  To just consider the "simple" case of
AVX vs SSE it IMHO doesn't make much sense to succeed with
AVX V4DF, succeed with SSE V2DF and figure it's better than V4DF AVX
but get a suggestion of 2 times unroll and then re-try AVX V4DF
just to re-compute that yes, it's worse than SSE V2DF?  You
are probably thinking of SVE vs ADVSIMD here but do we need to
start at 0?  Adding a comment to the code would be nice.

Thanks,

I was indeed thinking SVE vs Advanced SIMD where we end up having to compare
different vectorization strategies, which will have different costs depending.
The hypothetical case, as in I don't think I've come across one, is where if
we decide to vectorize the main loop for V8QI and unroll 2x, yielding a VF of
16, we may then want to use a predicated VNx16QI epilogue.

But this isn't the epilogue handling ...
Am I misunderstanding the code here? To me it looks like this is picking
the mode_i at which to start the 'while (1)' loop that does the loop
analysis for the epilogues?


Re: [PATCH 1v2/3][vect] Add main vectorized loop unrolling

2021-11-24 Thread Andre Vieira (lists) via Gcc-patches



On 22/11/2021 12:39, Richard Biener wrote:

+  if (first_loop_vinfo->suggested_unroll_factor > 1)
+{
+  if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"* Re-trying analysis with first vector
mode"
+" %s for epilogue with partial vectors of"
+" unrolled first loop.\n",
+GET_MODE_NAME (vector_modes[0]));
+ mode_i = 0;

and the later done check for bigger VF than main loop - why would
we re-start at 0 rather than at the old mode?  Maybe we want to
remember the iterator value we started at when arriving at the
main loop mode?  So if we analyzed successfully with mode_i == 2,
then successfully at mode_i == 4 which suggested an unroll of 2,
re-start at the mode_i we continued after the mode_i == 2
successful analysis?  To just consider the "simple" case of
AVX vs SSE it IMHO doesn't make much sense to succeed with
AVX V4DF, succeed with SSE V2DF and figure it's better than V4DF AVX
but get a suggestion of 2 times unroll and then re-try AVX V4DF
just to re-compute that yes, it's worse than SSE V2DF?  You
are probably thinking of SVE vs ADVSIMD here but do we need to
start at 0?  Adding a comment to the code would be nice.

Thanks,


I was indeed thinking SVE vs Advanced SIMD, where we end up having to 
compare different vectorization strategies, which will have different 
costs depending on the strategy. The hypothetical case, as in I don't 
think I've come across one, is where if we decide to vectorize the main 
loop for V8QI and unroll 2x, yielding a VF of 16, we may then want to 
use a predicated VNx16QI epilogue. Though the question here is whether 
it is possible for an Advanced SIMD V8QI vectorization to beat V16QI but 
an SVE predicated VNx16QI to beat a VNx8QI for the same loop.  Might be 
good to get Sandiford's opinion on this.


I do think that initially I was more concerned with skipping a VNx8QI 
after selecting a V8QI but I just checked and Advanced SIMD modes are 
listed before SVE for (among others) this reason.


Regards,
Andre



Re: [PATCH 1v2/3][vect] Add main vectorized loop unrolling

2021-11-22 Thread Andre Vieira (lists) via Gcc-patches


On 12/11/2021 13:12, Richard Biener wrote:

On Thu, 11 Nov 2021, Andre Vieira (lists) wrote:


Hi,

This is the rebased and reworked version of the unroll patch.  I wasn't
entirely sure whether I should compare the costs of the unrolled loop_vinfo
with the original loop_vinfo it was unrolled of. I did now, but I wasn't too
sure whether it was a good idea to... Any thoughts on this?

+  /* Apply the suggested unrolling factor, this was determined by the
backend
+ during finish_cost the first time we ran the analyzis for this
+ vector mode.  */
+  if (loop_vinfo->suggested_unroll_factor > 1)
+{
+  poly_uint64 unrolled_vf
+   = LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
loop_vinfo->suggested_unroll_factor;
+  /* Make sure the unrolled vectorization factor is less than the max
+ vectorization factor.  */
+  unsigned HOST_WIDE_INT max_vf = LOOP_VINFO_MAX_VECT_FACTOR
(loop_vinfo);
+  if (max_vf == MAX_VECTORIZATION_FACTOR || known_le (unrolled_vf,
max_vf))
+   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolled_vf;
+  else
+   return opt_result::failure_at (vect_location,
+  "unrolling failed: unrolled"
+  " vectorization factor larger than"
+  " maximum vectorization factor:
%d\n",
+  LOOP_VINFO_MAX_VECT_FACTOR
(loop_vinfo));
+}
+
/* This is the point where we can re-start analysis with SLP forced
off.  */
  start_over:

So we're honoring suggested_unroll_factor here but you still have the
now unused hunk

+vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
+unsigned *suggested_unroll_factor, poly_uint64 min_vf
= 2)
  {

I also wonder whether vect_analyze_loop_2 could at least prune
suggested_unroll_factor as set by vect_analyze_loop_costing with its
knowledge of max_vf itself?  That would avoid using the at the moment
unused LOOP_VINFO_MAX_VECT_FACTOR?

I think all the things you do in vect_can_unroll should be figured
out with the re-analysis, and I'd just amend vect_analyze_loop_1
with a suggested unroll factor parameter like it has main_loop_vinfo
for the epilogue case.  The main loop adjustment would the be in the

   if (first_loop_vinfo == NULL)
 {
   first_loop_vinfo = loop_vinfo;
   first_loop_i = loop_vinfo_i;
   first_loop_next_i = mode_i;
 }

Sounds good.


spot only, adding

if (loop_vinfo->suggested_unroll_factor != 1)
   {
 suggested_unroll_factor = loop_vinfo->suggested_unroll_factor;
 mode_i = first_loop_i;
 if (dump)
   dump_print ("Trying unrolling by %d\n");
 continue;
   }
Not quite like this, because of how we need to keep the suggestion given 
at finish_costs, put into suggested_unroll_factor, separate from how we 
tell vect_analyze_loop_1 that we are now vectorizing an unrolled vector 
loop, which we do by writing to loop_vinfo->suggested_unroll_factor. 
Perhaps I should rename the latter, to avoid confusion? Let me know if 
you think that would help and in the meantime this is what the patch 
looks like now. I'll follow up with a ChangeLog when we settle on the 
name & structure.
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
a28bb6321d76b8222bc8cfdade151ca9b4dca406..c84f1df9cd9a1325135defcbe1d101642a867373
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -153,7 +153,8 @@ along with GCC; see the file COPYING3.  If not see
http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 */
 
-static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
+   unsigned *);
 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
   bool *, bool *);
 
@@ -828,6 +829,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
 skip_main_loop_edge (nullptr),
 skip_this_loop_edge (nullptr),
 reusable_accumulators (),
+suggested_unroll_factor (1),
 max_vectorization_factor (0),
 mask_skip_niters (NULL_TREE),
 rgroup_compare_type (NULL_TREE),
@@ -1811,7 +1813,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info 
loop_vinfo)
definitely no, or -1 if it's worth retrying.  */
 
 static int
-vect_analyze_loop_costing (loop_vec_info loop_vinfo)
+vect_analyze_loop_costing (loop_vec_info loop_vinfo,
+  unsigned *suggested_unroll_factor)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
@@ -1845,7 +1848,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
 
   int min_profitable_iters, m

Re: [AArch64] Enable generation of FRINTNZ instructions

2021-11-22 Thread Andre Vieira (lists) via Gcc-patches



On 18/11/2021 11:05, Richard Biener wrote:


@@ -3713,12 +3713,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 trapping behaviour, so require !flag_trapping_math. */
  #if GIMPLE
  (simplify
-   (float (fix_trunc @0))
-   (if (!flag_trapping_math
-   && types_match (type, TREE_TYPE (@0))
-   && direct_internal_fn_supported_p (IFN_TRUNC, type,
- OPTIMIZE_FOR_BOTH))
-  (IFN_TRUNC @0)))
+   (float (fix_trunc@1 @0))
+   (if (types_match (type, TREE_TYPE (@0)))
+(if (TYPE_SIGN (TREE_TYPE (@1)) == SIGNED
+&& direct_internal_fn_supported_p (IFN_FTRUNC_INT, type,
+   TREE_TYPE (@1),
OPTIMIZE_FOR_BOTH))
+ (with {
+  tree int_type = TREE_TYPE (@1);
+  unsigned HOST_WIDE_INT max_int_c
+   = (1ULL << (element_precision (int_type) - 1)) - 1;

That's only half-way supporting vector types I fear - you use
element_precision but then build a vector integer constant
in an unsupported way.  I suppose vector support isn't present
for arm?  The cleanest way would probably be to do

tree int_type = element_type (@1);

with providing element_type in tree.[ch] like we provide
element_precision.
This is a good shout and made me think about something I hadn't 
before... I thought I could handle the vector forms later, but the 
problem is if I add support for the scalar, it will stop the vectorizer. 
It seems vectorizable_call expects all arguments to have the same type, 
which doesn't work with the workaround of passing the integer type as an operand.


Should I go back to two separate IFNs? We could still have the single optab.

Regards,
Andre



Re: [AArch64] Enable generation of FRINTNZ instructions

2021-11-17 Thread Andre Vieira (lists) via Gcc-patches


On 16/11/2021 12:10, Richard Biener wrote:

On Fri, 12 Nov 2021, Andre Simoes Dias Vieira wrote:


On 12/11/2021 10:56, Richard Biener wrote:

On Thu, 11 Nov 2021, Andre Vieira (lists) wrote:


Hi,

This patch introduces two IFN's FTRUNC32 and FTRUNC64, the corresponding
optabs and mappings. It also creates a backend pattern to implement them
for
aarch64 and a match.pd pattern to idiom recognize these.
These IFN's (and optabs) represent a truncation towards zero, as if
performed
by first casting it to a signed integer of 32 or 64 bits and then back to
the
same floating point type/mode.

The match.pd pattern choses to use these, when supported, regardless of
trapping math, since these new patterns mimic the original behavior of
truncating through an integer.

I didn't think any of the existing IFN's represented these. I know it's a
bit
late in stage 1, but I thought this might be OK given it's only used by a
single target and should have very little impact on anything else.

Bootstrapped on aarch64-none-linux.

OK for trunk?

On the RTL side ftrunc32/ftrunc64 would probably be better a conversion
optab (with two modes), so not

+OPTAB_D (ftrunc32_optab, "ftrunc$asi2")
+OPTAB_D (ftrunc64_optab, "ftrunc$adi2")

but

OPTAB_CD (ftrunc_shrt_optab, "ftrunc$a$I$b2")

or so?  I know that gets somewhat awkward for the internal function,
but IMHO we shouldn't tie our hands because of that?

I tried doing this originally, but indeed I couldn't find a way to correctly
tie the internal function to it.

direct_optab_supported_p with multiple types expect those to be of the same
mode. I see convert_optab_supported_p does but I don't know how that is
used...

Any ideas?

No "nice" ones.  The "usual" way is to provide fake arguments that
specify the type/mode.  We could use an integer argument directly
specifying the mode (then the IL would look host dependent - ugh),
or specify a constant zero in the intended mode (less visibly
obvious - but at least with -gimple dumping you'd see the type...).

Hi,

So I reworked this to have a single optab and IFN. This required a bit 
of fiddling with custom expander and supported_p functions for the IFN. 
I decided to pass a MAX_INT for the 'int' type to the IFN to be able to 
pass on the size of the int we use as an intermediate cast.  I tried 0 
first, but gcc was being too smart and just demoted it to an 'int' for 
the long long test-cases.
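
As a rough illustration of the trick (a made-up helper, not the GCC code):
the maximum value of the intermediate signed type is enough to recover its
precision, so a single constant operand can tell the expander whether a
32-bit or a 64-bit cast was meant.

#include <stdint.h>
#include <stdio.h>

static int
precision_from_max (uint64_t max_int_c)
{
  /* 0x7fffffff -> 32, 0x7fffffffffffffff -> 64.  */
  return 65 - __builtin_clzll (max_int_c);
}

int main (void)
{
  /* Prints: 32 64.  */
  printf ("%d %d\n", precision_from_max (INT32_MAX),
	  precision_from_max (INT64_MAX));
  return 0;
}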


Bootstrapped on aarch64-none-linux.

OK for trunk?

gcc/ChangeLog:

    * config/aarch64/aarch64.md (ftrunc2): New 
pattern.

    * config/aarch64/iterators.md (FRINTZ): New iterator.
    * doc/md.texi: New entry for ftrunc pattern name.
    * internal-fn.def (FTRUNC_INT): New IFN.
    * match.pd: Add to the existing TRUNC pattern match.
    * optabs.def (ftrunc_int): New entry.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/merge_trunc1.c: Adapted to skip if frintNz 
instruction available.

    * lib/target-supports.exp: Added arm_v8_5a_frintnzx_ok target.
    * gcc.target/aarch64/frintnz.c: New test.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
4035e061706793849c68ae09bcb2e4b9580ab7b6..62adbc4cb6bbbe0c856f9fbe451aee08f2dea3b5
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7345,6 +7345,14 @@ (define_insn "despeculate_simpleti"
(set_attr "speculation_barrier" "true")]
 )
 
+(define_expand "ftrunc2"
+  [(set (match_operand:VSFDF 0 "register_operand" "=w")
+(unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")]
+ FRINTNZ))]
+  "TARGET_FRINT && TARGET_FLOAT
+   && !(VECTOR_MODE_P (mode) && !TARGET_SIMD)"
+)
+
 (define_insn "aarch64_"
   [(set (match_operand:VSFDF 0 "register_operand" "=w")
(unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")]
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 
bdc8ba3576cf2c9b4ae96b45a382234e4e25b13f..49510488a2a800689e95c399f2e6c967b566516d
 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3067,6 +3067,8 @@ (define_int_iterator FCMLA [UNSPEC_FCMLA
 (define_int_iterator FRINTNZX [UNSPEC_FRINT32Z UNSPEC_FRINT32X
   UNSPEC_FRINT64Z UNSPEC_FRINT64X])
 
+(define_int_iterator FRINTNZ [UNSPEC_FRINT32Z UNSPEC_FRINT64Z])
+
 (define_int_iterator SVE_BRK_UNARY [UNSPEC_BRKA UNSPEC_BRKB])
 
 (define_int_iterator SVE_BRK_BINARY [UNSPEC_BRKN UNSPEC_BRKPA UNSPEC_BRKPB])
@@ -3482,6 +3484,8 @@ (define_int_attr f16mac1 [(UNSPEC_FMLAL "a") 
(UNSPEC_FMLSL "s")
 (define_int_attr frintnzs_op [(UNSPEC_FRINT32Z "frint32z&qu

[AArch64] Enable generation of FRINTNZ instructions

2021-11-11 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch introduces two IFNs, FTRUNC32 and FTRUNC64, the corresponding 
optabs and mappings. It also creates a backend pattern to implement them 
for aarch64 and a match.pd pattern to idiom-recognize these.
These IFNs (and optabs) represent a truncation towards zero, as if 
performed by first casting the value to a signed integer of 32 or 64 bits 
and then back to the same floating-point type/mode.
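
As a reference for the intended semantics, a minimal scalar sketch with
hypothetical helper names (not the IFNs themselves); it is only meaningful
for inputs that fit in the intermediate integer type:

#include <stdint.h>
#include <stdio.h>

static float  ftrunc32_ref (float x)  { return (float) (int32_t) x; }
static double ftrunc64_ref (double x) { return (double) (int64_t) x; }

int main (void)
{
  /* Truncation towards zero: prints 2.000000 -2.000000.  */
  printf ("%f %f\n", ftrunc32_ref (2.7f), ftrunc64_ref (-2.7));
  return 0;
}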


The match.pd pattern chooses to use these, when supported, regardless of 
trapping math, since these new patterns mimic the original behavior of 
truncating through an integer.


I didn't think any of the existing IFNs represented these. I know it's 
a bit late in stage 1, but I thought this might be OK given it's only 
used by a single target and should have very little impact on anything else.


Bootstrapped on aarch64-none-linux.

OK for trunk?

gcc/ChangeLog:

    * config/aarch64/aarch64.md (ftrunc2): New 
pattern.

    * config/aarch64/iterators.md (FRINTZ): New iterator.
    * doc/md.texi: New entry for ftrunc pattern name.
    * internal-fn.def (FTRUNC32): New IFN.
    (FTRUNC64): Likewise.
    * match.pd: Add to the existing TRUNC pattern match.
    * optabs.def (OPTAB_D): New entries for ftrunc.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/merge_trunc1.c: Adapted to skip if frintNz 
instruction available.

    * lib/target-supports.exp: Added arm_v8_5a_frintnzx_ok target.
    * gcc.target/aarch64/frintnz.c: New test.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
4035e061706793849c68ae09bcb2e4b9580ab7b6..ad4e04d7c874da095513442e7d7f247791d8921d
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7345,6 +7345,16 @@ (define_insn "despeculate_simpleti"
(set_attr "speculation_barrier" "true")]
 )
 
+(define_insn "ftrunc2"
+  [(set (match_operand:VSFDF 0 "register_operand" "=w")
+(unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")]
+ FRINTNZ))]
+  "TARGET_FRINT && TARGET_FLOAT
+   && !(VECTOR_MODE_P (mode) && !TARGET_SIMD)"
+  "\\t%0, %1"
+  [(set_attr "type" "f_rint")]
+)
+
 (define_insn "aarch64_"
   [(set (match_operand:VSFDF 0 "register_operand" "=w")
(unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")]
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 
bdc8ba3576cf2c9b4ae96b45a382234e4e25b13f..49510488a2a800689e95c399f2e6c967b566516d
 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3067,6 +3067,8 @@ (define_int_iterator FCMLA [UNSPEC_FCMLA
 (define_int_iterator FRINTNZX [UNSPEC_FRINT32Z UNSPEC_FRINT32X
   UNSPEC_FRINT64Z UNSPEC_FRINT64X])
 
+(define_int_iterator FRINTNZ [UNSPEC_FRINT32Z UNSPEC_FRINT64Z])
+
 (define_int_iterator SVE_BRK_UNARY [UNSPEC_BRKA UNSPEC_BRKB])
 
 (define_int_iterator SVE_BRK_BINARY [UNSPEC_BRKN UNSPEC_BRKPA UNSPEC_BRKPB])
@@ -3482,6 +3484,8 @@ (define_int_attr f16mac1 [(UNSPEC_FMLAL "a") 
(UNSPEC_FMLSL "s")
 (define_int_attr frintnzs_op [(UNSPEC_FRINT32Z "frint32z") (UNSPEC_FRINT32X 
"frint32x")
  (UNSPEC_FRINT64Z "frint64z") (UNSPEC_FRINT64X 
"frint64x")])
 
+(define_int_attr frintnz_mode [(UNSPEC_FRINT32Z "si") (UNSPEC_FRINT64Z "di")])
+
 ;; The condition associated with an UNSPEC_COND_.
 (define_int_attr cmp_op [(UNSPEC_COND_CMPEQ_WIDE "eq")
 (UNSPEC_COND_CMPGE_WIDE "ge")
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
41f1850bf6e95005647ca97a495a97d7e184d137..7bd66818144e87e1dca2ef13bef1d6f21f239570
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6175,6 +6175,13 @@ operands; otherwise, it may not.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{ftrunc@var{m}@var{n}2} instruction pattern
+@item @samp{ftrunc@var{m}@var{n}2}
+Truncate operand 1 to a @var{n} mode signed integer, towards zero, and store
+the result in operand 0. Both operands have mode @var{m}, which is a scalar or
+vector floating-point mode.
+
+
 @cindex @code{round@var{m}2} instruction pattern
 @item @samp{round@var{m}2}
 Round operand 1 to the nearest integer, rounding away from zero in the
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 
bb13c6cce1bf55633760bc14980402f1f0ac1689..64263cbb83548b140f613cb4bf5ce6565373f96d
 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -269,6 +269,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (RINT, ECF_CONST, rint, unary)
 DEF_INTERNAL_FLT_FLOATN_FN (ROUND, ECF_CONST, round, unary)
 DEF_INTERNAL_FLT_FLOATN_FN (ROUNDEVEN, ECF_CONST, roundeven, unary)
 DEF_INTERNAL_FLT_FLOATN_FN (TRUNC, ECF_CONST, btrunc, unary)
+DEF_INTERNAL_OPTAB_FN (FTRUNC32, ECF_CONST, ftrunc32, unary)
+DEF_INTERNAL_OPTAB_FN (FTRUNC64, ECF_CONST, ftrunc64, unary)
 
 /* Binary math functions.  */
 DEF_INTERNAL_FLT_FN (ATAN2, ECF_CONST, atan2, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index 
a319aefa8081ac177981ad425c461f8a771128f4..

Re: [PATCH 1v2/3][vect] Add main vectorized loop unrolling

2021-11-11 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This is the rebased and reworked version of the unroll patch.  I wasn't 
entirely sure whether I should compare the costs of the unrolled 
loop_vinfo with the original loop_vinfo it was unrolled from. I did now, 
but I wasn't too sure whether it was a good idea to... Any thoughts on 
this?


Regards,

Andre


gcc/ChangeLog:

    * tree-vect-loop.c (vect_estimate_min_profitable_iters): Add 
suggested_unroll_factor parameter.

    (vect_analyze_loop_costing): Likewise.
    (vect_determine_partial_vectors_and_peeling): Don't mask an 
unrolled loop.

    (vect_analyze_loop_2): Support unrolling of loops.
    (vect_can_unroll): New function.
    (vect_try_unrolling): New function.
    (vect_analyze_loop_1): Add suggested_unroll_factor parameter 
and use it.
    (vect_analyze_loop): Call vect_try_unrolling when unrolling 
suggested.

    (vectorizable_reduction): Don't use single_defuse_cycle when unrolling.
    * tree-vectorizer.h (_loop_vec_info::_loop_vec_info):  Add 
suggested_unroll_factor member.

        (vector_costs::vector_costs): Add m_suggested_unroll_factor member.
    (vector_costs::suggested_unroll_factor): New getter.
    (finish_cost): Add suggested_unroll_factor out parameter and 
set it.
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
a28bb6321d76b8222bc8cfdade151ca9b4dca406..cfce7de0430c852d37f1a93e2d6a2f630694f613
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -153,7 +153,8 @@ along with GCC; see the file COPYING3.  If not see
http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 */
 
-static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
+   unsigned *);
 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
   bool *, bool *);
 
@@ -828,6 +829,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
 skip_main_loop_edge (nullptr),
 skip_this_loop_edge (nullptr),
 reusable_accumulators (),
+suggested_unroll_factor (1),
 max_vectorization_factor (0),
 mask_skip_niters (NULL_TREE),
 rgroup_compare_type (NULL_TREE),
@@ -1811,7 +1813,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info 
loop_vinfo)
definitely no, or -1 if it's worth retrying.  */
 
 static int
-vect_analyze_loop_costing (loop_vec_info loop_vinfo)
+vect_analyze_loop_costing (loop_vec_info loop_vinfo,
+  unsigned *suggested_unroll_factor)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
@@ -1845,7 +1848,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
 
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
- &min_profitable_estimate);
+ &min_profitable_estimate,
+ suggested_unroll_factor);
 
   if (min_profitable_iters < 0)
 {
@@ -2129,10 +2133,16 @@ vect_determine_partial_vectors_and_peeling 
(loop_vec_info loop_vinfo,
 vectors to the epilogue, with the main loop continuing to operate
 on full vectors.
 
+If we are unrolling we also do not want to use partial vectors. This
+is to avoid the overhead of generating multiple masks and also to
+avoid having to execute entire iterations of FALSE masked instructions
+when dealing with one or less full iterations.
+
 ??? We could then end up failing to use partial vectors if we
 decide to peel iterations into a prologue, and if the main loop
 then ends up processing fewer than VF iterations.  */
-  if (param_vect_partial_vector_usage == 1
+  if ((param_vect_partial_vector_usage == 1
+  || loop_vinfo->suggested_unroll_factor > 1)
  && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
  && !vect_known_niters_smaller_than_vf (loop_vinfo))
LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
@@ -2199,12 +2209,12 @@ vect_determine_partial_vectors_and_peeling 
(loop_vec_info loop_vinfo,
for it.  The different analyses will record information in the
loop_vec_info struct.  */
 static opt_result
-vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
+vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
+unsigned *suggested_unroll_factor, poly_uint64 min_vf = 2)
 {
   opt_result ok = opt_result::success ();
   int res;
   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
-  poly_uint64 min_vf = 2;
   loop_vec_info orig_loop_vinfo = NULL;
 
   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
@@ -2359,6 +2369,26 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool 
&fatal)
  set of rgroups.  

[committed][AArch64] Fix bootstrap failure due to missing ATTRIBUTE_UNUSED

2021-11-10 Thread Andre Vieira (lists) via Gcc-patches

Hi,

Committed this as obvious. My earlier patch removed the need for the GSI 
to be used.


gcc/ChangeLog:

    * config/aarch64/aarch64-builtins.c
    (aarch64_general_gimple_fold_builtin): Mark argument as unused.
diff --git a/gcc/config/aarch64/aarch64-builtins.c 
b/gcc/config/aarch64/aarch64-builtins.c
index 
e06131a7c61d31c1be3278dcdccc49c3053c78cb..d5b16081264ca43416a53dafb8c6ee6efad88133
 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -2458,7 +2458,7 @@ get_mem_type_for_load_store (unsigned int fcode)
failure.  */
 gimple *
 aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
-gimple_stmt_iterator *gsi)
+gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED)
 {
   gimple *new_stmt = NULL;
   unsigned nargs = gimple_call_num_args (stmt);


[AArch64] Fix TBAA information when lowering NEON loads and stores to gimple

2021-11-09 Thread Andre Vieira (lists) via Gcc-patches

And second (also added a test):

[AArch64] Fix TBAA information when lowering NEON loads and stores to gimple

This patch fixes the wrong TBAA information when lowering NEON loads and
stores to gimple that showed up when bootstrapping with UBSAN.

gcc/ChangeLog:

    * config/aarch64/aarch64-builtins.c 
(aarch64_general_gimple_fold_builtin): Change pointer alignment and alias.


gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/simd/lowering_tbaa.c: New test.
diff --git a/gcc/config/aarch64/aarch64-builtins.c 
b/gcc/config/aarch64/aarch64-builtins.c
index 
a815e4cfbccab692ca688ba87c71b06c304abbfb..e06131a7c61d31c1be3278dcdccc49c3053c78cb
 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -2485,18 +2485,18 @@ aarch64_general_gimple_fold_builtin (unsigned int 
fcode, gcall *stmt,
  = get_mem_type_for_load_store(fcode);
aarch64_simd_type_info simd_type
  = aarch64_simd_types[mem_type];
-   tree elt_ptr_type = build_pointer_type (simd_type.eltype);
+   tree elt_ptr_type = build_pointer_type_for_mode (simd_type.eltype,
+VOIDmode, true);
tree zero = build_zero_cst (elt_ptr_type);
-   gimple_seq stmts = NULL;
-   tree base = gimple_convert (&stmts, elt_ptr_type,
-   args[0]);
-   if (stmts)
- gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+   /* Use element type alignment.  */
+   tree access_type
+ = build_aligned_type (simd_type.itype,
+   TYPE_ALIGN (simd_type.eltype));
new_stmt
  = gimple_build_assign (gimple_get_lhs (stmt),
 fold_build2 (MEM_REF,
- simd_type.itype,
- base, zero));
+ access_type,
+ args[0], zero));
  }
break;
 
@@ -2507,18 +2507,17 @@ aarch64_general_gimple_fold_builtin (unsigned int 
fcode, gcall *stmt,
  = get_mem_type_for_load_store(fcode);
aarch64_simd_type_info simd_type
  = aarch64_simd_types[mem_type];
-   tree elt_ptr_type = build_pointer_type (simd_type.eltype);
+   tree elt_ptr_type = build_pointer_type_for_mode (simd_type.eltype,
+VOIDmode, true);
tree zero = build_zero_cst (elt_ptr_type);
-   gimple_seq stmts = NULL;
-   tree base = gimple_convert (&stmts, elt_ptr_type,
-   args[0]);
-   if (stmts)
- gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+   /* Use element type alignment.  */
+   tree access_type
+ = build_aligned_type (simd_type.itype,
+   TYPE_ALIGN (simd_type.eltype));
new_stmt
- = gimple_build_assign (fold_build2 (MEM_REF,
-simd_type.itype,
-base,
-zero), args[1]);
+ = gimple_build_assign (fold_build2 (MEM_REF, access_type,
+ args[0], zero),
+args[1]);
  }
break;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lowering_tbaa.c 
b/gcc/testsuite/gcc.target/aarch64/simd/lowering_tbaa.c
new file mode 100644
index 
..eaeae21f19c7d2d8d4e032f2f8b1b22bb96b7ca4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lowering_tbaa.c
@@ -0,0 +1,30 @@
+/* Tests the TBAA information of lowered AArch64 SIMD loads.  */
+/* { dg-do run } */
+/* { dg-options "-save-temps -O2" } */
+
+#include <arm_neon.h>
+
+void __attribute__((noipa))
+g (float *)
+{
+}
+
+int32x4_t __attribute__((noipa))
+f (void)
+{
+  float a[4] = { 1, 2, 3, 4 };
+  g (a);
+  a[0] = a[1] = a[2] = a[3] = 0;
+  void *volatile ptr = a;
+  return vld1q_s32 ((int32_t *) ptr);
+}
+
+int
+main (void)
+{
+  int32x4_t x = f ();
+  int32x4_t y = vdupq_n_s32 (0);
+  if (__builtin_memcmp (&x, &y, 16) != 0)
+__builtin_abort ();
+  return 0;
+}


[AArch64] Fix big-endian testisms introduced by NEON gimple lowering patch

2021-11-09 Thread Andre Vieira (lists) via Gcc-patches
Decided to split the patches up to make it clear that the testism fixes 
had nothing to do with the TBAA fix. I'll be committing these two separately.


First:

[AArch64] Fix big-endian testisms introduced by NEON gimple lowering patch

This patch reverts the tests for big-endian after the NEON gimple lowering
patch.  The earlier patch only lowers NEON loads and stores for little-endian,
meaning the codegen now differs between endiannesses, so we need
target-specific testing.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/fmla_intrinsic_1.c: Fix big-endian testism.
    * gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
    * gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.

diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c 
b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
index 
adb787a8599af23847dd62dcd153d7cfe43dacc0..c1aeb06e74753052c2ee441b361b92148f1b4b0a
 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
@@ -107,10 +107,12 @@ main (int argc, char **argv)
 
 /* vfma_lane_f64.
vfma_laneq_f64. */
-/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 1 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 { target aarch64_little_endian } } } */
 
 /* vfmaq_lane_f64.
vfmaq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 { target aarch64_little_endian } } } */
 
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c 
b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
index 
865def28c3f4d04042ab495d232bb865cabb2b50..3137ea91e809e37de589091e9bbd43bfe4d221a1
 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
@@ -108,10 +108,12 @@ main (int argc, char **argv)
 
 /* vfms_lane_f64.
vfms_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 1 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 { target aarch64_little_endian } } } */
 
 /* vfmsq_lane_f64.
vfmsq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 { target aarch64_little_endian } } } */
 
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c 
b/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c
index 
d01095e81c1e45dc1da998aa337ba551b3752ebe..7d4829c40d7042226f2f09fab9fdfa7c3dd211c4
 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c
@@ -107,10 +107,12 @@ main (int argc, char **argv)
 
 /* vmul_lane_f64.
Vmul_laneq_f64. */
-/* { dg-final { scan-assembler-times "fmul\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 
2 } } */
+/* { dg-final { scan-assembler-times "fmul\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 
1 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmul\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 
2 { target aarch64_little_endian } } } */
 
 /* vmulq_lane_f64.
vmulq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 { target aarch64_little_endian } } } */
 
 


Re: [AArch64] Fix NEON load/store gimple lowering and big-endian testisms

2021-11-09 Thread Andre Vieira (lists) via Gcc-patches

Thank you both!

Here is a reworked version; is this OK for trunk?

diff --git a/gcc/config/aarch64/aarch64-builtins.c 
b/gcc/config/aarch64/aarch64-builtins.c
index 
a815e4cfbccab692ca688ba87c71b06c304abbfb..e06131a7c61d31c1be3278dcdccc49c3053c78cb
 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -2485,18 +2485,18 @@ aarch64_general_gimple_fold_builtin (unsigned int 
fcode, gcall *stmt,
  = get_mem_type_for_load_store(fcode);
aarch64_simd_type_info simd_type
  = aarch64_simd_types[mem_type];
-   tree elt_ptr_type = build_pointer_type (simd_type.eltype);
+   tree elt_ptr_type = build_pointer_type_for_mode (simd_type.eltype,
+VOIDmode, true);
tree zero = build_zero_cst (elt_ptr_type);
-   gimple_seq stmts = NULL;
-   tree base = gimple_convert (&stmts, elt_ptr_type,
-   args[0]);
-   if (stmts)
- gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+   /* Use element type alignment.  */
+   tree access_type
+ = build_aligned_type (simd_type.itype,
+   TYPE_ALIGN (simd_type.eltype));
new_stmt
  = gimple_build_assign (gimple_get_lhs (stmt),
 fold_build2 (MEM_REF,
- simd_type.itype,
- base, zero));
+ access_type,
+ args[0], zero));
  }
break;
 
@@ -2507,18 +2507,17 @@ aarch64_general_gimple_fold_builtin (unsigned int 
fcode, gcall *stmt,
  = get_mem_type_for_load_store(fcode);
aarch64_simd_type_info simd_type
  = aarch64_simd_types[mem_type];
-   tree elt_ptr_type = build_pointer_type (simd_type.eltype);
+   tree elt_ptr_type = build_pointer_type_for_mode (simd_type.eltype,
+VOIDmode, true);
tree zero = build_zero_cst (elt_ptr_type);
-   gimple_seq stmts = NULL;
-   tree base = gimple_convert (&stmts, elt_ptr_type,
-   args[0]);
-   if (stmts)
- gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+   /* Use element type alignment.  */
+   tree access_type
+ = build_aligned_type (simd_type.itype,
+   TYPE_ALIGN (simd_type.eltype));
new_stmt
- = gimple_build_assign (fold_build2 (MEM_REF,
-simd_type.itype,
-base,
-zero), args[1]);
+ = gimple_build_assign (fold_build2 (MEM_REF, access_type,
+ args[0], zero),
+args[1]);
  }
break;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c 
b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
index 
adb787a8599af23847dd62dcd153d7cfe43dacc0..c1aeb06e74753052c2ee441b361b92148f1b4b0a
 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
@@ -107,10 +107,12 @@ main (int argc, char **argv)
 
 /* vfma_lane_f64.
vfma_laneq_f64. */
-/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 1 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 { target aarch64_little_endian } } } */
 
 /* vfmaq_lane_f64.
vfmaq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 { target aarch64_little_endian } } } */
 
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c 
b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
index 
865def28c3f4d04042ab495d232bb865cabb2b50..3137ea91e809e37de589091e9bbd43bfe4d221a1
 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
@@ -108,10 +108,12 @@ main (int argc, char **argv)
 
 /* vfms_lane_f64.
vfms_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, 

[AArch64] Fix NEON load/store gimple lowering and big-endian testisms

2021-11-04 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This should address the ubsan bootstrap build and big-endian testisms 
reported against the last NEON load/store gimple lowering patch. I also 
fixed a follow-up issue where the alias information was leading to a bad 
codegen transformation. The NEON intrinsics specifications do not forbid 
the use of memory accesses with different pointer types. In fact you 
will see intrinsic user code loading an int16x8_t vector from an int 
pointer, so we must make sure GCC is aware a NEON memory access of an 
'int' pointer can alias with a 'short' pointer.
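
For illustration, here is my own example (not taken from the patch) of the 
kind of user code described above:

#include <arm_neon.h>

int16x8_t
load_halves (int *p)
{
  /* Valid per the intrinsics spec: the object's declared type is 'int', but
     the access goes through an int16_t pointer, so the lowered MEM_REF must
     be allowed to alias the 'int' object.  */
  return vld1q_s16 ((const int16_t *) p);
}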


Bootstrapped aarch64-linux-gnu (also did an ubsan bootstrap).

Is this OK for trunk?

gcc/ChangeLog:

    * config/aarch64/aarch64-builtins.c 
(aarch64_general_gimple_fold_builtin): Change pointer alignment and alias.


gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/fmla_intrinsic_1.c: Fix big-endian testism.
    * gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
    * gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.
diff --git a/gcc/config/aarch64/aarch64-builtins.c 
b/gcc/config/aarch64/aarch64-builtins.c
index 
a815e4cfbccab692ca688ba87c71b06c304abbfb..fc8fcb02c55e22963d2a3bf77b4749eb5b1c1561
 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -2486,16 +2486,22 @@ aarch64_general_gimple_fold_builtin (unsigned int 
fcode, gcall *stmt,
aarch64_simd_type_info simd_type
  = aarch64_simd_types[mem_type];
tree elt_ptr_type = build_pointer_type (simd_type.eltype);
+   elt_ptr_type = build_distinct_type_copy (elt_ptr_type);
+   TYPE_REF_CAN_ALIAS_ALL (elt_ptr_type) = true;
tree zero = build_zero_cst (elt_ptr_type);
gimple_seq stmts = NULL;
tree base = gimple_convert (&stmts, elt_ptr_type,
args[0]);
+   /* Use element type alignment.  */
+   tree access_type
+ = build_aligned_type (simd_type.itype,
+   TYPE_ALIGN (TREE_TYPE (simd_type.itype)));
if (stmts)
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
new_stmt
  = gimple_build_assign (gimple_get_lhs (stmt),
 fold_build2 (MEM_REF,
- simd_type.itype,
+ access_type,
  base, zero));
  }
break;
@@ -2508,17 +2514,22 @@ aarch64_general_gimple_fold_builtin (unsigned int 
fcode, gcall *stmt,
aarch64_simd_type_info simd_type
  = aarch64_simd_types[mem_type];
tree elt_ptr_type = build_pointer_type (simd_type.eltype);
+   elt_ptr_type = build_distinct_type_copy (elt_ptr_type);
+   TYPE_REF_CAN_ALIAS_ALL (elt_ptr_type) = true;
tree zero = build_zero_cst (elt_ptr_type);
gimple_seq stmts = NULL;
tree base = gimple_convert (&stmts, elt_ptr_type,
args[0]);
+   /* Use element type alignment.  */
+   tree access_type
+ = build_aligned_type (simd_type.itype,
+   TYPE_ALIGN (TREE_TYPE (simd_type.itype)));
if (stmts)
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
new_stmt
- = gimple_build_assign (fold_build2 (MEM_REF,
-simd_type.itype,
-base,
-zero), args[1]);
+ = gimple_build_assign (fold_build2 (MEM_REF, access_type,
+ base, zero),
+args[1]);
  }
break;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c 
b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
index 
adb787a8599af23847dd62dcd153d7cfe43dacc0..c1aeb06e74753052c2ee441b361b92148f1b4b0a
 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
@@ -107,10 +107,12 @@ main (int argc, char **argv)
 
 /* vfma_lane_f64.
vfma_laneq_f64. */
-/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 1 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, 
d\[0-9\]+\, d\[0-9\]+" 2 { target aarch64_little_endian } } } */
 
 /* vfmaq_lane_f64.
vfmaq_laneq_f64.  */
-/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, 
v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times "f

[Aarch64] Fix alignment of neon loads & stores in gimple

2021-10-25 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This fixes the alignment on the memory access type for neon loads & 
stores in the gimple lowering. The ubsan bootstrap on aarch64 builds again 
with this change.
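
To illustrate why element alignment is the right choice (my own example, not 
part of the patch): the intrinsics only guarantee that the pointer is aligned 
to the element type, so describing the lowered access with the vector type's 
16-byte alignment would overstate what we know.

#include <arm_neon.h>

int32x4_t
load_offset (int32_t *p)
{
  /* p + 1 is only guaranteed element (4-byte) alignment, so the lowered
     MEM_REF must not claim the 16-byte alignment of int32x4_t.  */
  return vld1q_s32 (p + 1);
}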



2021-10-25  Andre Vieira  

gcc/ChangeLog:

    * config/aarch64/aarch64-builtins.c 
(aarch64_general_gimple_fold_builtin): Fix memory access

    type alignment.


Is this OK for trunk?

Kind regards,
Andre
diff --git a/gcc/config/aarch64/aarch64-builtins.c 
b/gcc/config/aarch64/aarch64-builtins.c
index 
a815e4cfbccab692ca688ba87c71b06c304abbfb..f5436baf5f8a65c340e05faa491d86a7847c37d3
 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -2490,12 +2490,16 @@ aarch64_general_gimple_fold_builtin (unsigned int 
fcode, gcall *stmt,
gimple_seq stmts = NULL;
tree base = gimple_convert (&stmts, elt_ptr_type,
args[0]);
+   /* Use element type alignment.  */
+   tree access_type
+ = build_aligned_type (simd_type.itype,
+   TYPE_ALIGN (TREE_TYPE (simd_type.itype)));
if (stmts)
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
new_stmt
  = gimple_build_assign (gimple_get_lhs (stmt),
 fold_build2 (MEM_REF,
- simd_type.itype,
+ access_type,
  base, zero));
  }
break;
@@ -2512,13 +2516,16 @@ aarch64_general_gimple_fold_builtin (unsigned int 
fcode, gcall *stmt,
gimple_seq stmts = NULL;
tree base = gimple_convert (&stmts, elt_ptr_type,
args[0]);
+   /* Use element type alignment.  */
+   tree access_type
+ = build_aligned_type (simd_type.itype,
+   TYPE_ALIGN (TREE_TYPE (simd_type.itype)));
if (stmts)
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
new_stmt
- = gimple_build_assign (fold_build2 (MEM_REF,
-simd_type.itype,
-base,
-zero), args[1]);
+ = gimple_build_assign (fold_build2 (MEM_REF, access_type,
+ base, zero),
+args[1]);
  }
break;
 


Re: [PATCH 1v2/3][vect] Add main vectorized loop unrolling

2021-10-20 Thread Andre Vieira (lists) via Gcc-patches

On 15/10/2021 09:48, Richard Biener wrote:

On Tue, 12 Oct 2021, Andre Vieira (lists) wrote:


Hi Richi,

I think this is what you meant, I now hide all the unrolling cost calculations
in the existing target hooks for costs. I did need to adjust 'finish_cost' to
take the loop_vinfo so the target's implementations are able to set the newly
renamed 'suggested_unroll_factor'.

Also added the checks for the epilogue's VF.

Is this more like what you had in mind?

Not exactly (sorry..).  For the target hook I think we don't want to
pass vec_info but instead another output parameter like the existing
ones.

vect_estimate_min_profitable_iters should then via
vect_analyze_loop_costing and vect_analyze_loop_2 report the unroll
suggestion to vect_analyze_loop which should then, if the suggestion
was > 1, instead of iterating to the next vector mode run again
with a fixed VF (old VF times suggested unroll factor - there's
min_vf in vect_analyze_loop_2 which we should adjust to
the old VF times two for example and maybe store the suggested
factor as hint) - if it succeeds the result will end up in the
list of considered modes (where we now may have more than one
entry for the same mode but a different VF), we probably want to
only consider more unrolling once.

For simplicity I'd probably set min_vf = max_vf = old VF * suggested
factor, thus take the targets request literally.

Richard.


Hi,

I now pass an output parameter to finish_cost and route it through the 
various calls up to vect_analyze_loop.  I tried to rework 
vect_determine_vectorization_factor and noticed that merely setting 
min_vf and max_vf is not enough; we only use these to check whether the 
vectorization factor is within range (well, actually we only use max_vf 
at that stage). We only seem to use 'min_vf' to make sure the 
data_references are valid.  I am not sure my changes are the most 
appropriate here, for instance I am pretty sure the checks for max and 
min vf I added in vect_determine_vectorization_factor are currently 
superfluous as they will pass by design, but thought they might be good 
future proofing?


Also I changed how we compare against max_vf: rather than relying on the 
'MAX_VECTORIZATION' I decided to use estimated_poly_value with 
POLY_VALUE_MAX, to be able to bound it further in case we have knowledge 
of the VL. I am not entirely sure about the validity of this change; maybe 
we are better off keeping the MAX_VECTORIZATION in place and not making any 
changes to max_vf for unrolling.


What do you think?
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
36519ccc5a58abab483c38d0a6c5f039592bfc7f..9b1e01e9b62050d7e34bc55454771e40bdbdb4cb
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15972,8 +15972,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, 
unsigned int body_cost)
 
 /* Implement TARGET_VECTORIZE_FINISH_COST.  */
 static void
-aarch64_finish_cost (void *data, unsigned *prologue_cost,
-unsigned *body_cost, unsigned *epilogue_cost)
+aarch64_finish_cost (void *data, unsigned *prologue_cost, unsigned *body_cost,
+unsigned *epilogue_cost, unsigned *suggested_unroll_factor)
 {
  auto *costs = static_cast<aarch64_vector_costs *> (data);
   *prologue_cost = costs->region[vect_prologue];
@@ -15984,6 +15984,9 @@ aarch64_finish_cost (void *data, unsigned 
*prologue_cost,
   && costs->vec_flags
   && aarch64_use_new_vector_costs_p ())
 *body_cost = aarch64_adjust_body_cost (costs, *body_cost);
+
+  if(suggested_unroll_factor)
+*suggested_unroll_factor = 1;
 }
 
 /* Implement TARGET_VECTORIZE_DESTROY_COST_DATA.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 
afc2674d49da370ae0f5ef277df7e9954f303b8e..a48e43879512793907fef946c1575c3ed7f68092
 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -23048,13 +23048,15 @@ ix86_add_stmt_cost (class vec_info *vinfo, void 
*data, int count,
 /* Implement targetm.vectorize.finish_cost.  */
 
 static void
-ix86_finish_cost (void *data, unsigned *prologue_cost,
- unsigned *body_cost, unsigned *epilogue_cost)
+ix86_finish_cost (void *data, unsigned *prologue_cost, unsigned *body_cost,
+ unsigned *epilogue_cost, unsigned *suggested_unroll_factor)
 {
   unsigned *cost = (unsigned *) data;
   *prologue_cost = cost[vect_prologue];
   *body_cost = cost[vect_body];
   *epilogue_cost = cost[vect_epilogue];
+  if (suggested_unroll_factor)
+*suggested_unroll_factor = 1;
 }
 
 /* Implement targetm.vectorize.destroy_cost_data.  */
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 
ad81dfb316dff00cde810d6b1edd31fa49d5c1e8..59d30ad6fcd1758383c52e34a0f90a126c501ec3
 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5551,8 +5551,8 @@ rs6000_adjust_vect_cost_per_loop (rs6000_cost_data 

Re: FW: [PING] Re: [Patch][GCC][middle-end] - Generate FRINTZ for (double)(int) under -ffast-math on aarch64

2021-10-20 Thread Andre Vieira (lists) via Gcc-patches


On 19/10/2021 00:22, Joseph Myers wrote:

On Fri, 15 Oct 2021, Richard Biener via Gcc-patches wrote:


On Fri, Sep 24, 2021 at 2:59 PM Jirui Wu via Gcc-patches wrote:

Hi,

Ping: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577846.html

The patch is attached as text for ease of use. Is there anything that needs to 
change?

Ok for master? If OK, can it be committed for me, I have no commit rights.

I'm still not sure about the correctness.  I suppose the
flag_fp_int_builtin_inexact && !flag_trapping_math is supposed to guard
against spurious inexact exceptions, shouldn't that be
!flag_fp_int_builtin_inexact || !flag_trapping_math instead?

The following remarks may be relevant here, but are not intended as an
assertion of what is correct in this case.

1. flag_fp_int_builtin_inexact is the more permissive case ("inexact" may
or may not be raised).  All existing uses in back ends are
"flag_fp_int_builtin_inexact || !flag_trapping_math" or equivalent.

2. flag_fp_int_builtin_inexact only applies to certain built-in functions
(as listed in invoke.texi).  It's always unspecified, even in C2X, whether
casts of non-integer values from floating-point to integer types raise
"inexact".  So flag_fp_int_builtin_inexact should not be checked in insn
patterns corresponding to simple casts from floating-point to integer,
only in insn patterns corresponding to the built-in functions listed for
-fno-fp-int-builtin-inexact in invoke.texi (or for operations that combine
such a built-in function with a cast of the *result* to integer type).

Hi,

I agree with Joseph, I don't think we should be checking 
flag_fp_int_builtin_inexact here because we aren't transforming the math 
function 'trunc', but rather a piece of C-code that has trunc-like 
semantics.


As for flag_trapping_math, its definition says 'Assume floating point 
operations can trap'. I assume IFN_TRUNC would not trap, since I don't 
think IFN_TRUNC will preserve the overflow behaviour, in the cases where 
the FP value is bigger than the intermediate integer type range. So I 
think we should prevent the transformation if we are assuming the FP 
instructions can trap.


If we don't assume the FP instructions can trap, then I think it's fine 
to ignore the overflow as this behavior is undefined in C.
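
To make the overflow concern concrete, here is my own example (not part of 
the patch):

double
f (double d)
{
  int y = d;          /* Undefined behaviour if d is out of 'int' range; the
                         conversion instruction may raise the FP
                         invalid-operation exception.  */
  return (double) y;  /* Folding the pair into IFN_TRUNC just truncates d
                         towards zero and raises no such exception.  */
}

So the transformation only changes observable behaviour in the undefined 
out-of-range case, and gating it on !flag_trapping_math covers anyone who 
relies on the exception being raised there.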


Also changed the comment. Slightly different to your suggestion Richard, 
in an attempt to be more generic. Do you still have concerns regarding 
the checks?


Kind regards,
Andre

diff --git a/gcc/match.pd b/gcc/match.pd
index 
3ff15bc0de5aba45ade94ca6e47e01fad9a2a314..5bed2e12715ea213813ef8b84fd420475b04d201
 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3606,6 +3606,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 >= inside_prec - !inside_unsignedp)
  (convert @0)))
 
+/* (float_type)(integer_type) x -> trunc (x) if the type of x matches
+   float_type.  Only do the transformation if we do not need to preserve
+   trapping behaviour, so require !flag_trapping_math. */
+#if GIMPLE
+(simplify
+   (float (fix_trunc @0))
+   (if (!flag_trapping_math
+   && types_match (type, TREE_TYPE (@0))
+   && direct_internal_fn_supported_p (IFN_TRUNC, type,
+ OPTIMIZE_FOR_BOTH))
+  (IFN_TRUNC @0)))
+#endif
+
 /* If we have a narrowing conversion to an integral type that is fed by a
BIT_AND_EXPR, we might be able to remove the BIT_AND_EXPR if it merely
masks off bits outside the final type (and nothing else).  */
diff --git a/gcc/testsuite/gcc.target/aarch64/merge_trunc1.c 
b/gcc/testsuite/gcc.target/aarch64/merge_trunc1.c
new file mode 100644
index 
..07217064e2ba54fcf4f5edc440e6ec19ddae66e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/merge_trunc1.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math" } */
+
+float
+f1 (float x)
+{
+  int y = x;
+
+  return (float) y;
+}
+
+double
+f2 (double x)
+{
+  long y = x;
+
+  return (double) y;
+}
+
+float
+f3 (double x)
+{
+  int y = x;
+
+  return (float) y;
+}
+
+double
+f4 (float x)
+{
+  int y = x;
+
+  return (double) y;
+}
+
+/* { dg-final { scan-assembler "frintz\\ts\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "frintz\\td\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tw\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "scvtf\\ts\[0-9\]+, w\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "scvtf\\td\[0-9\]+, w\[0-9\]+" } } */


Re: [Patch][GCC][middle-end] - Lower store and load neon builtins to gimple

2021-10-20 Thread Andre Vieira (lists) via Gcc-patches

On 27/09/2021 12:54, Richard Biener via Gcc-patches wrote:

On Mon, 27 Sep 2021, Jirui Wu wrote:


Hi all,

I now use the type based on the specification of the intrinsic
instead of type based on formal argument.

I use signed int vector types because the outputs of the neon builtins
that I am lowering are always signed. In addition, fcode and stmt
do not have information on whether the result is signed.

Because I am replacing the stmt with new_stmt,
a VIEW_CONVERT_EXPR cast is already in the code if needed.
As a result, the result assembly code is correct.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master? If OK can it be committed for me, I have no commit rights.

+   tree temp_lhs = gimple_call_lhs (stmt);
+   aarch64_simd_type_info simd_type
+ = aarch64_simd_types[mem_type];
+   tree elt_ptr_type = build_pointer_type (simd_type.eltype);
+   tree zero = build_zero_cst (elt_ptr_type);
+   gimple_seq stmts = NULL;
+   tree base = gimple_convert (&stmts, elt_ptr_type,
+   args[0]);
+   new_stmt = gimple_build_assign (temp_lhs,
+fold_build2 (MEM_REF,
+TREE_TYPE (temp_lhs),
+base,
+zero));

this now uses the alignment info as on the LHS of the call by using
TREE_TYPE (temp_lhs) as type of the MEM_REF.  So for example

  typedef int foo __attribute__((vector_size(N),aligned(256)));

  foo tem = ld1 (ptr);

will now access *ptr as if it were aligned to 256 bytes.  But I'm sure
the ld1 intrinsic documents the required alignment (either it's the
natural alignment of the vector type loaded or element alignment?).

For element alignment you'd do sth like

   tree access_type = build_aligned_type (vector_type, TYPE_ALIGN
(TREE_TYPE (vector_type)));

for example.

Richard.

Hi,

I'm taking over this patch from Jirui.

I've decided to use the vector type stored in aarch64_simd_type_info, 
since that should always have the correct alignment.


To be fair though, I do wonder whether this is actually needed as is 
right now, since the way we cast the inputs and outputs of these 
__builtins in arm_neon.h prevents these issues I think, but it is more 
future proof. Also you could argue people could use the __builtins 
directly, though I'd think that would be at their own risk.
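
For reference, the kind of casting in arm_neon.h I am referring to looks 
roughly like this (written from memory, so possibly not verbatim):

__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32 (const uint32_t *__a)
{
  /* Both the input pointer and the result are cast, so user code never
     feeds the __builtin a pointer or vector of an unexpected type
     directly.  */
  return (uint32x4_t)
    __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a);
}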


Is this OK?

Kind regards,
Andre

diff --git a/gcc/config/aarch64/aarch64-builtins.c 
b/gcc/config/aarch64/aarch64-builtins.c
index 
1a507ea59142d0b5977b0167abfe9a58a567adf7..a815e4cfbccab692ca688ba87c71b06c304abbfb
 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -46,6 +46,7 @@
 #include "emit-rtl.h"
 #include "stringpool.h"
 #include "attribs.h"
+#include "gimple-fold.h"
 
 #define v8qi_UP  E_V8QImode
 #define v4hi_UP  E_V4HImode
@@ -2399,11 +2400,65 @@ aarch64_general_fold_builtin (unsigned int fcode, tree 
type,
   return NULL_TREE;
 }
 
+enum aarch64_simd_type
+get_mem_type_for_load_store (unsigned int fcode)
+{
+  switch (fcode)
+  {
+VAR1 (LOAD1, ld1 , 0, LOAD, v8qi)
+VAR1 (STORE1, st1 , 0, STORE, v8qi)
+  return Int8x8_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v16qi)
+VAR1 (STORE1, st1 , 0, STORE, v16qi)
+  return Int8x16_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v4hi)
+VAR1 (STORE1, st1 , 0, STORE, v4hi)
+  return Int16x4_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v8hi)
+VAR1 (STORE1, st1 , 0, STORE, v8hi)
+  return Int16x8_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v2si)
+VAR1 (STORE1, st1 , 0, STORE, v2si)
+  return Int32x2_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v4si)
+VAR1 (STORE1, st1 , 0, STORE, v4si)
+  return Int32x4_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v2di)
+VAR1 (STORE1, st1 , 0, STORE, v2di)
+  return Int64x2_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v4hf)
+VAR1 (STORE1, st1 , 0, STORE, v4hf)
+  return Float16x4_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v8hf)
+VAR1 (STORE1, st1 , 0, STORE, v8hf)
+  return Float16x8_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v4bf)
+VAR1 (STORE1, st1 , 0, STORE, v4bf)
+  return Bfloat16x4_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v8bf)
+VAR1 (STORE1, st1 , 0, STORE, v8bf)
+  return Bfloat16x8_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v2sf)
+VAR1 (STORE1, st1 , 0, STORE, v2sf)
+  return Float32x2_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v4sf)
+VAR1 (STORE1, st1 , 0, STORE, v4sf)
+  return Float32x4_t;
+VAR1 (LOAD1, ld1 , 0, LOAD, v2df)
+VAR1 (STORE1, st1 , 0, STORE, v2df)
+  return Float64x2_t;
+default:
+  gcc_unreachable ();
+  break;
+  }
+}
+
 /* Try to fold STMT, given that it's a call to the built-in function with
subcode FCODE.  Return the new statement on success and null on
failure.  */
 gimple *
-aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt)
+aarch64_general_gimple_fol

Re: [PATCH 2/3][vect] Consider outside costs earlier for epilogue loops

2021-10-14 Thread Andre Vieira (lists) via Gcc-patches

Hi,

I completely forgot I still had this patch out as well, I grouped it 
together with the unrolling because it was what motivated the change, 
but it is actually more widely applicable and can be reviewed separately.


On 17/09/2021 16:32, Andre Vieira (lists) via Gcc-patches wrote:

Hi,

This patch changes the order in which we check outside and inside 
costs for epilogue loops, this is to ensure that a predicated epilogue 
is more likely to be picked over an unpredicated one, since it saves 
having to enter a scalar epilogue loop.


gcc/ChangeLog:

    * tree-vect-loop.c (vect_better_loop_vinfo_p): Change how 
epilogue loop costs are compared.


Re: [arm] Fix MVE addressing modes for VLDR[BHW] and VSTR[BHW]

2021-10-13 Thread Andre Vieira (lists) via Gcc-patches



On 13/10/2021 13:37, Kyrylo Tkachov wrote:

Hi Andre,


@@ -24276,7 +24271,7 @@ arm_print_operand (FILE *stream, rtx x, int code)
else if (code == POST_MODIFY || code == PRE_MODIFY)
  {
asm_fprintf (stream, "[%r", REGNO (XEXP (addr, 0)));
-   postinc_reg = XEXP ( XEXP (x, 1), 1);
+   postinc_reg = XEXP (XEXP (addr, 1), 1);
if (postinc_reg && CONST_INT_P (postinc_reg))
  {
if (code == POST_MODIFY)

this looks like a bug fix that should be separately backported to the branches?
Otherwise, the patch looks ok for trunk to me.
Thanks,
Kyrill

Normally I'd agree with you, but this is specific to the 'E' handling, 
which is MVE only, and I am pretty sure the existing code would never 
accept POST/PRE_MODIFY codes, so this issue will never trigger before my 
patch. So I'm not sure it's useful to backport a bugfix for a bug that 
won't trigger, unless we also backport the entire patch, but I suspect 
we don't want to do that?




[arm] Fix MVE addressing modes for VLDR[BHW] and VSTR[BHW]

2021-10-12 Thread Andre Vieira (lists) via Gcc-patches

Hi,

The way we were previously dealing with addressing modes for MVE was
preventing the use of pre, post and offset addressing modes for the normal
loads and stores, including widening and narrowing.  This patch fixes that
and adds tests to ensure we are capable of using all the available
addressing modes.
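
As an illustration of what this enables (my own example, not one of the new 
tests), a simple copy loop written with the MVE intrinsics should now be able 
to use post-increment addressing for the VLDRW/VSTRW instead of separate 
address arithmetic, when built with something like 
-O2 -march=armv8.1-m.main+mve -mfloat-abi=hard:

#include <arm_mve.h>

void
copy_q (int32_t *dst, const int32_t *src, int n)
{
  for (int i = 0; i < n; i += 4)
    {
      /* Load and store four elements per iteration; the pointer update can
         now be folded into the memory access.  */
      int32x4_t v = vld1q_s32 (&src[i]);
      vst1q_s32 (&dst[i], v);
    }
}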

gcc/ChangeLog:
2021-10-12  Andre Vieira  

    * config/arm/arm.c (thumb2_legitimate_address_p): Use 
VALID_MVE_MODE

    when checking mve addressing modes.
    (mve_vector_mem_operand): Fix the way we handle pre, post and 
offset

    addressing modes.
    (arm_print_operand): Fix printing of POST_ and PRE_MODIFY.
    * config/arm/mve.md: Use mve_memory_operand predicate 
everywhere where

    there is a single Ux constraint.

gcc/testsuite/ChangeLog:
2021-10-12  Andre Vieira  

    * gcc.target/arm/mve/mve.exp: Make it test main directory.
    * gcc.target/arm/mve/mve_load_memory_modes.c: New test.
    * gcc.target/arm/mve/mve_store_memory_modes.c: New test.
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
6c6e77fab666f4aeff023b1f949e3ca0a3545658..d921261633aeff4f92a2e1a6057b00b685dea892
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -8530,8 +8530,7 @@ thumb2_legitimate_address_p (machine_mode mode, rtx x, 
int strict_p)
   bool use_ldrd;
   enum rtx_code code = GET_CODE (x);
 
-  if (TARGET_HAVE_MVE
-  && (mode == V8QImode || mode == E_V4QImode || mode == V4HImode))
+  if (TARGET_HAVE_MVE && VALID_MVE_MODE (mode))
 return mve_vector_mem_operand (mode, x, strict_p);
 
   if (arm_address_register_rtx_p (x, strict_p))
@@ -13433,53 +13432,49 @@ mve_vector_mem_operand (machine_mode mode, rtx op, 
bool strict)
   || code == PRE_INC || code == POST_DEC)
 {
   reg_no = REGNO (XEXP (op, 0));
-  return (((mode == E_V8QImode || mode == E_V4QImode || mode == E_V4HImode)
-  ? reg_no <= LAST_LO_REGNUM
-  :(reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM))
- || (!strict && reg_no >= FIRST_PSEUDO_REGISTER));
-}
-  else if ((code == POST_MODIFY || code == PRE_MODIFY)
-  && GET_CODE (XEXP (op, 1)) == PLUS && REG_P (XEXP (XEXP (op, 1), 1)))
+  return ((mode == E_V8QImode || mode == E_V4QImode || mode == E_V4HImode)
+ ? reg_no <= LAST_LO_REGNUM
+ :(reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM))
+   || reg_no >= FIRST_PSEUDO_REGISTER;
+}
+  else if (((code == POST_MODIFY || code == PRE_MODIFY)
+   && GET_CODE (XEXP (op, 1)) == PLUS
+   && XEXP (op, 0) == XEXP (XEXP (op, 1), 0)
+   && REG_P (XEXP (op, 0))
+   && GET_CODE (XEXP (XEXP (op, 1), 1)) == CONST_INT)
+  /* Make sure to only accept PLUS after reload_completed, otherwise
+ this will interfere with auto_inc's pattern detection.  */
+  || (reload_completed && code == PLUS && REG_P (XEXP (op, 0))
+  && GET_CODE (XEXP (op, 1)) == CONST_INT))
 {
   reg_no = REGNO (XEXP (op, 0));
-  val = INTVAL (XEXP ( XEXP (op, 1), 1));
+  if (code == PLUS)
+   val = INTVAL (XEXP (op, 1));
+  else
+   val = INTVAL (XEXP(XEXP (op, 1), 1));
+
   switch (mode)
{
  case E_V16QImode:
-   if (abs (val) <= 127)
- return ((reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM)
- || (!strict && reg_no >= FIRST_PSEUDO_REGISTER));
-   return FALSE;
- case E_V8HImode:
- case E_V8HFmode:
-   if (abs (val) <= 255)
- return ((reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM)
- || (!strict && reg_no >= FIRST_PSEUDO_REGISTER));
-   return FALSE;
  case E_V8QImode:
  case E_V4QImode:
if (abs (val) <= 127)
- return (reg_no <= LAST_LO_REGNUM
- || (!strict && reg_no >= FIRST_PSEUDO_REGISTER));
+ return (reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM)
+   || reg_no >= FIRST_PSEUDO_REGISTER;
return FALSE;
+ case E_V8HImode:
+ case E_V8HFmode:
  case E_V4HImode:
  case E_V4HFmode:
if (val % 2 == 0 && abs (val) <= 254)
- return (reg_no <= LAST_LO_REGNUM
- || (!strict && reg_no >= FIRST_PSEUDO_REGISTER));
+ return reg_no <= LAST_LO_REGNUM
+   || reg_no >= FIRST_PSEUDO_REGISTER;
return FALSE;
  case E_V4SImode:
  case E_V4SFmode:
if (val % 4 == 0 && abs (val) <= 508)
- return ((reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM)
- || (!strict && reg_no >= FIRST_PSEUDO_REGISTER)

Re: [PATCH 1v2/3][vect] Add main vectorized loop unrolling

2021-10-12 Thread Andre Vieira (lists) via Gcc-patches

Hi Richi,

I think this is what you meant, I now hide all the unrolling cost 
calculations in the existing target hooks for costs. I did need to 
adjust 'finish_cost' to take the loop_vinfo so the target's 
implementations are able to set the newly renamed 'suggested_unroll_factor'.


Also added the checks for the epilogue's VF.

Is this more like what you had in mind?


gcc/ChangeLog:

    * config/aarch64/aarch64.c (aarch64_finish_cost): Add class 
vec_info parameter.

    * config/i386/i386.c (ix86_finish_cost): Likewise.
    * config/rs6000/rs6000.c (rs6000_finish_cost): Likewise.
    * doc/tm.texi: Document changes to TARGET_VECTORIZE_FINISH_COST.
    * target.def: Add class vec_info parameter to finish_cost.
    * targhooks.c (default_finish_cost): Likewise.
    * targhooks.h (default_finish_cost): Likewise.
    * tree-vect-loop.c (vect_determine_vectorization_factor): Use 
suggested_unroll_factor

    to increase vectorization_factor if possible.
    (_loop_vec_info::_loop_vec_info): Add suggested_unroll_factor 
member.
    (vect_compute_single_scalar_iteration_cost): Adjust call to 
finish_cost.
    (vect_determine_partial_vectors_and_peeling): Ensure unrolled 
loop is not predicated.

    (vect_determine_unroll_factor): New.
    (vect_try_unrolling): New.
    (vect_reanalyze_as_main_loop): Also try to unroll when 
reanalyzing as main loop.
    (vect_analyze_loop): Add call to vect_try_unrolling and check 
to ensure epilogue
    is either a smaller VF than main loop or uses partial vectors 
and might be of equal

    VF.
    (vect_estimate_min_profitable_iters): Adjust call to finish_cost.
    (vectorizable_reduction): Make sure to not use 
single_defuse_cyle when unrolling.
    * tree-vect-slp.c (vect_bb_vectorization_profitable_p): Adjust 
call to finish_cost.
    * tree-vectorizer.h (finish_cost): Change to pass new class 
vec_info parameter.


On 01/10/2021 09:19, Richard Biener wrote:

On Thu, 30 Sep 2021, Andre Vieira (lists) wrote:


Hi,



That just forces trying the vector modes we've tried before. Though I might
need to revisit this now I think about it. I'm afraid it might be possible
for
this to generate an epilogue with a vf that is not lower than that of the
main
loop, but I'd need to think about this again.

Either way I don't think this changes the vector modes used for the
epilogue.
But maybe I'm just missing your point here.

Yes, I was refering to the above which suggests that when we vectorize
the main loop with V4SF but unroll then we try vectorizing the
epilogue with V4SF as well (but not unrolled).  I think that's
premature (not sure if you try V8SF if the main loop was V4SF but
unrolled 4 times).

My main motivation for this was because I had a SVE loop that vectorized with
both VNx8HI, then V8HI which beat VNx8HI on cost, then it decided to unroll
V8HI by two and skipped using VNx8HI as a predicated epilogue which would've
been the best choice.

I see, yes - for fully predicated epilogues it makes sense to consider
the same vector mode as for the main loop anyways (independent on
whether we're unrolling or not).  One could argue that with an
unrolled V4SImode main loop a predicated V8SImode epilogue would also
be a good match (but then somehow costing favored the unrolled V4SI
over the V8SI for the main loop...).


So that is why I decided to just 'reset' the vector_mode selection. In a
scenario where you only have the traditional vector modes it might make less
sense.

Just realized I still didn't add any check to make sure the epilogue has a
lower VF than the previous loop, though I'm still not sure that could happen.
I'll go look at where to add that if you agree with this.

As said above, it only needs a lower VF in case the epilogue is not
fully masked - otherwise the same VF would be OK.


I can move it there, it would indeed remove the need for the change to
vect_update_vf_for_slp, the change to
vect_determine_partial_vectors_and_peeling would still be required I think.
It
is meant to disable using partial vectors in an unrolled loop.

Why would we disable the use of partial vectors in an unrolled loop?

The motivation behind that is that the overhead caused by generating
predicates for each iteration will likely be too much for it to be profitable
to unroll. On top of that, when dealing with low iteration count loops, if
executing one predicated iteration would be enough we now still need to
execute all other unrolled predicated iterations, whereas if we keep them
unrolled we skip the unrolled loops.

OK, I guess we're not factoring in costs when deciding on predication
but go for it if it's gernally enabled and possible.

With the proposed scheme we'd then cost the predicated not unrolled
loop against a not predicated unrolled loop which might be a bit
apples vs. o

Re: [PATCH 1v2/3][vect] Add main vectorized loop unrolling

2021-09-30 Thread Andre Vieira (lists) via Gcc-patches

Hi,



That just forces trying the vector modes we've tried before. Though I might
need to revisit this now I think about it. I'm afraid it might be possible for
this to generate an epilogue with a vf that is not lower than that of the main
loop, but I'd need to think about this again.

Either way I don't think this changes the vector modes used for the epilogue.
But maybe I'm just missing your point here.

Yes, I was refering to the above which suggests that when we vectorize
the main loop with V4SF but unroll then we try vectorizing the
epilogue with V4SF as well (but not unrolled).  I think that's
premature (not sure if you try V8SF if the main loop was V4SF but
unrolled 4 times).


My main motivation for this was that I had an SVE loop that vectorized 
with both VNx8HI, then V8HI which beat VNx8HI on cost, then it decided 
to unroll V8HI by two and skipped using VNx8HI as a predicated epilogue 
which would've been the best choice.


So that is why I decided to just 'reset' the vector_mode selection. In a 
scenario where you only have the traditional vector modes it might make 
less sense.


Just realized I still didn't add any check to make sure the epilogue has 
a lower VF than the previous loop, though I'm still not sure that could 
happen. I'll go look at where to add that if you agree with this.



I can move it there, it would indeed remove the need for the change to
vect_update_vf_for_slp, the change to
vect_determine_partial_vectors_and_peeling would still be required I think. It
is meant to disable using partial vectors in an unrolled loop.

Why would we disable the use of partial vectors in an unrolled loop?
The motivation behind that is that the overhead caused by generating 
predicates for each iteration will likely be too much for it to be 
profitable to unroll. On top of that, when dealing with low iteration 
count loops, if executing one predicated iteration would be enough we 
now still need to execute all other unrolled predicated iterations, 
whereas if we keep them unrolled we skip the unrolled loops.

Sure but I'm suggesting you keep the not unrolled body as one way of
costed vectorization but then if the target says "try unrolling"
re-do the analysis with the same mode but a larger VF.  Just like
we iterate over vector modes you'll now iterate over pairs of
vector mode + VF (unroll factor).  It's not about re-using the costing
it's about using costing that is actually relevant and also to avoid
targets inventing two distinct separate costings - a target (powerpc)
might already compute load/store density and other stuff for the main
costing so it should have an idea whether doubling or triplicating is OK.

Richard.
Sounds good! I changed the patch to determine the unrolling factor 
later, after all analysis has been done, and to retry analysis if an 
unrolling factor larger than 1 has been chosen for this loop and 
vector_mode.


gcc/ChangeLog:

    * doc/tm.texi: Document TARGET_VECTORIZE_UNROLL_FACTOR.
    * doc/tm.texi.in: Add entries for TARGET_VECTORIZE_UNROLL_FACTOR.
    * params.opt: Add vect-unroll and vect-unroll-reductions 
parameters.

    * target.def: Define hook TARGET_VECTORIZE_UNROLL_FACTOR.
    * targhooks.c (default_unroll_factor): New.
    * targhooks.h (default_unroll_factor): Likewise.
    * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
    par_unrolling_factor.
    (vect_determine_partial_vectors_and_peeling): Account for 
unrolling.

    (vect_determine_unroll_factor): New.
    (vect_try_unrolling): New.
    (vect_reanalyze_as_main_loop): Call vect_try_unrolling when
    retrying a loop_vinfo as a main loop.
    (vect_analyze_loop): Call vect_try_unrolling when vectorizing 
main loops.
    (vect_analyze_loop): Allow for epilogue vectorization when 
unrolling

    and rewalk vector_mode warray for the epilogues.
    (vectorizable_reduction): Disable single_defuse_cycle when 
unrolling.
    * tree-vectorizer.h (vect_unroll_value): Declare 
par_unrolling_factor

    as a member of loop_vec_info.
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 
be8148583d8571b0d035b1938db9d056bfd213a8..71ee33a200fcbd37ccd5380321df507ae1e8961f
 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6289,6 +6289,12 @@ allocated by TARGET_VECTORIZE_INIT_COST.  The default 
releases the
 accumulator.
 @end deftypefn
 
+@deftypefn {Target Hook} unsigned TARGET_VECTORIZE_UNROLL_FACTOR (class 
vec_info *@var{vinfo})
+This hook should return the desired vector unrolling factor for a loop with
+@var{vinfo}. The default returns one, which means no unrolling will be
+performed.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree 
@var{mem_vectype}, const_tree @var{index_type}, int @var{scale})
 Target builtin that implements vector gather operation.  @var{mem_vectype}
 is the vector type of the load and @var{index_type} is scalar type of
diff --git a/gcc/doc

Re: [PATCH 1/3][vect] Add main vectorized loop unrolling

2021-09-21 Thread Andre Vieira (lists) via Gcc-patches

Hi Richi,

Thanks for the review, see below some questions.

On 21/09/2021 13:30, Richard Biener wrote:

On Fri, 17 Sep 2021, Andre Vieira (lists) wrote:


Hi all,

This patch adds the ability to define a target hook to unroll the main
vectorized loop. It also introduces --param's vect-unroll and
vect-unroll-reductions to control this through a command-line. I found this
useful to experiment and believe can help when tuning, so I decided to leave
it in.
We only unroll the main loop and have disabled unrolling epilogues for now. We
also do not support unrolling of any loop that has a negative step and we do
not support unrolling a loop with any reduction other than a
TREE_CODE_REDUCTION.

Bootstrapped and regression tested on aarch64-linux-gnu as part of the series.

I wonder why we want to change the vector modes used for the epilogue,
we're either making it more likely to need to fall through to the
scalar epilogue or require another vectorized epilogue.
I don't quite understand what you mean by change the vector modes for 
the epilogue. I don't think we do.

If you are referring to:
      /* If we are unrolling, try all VECTOR_MODES for the epilogue.  */
      if (loop_vinfo->par_unrolling_factor > 1)
        {
      next_vector_mode = vector_modes[0];
      mode_i = 1;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                 "* Re-trying analysis with vector mode"
                 " %s for epilogue with partial vectors.\n",
                 GET_MODE_NAME (next_vector_mode));
      continue;
        }

That just forces trying the vector modes we've tried before. Though I 
might need to revisit this now I think about it. I'm afraid it might be 
possible for this to generate an epilogue with a vf that is not lower 
than that of the main loop, but I'd need to think about this again.


Either way I don't think this changes the vector modes used for the 
epilogue. But maybe I'm just missing your point here.

That said, for simplicity I'd only change the VF of the main loop.

There I wonder why you need to change vect_update_vf_for_slp or
vect_determine_partial_vectors_and_peeling and why it's not enough
to adjust the VF in a single place, I'd do that here:

   /* This is the point where we can re-start analysis with SLP forced off.
*/
start_over:

   /* Now the vectorization factor is final.  */
   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   gcc_assert (known_ne (vectorization_factor, 0U));

>  call vect_update_vf_for_unroll ()
I can move it there; it would indeed remove the need for the change to 
vect_update_vf_for_slp. The change to 
vect_determine_partial_vectors_and_peeling would still be required I 
think; it is meant to disable using partial vectors in an unrolled loop.

note there's also loop->unroll (from #pragma GCC unroll) which we
could include in what you look at in vect_unroll_value.

I don't like add_stmt_cost_for_unroll - how should a target go
and decide based on what it is fed?  You could as well feed it
the scalar body or the vinfo so it can get a shot at all
the vectorizers meta data - but feeding it individual stmt_infos
does not add any meaningful abstraction and thus what's the
point?
I am still working on tuning our backend hook, but the way it works is 
it estimates how many load, store and general ops are required for the 
vectorized loop based on these.

I _think_ what would make some sense is when we actually cost
the vector body (with the not unrolled VF) ask the target
"well, how about unrolling this?" because there it has the
chance to look at the actual vector stmts produced (in "cost form").
And if the target answers "yeah - go ahead and try x4" we signal
that to the iteration and have "mode N with x4 unroll" validated and
costed.

So instead of a new target hook amend the finish_cost hook to
produce a suggested unroll value and cost both the unrolled and
not unrolled body.

Sorry for steering in a different direction ;)
The reason we decided to do this early and not after cost is because 
'vect_prune_runtime_alias_test_list' and 
'vect_enhance_data_refs_alignment' require the VF and if you suddenly 
raise that the alias analysis could become invalid.


An initial implementation did do it later, for that very reason, so that we 
could reuse the cost calculations; AArch64 already computed these 
'ops' after Richard Sandiford's patches.

But yeah ... the above kinda led me to rewrite it this way.



Thanks,
Richard.




gcc/ChangeLog:

     * doc/tm.texi: Document TARGET_VECTORIZE_UNROLL_FACTOR
     and TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL.
     * doc/tm.texi.in: Add entries for target hooks above.
     * params.opt: Add vect-unroll and vect-unroll-reductions
parameters.
   

[PATCH 2/3][vect] Consider outside costs earlier for epilogue loops

2021-09-17 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch changes the order in which we check outside and inside costs 
for epilogue loops; this is to ensure that a predicated epilogue is more 
likely to be picked over an unpredicated one, since it saves having to 
enter a scalar epilogue loop.


gcc/ChangeLog:

    * tree-vect-loop.c (vect_better_loop_vinfo_p): Change how 
epilogue loop costs are compared.
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
14f8150d7c262b9422784e0e997ca4387664a20a..038af13a91d43c9f09186d042cf415020ea73a38
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2881,17 +2881,75 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
return new_simdlen_p;
 }
 
+  loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
+  if (main_loop)
+{
+  poly_uint64 main_poly_vf = LOOP_VINFO_VECT_FACTOR (main_loop);
+  unsigned HOST_WIDE_INT main_vf;
+  unsigned HOST_WIDE_INT old_factor, new_factor, old_cost, new_cost;
+  /* If we can determine how many iterations are left for the epilogue
+loop, that is if both the main loop's vectorization factor and number
+of iterations are constant, then we use them to calculate the cost of
+the epilogue loop together with a 'likely value' for the epilogues
+vectorization factor.  Otherwise we use the main loop's vectorization
+factor and the maximum poly value for the epilogue's.  If the target
+has not provided with a sensible upper bound poly vectorization
+factors are likely to be favored over constant ones.  */
+  if (main_poly_vf.is_constant (&main_vf)
+ && LOOP_VINFO_NITERS_KNOWN_P (main_loop))
+   {
+ unsigned HOST_WIDE_INT niters
+   = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
+ HOST_WIDE_INT old_likely_vf
+   = estimated_poly_value (old_vf, POLY_VALUE_LIKELY);
+ HOST_WIDE_INT new_likely_vf
+   = estimated_poly_value (new_vf, POLY_VALUE_LIKELY);
+
+ /* If the epilogue is using partial vectors we account for the
+partial iteration here too.  */
+ old_factor = niters / old_likely_vf;
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (old_loop_vinfo)
+ && niters % old_likely_vf != 0)
+   old_factor++;
+
+ new_factor = niters / new_likely_vf;
+ if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (new_loop_vinfo)
+ && niters % new_likely_vf != 0)
+   new_factor++;
+   }
+  else
+   {
+ unsigned HOST_WIDE_INT main_vf_max
+   = estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
+
+ old_factor = main_vf_max / estimated_poly_value (old_vf,
+  POLY_VALUE_MAX);
+ new_factor = main_vf_max / estimated_poly_value (new_vf,
+  POLY_VALUE_MAX);
+
+ /* If the loop is not using partial vectors then it will iterate one
+time less than one that does.  It is safe to subtract one here,
+because the main loop's vf is always at least 2x bigger than that
+of an epilogue.  */
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (old_loop_vinfo))
+   old_factor -= 1;
+ if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (new_loop_vinfo))
+   new_factor -= 1;
+   }
+
+  /* Compute the costs by multiplying the inside costs with the factor and
+add the outside costs for a more complete picture.  The factor is the
+amount of times we are expecting to iterate this epilogue.  */
+  old_cost = old_loop_vinfo->vec_inside_cost * old_factor;
+  new_cost = new_loop_vinfo->vec_inside_cost * new_factor;
+  old_cost += old_loop_vinfo->vec_outside_cost;
+  new_cost += new_loop_vinfo->vec_outside_cost;
+  return new_cost < old_cost;
+}
+
   /* Limit the VFs to what is likely to be the maximum number of iterations,
  to handle cases in which at least one loop_vinfo is fully-masked.  */
-  HOST_WIDE_INT estimated_max_niter;
-  loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
-  unsigned HOST_WIDE_INT main_vf;
-  if (main_loop
-  && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
-  && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
-estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
-  else
-estimated_max_niter = likely_max_stmt_executions_int (loop);
+  HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
   if (estimated_max_niter != -1)
 {
   if (known_le (estimated_max_niter, new_vf))


[PATCH 1/3][vect] Add main vectorized loop unrolling

2021-09-17 Thread Andre Vieira (lists) via Gcc-patches

Hi all,

This patch adds the ability to define a target hook to unroll the main 
vectorized loop. It also introduces --param's vect-unroll and 
vect-unroll-reductions to control this through the command line. I found 
this useful for experimenting and believe it can help when tuning, so I 
decided to leave it in.
We only unroll the main loop and have disabled unrolling epilogues for 
now. We also do not support unrolling of any loop that has a negative 
step and we do not support unrolling a loop with any reduction other 
than a TREE_CODE_REDUCTION.
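
For reference, the new knobs are used like any other --param; for example 
(values purely illustrative):

  gcc -O3 --param vect-unroll=4 --param vect-unroll-reductions=2 foo.c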


Bootstrapped and regression tested on aarch64-linux-gnu as part of the 
series.


gcc/ChangeLog:

    * doc/tm.texi: Document TARGET_VECTORIZE_UNROLL_FACTOR
    and TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL.
    * doc/tm.texi.in: Add entries for target hooks above.
    * params.opt: Add vect-unroll and vect-unroll-reductions parameters.
    * target.def: Define hooks TARGET_VECTORIZE_UNROLL_FACTOR
    and TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL.
    * targhooks.c (default_add_stmt_cost_for_unroll): New.
    (default_unroll_factor): Likewise.
    * targhooks.h (default_add_stmt_cost_for_unroll): Likewise.
    (default_unroll_factor): Likewise.
    * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
    par_unrolling_factor.
    (vect_update_vf_for_slp): Use unrolling factor to update
    vectorization factor.
    (vect_determine_partial_vectors_and_peeling): Account for unrolling.
    (vect_determine_unroll_factor): Determine how much to unroll
    vectorized main loop.
    (vect_analyze_loop_2): Call vect_determine_unroll_factor.
    (vect_analyze_loop): Allow for epilogue vectorization when unrolling
    and rewalk vector_mode array for the epilogues.
    (vectorizable_reduction): Disable single_defuse_cycle when unrolling.
    * tree-vectorizer.h (vect_unroll_value): Declare par_unrolling_factor
    as a member of loop_vec_info.
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 
f68f42638a112bed8396fd634bd3fd3c44ce848a..3bc9694d2162055d3db165ef888f35deb676548b
 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6283,6 +6283,19 @@ allocated by TARGET_VECTORIZE_INIT_COST.  The default 
releases the
 accumulator.
 @end deftypefn
 
+@deftypefn {Target Hook} void TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL (class 
vec_info *@var{vinfo}, class _stmt_vec_info *@var{stmt_info}, void *@var{data})
+This hook should update the target-specific @var{data} relative
+to the statement represented by @var{stmt_info} to be used
+later to determine the unrolling factor for this loop using the current
+vectorization factor.
+@end deftypefn
+
+@deftypefn {Target Hook} unsigned TARGET_VECTORIZE_UNROLL_FACTOR (class 
vec_info *@var{vinfo}, void *@var{data})
+This hook should return the desired vector unrolling factor for a loop with
+@var{vinfo} based on the target-specific @var{data}. The default returns one,
+which means no unrolling will be performed.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree 
@var{mem_vectype}, const_tree @var{index_type}, int @var{scale})
 Target builtin that implements vector gather operation.  @var{mem_vectype}
 is the vector type of the load and @var{index_type} is scalar type of
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 
fdf16b901c537e6a02f630a80a2213d2dcb6d5d6..40f4cb02c34f575439f35070301855ddaf82a21a
 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4195,6 +4195,10 @@ address;  but often a machine-dependent strategy can 
generate better code.
 
 @hook TARGET_VECTORIZE_DESTROY_COST_DATA
 
+@hook TARGET_VECTORIZE_ADD_STMT_COST_FOR_UNROLL
+
+@hook TARGET_VECTORIZE_UNROLL_FACTOR
+
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
 
 @hook TARGET_VECTORIZE_BUILTIN_SCATTER
diff --git a/gcc/params.opt b/gcc/params.opt
index 
f414dc1a61cfa9d5b9ded75e96560fc1f73041a5..00f92d4484797df0dbbad052f45205469cbb2c49
 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1117,4 +1117,12 @@ Controls how loop vectorizer uses partial vectors.  0 
means never, 1 means only
 Common Joined UInteger Var(param_vect_inner_loop_cost_factor) Init(50) 
IntegerRange(1, 1) Param Optimization
 The maximum factor which the loop vectorizer applies to the cost of statements 
in an inner loop relative to the loop being vectorized.
 
+-param=vect-unroll=
+Common Joined UInteger Var(param_vect_unroll) Init(0) IntegerRange(0, 32) 
Param Optimization
+Controls how many times the vectorizer tries to unroll loops.  Also see 
vect-unroll-reductions.
+
+-param=vect-unroll-reductions=
+Common Joined UInteger Var(param_vect_unroll_reductions) Init(0) 
IntegerRange(0, 32) Param Optimization
+Controls how many times the vectorizer tries to unroll loops that contain 
associative reductions.  0 means that such loops should be unrolled vect-unroll 
times.
+
 ; This comment is to ensure we retain the blank line

[PATCH 0/3][vect] Enable vector unrolling of main loop

2021-09-17 Thread Andre Vieira (lists) via Gcc-patches

Hi all,

This patch series enables unrolling of an unpredicated main vectorized 
loop based on a target hook. The epilogue loop will have (at least) half 
the VF of the main loop and can be predicated.
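
To make the shape concrete, here is a rough scalar model (illustrative 
only, not generated code) of an unrolled-by-2 main loop followed by an 
epilogue running at half of the main loop's effective width:

/* 'vf' stands in for the original vectorization factor; all names and
   factors here are illustrative.  */
void scale (int *x, int n, int vf)
{
  int i = 0;
  for (; i + 2 * vf <= n; i += 2 * vf)  /* unrolled main loop: 2*vf per iteration */
    for (int j = 0; j < 2 * vf; ++j)
      x[i + j] *= 3;
  for (; i + vf <= n; i += vf)          /* epilogue at half the main loop's width */
    for (int j = 0; j < vf; ++j)
      x[i + j] *= 3;
  for (; i < n; ++i)                    /* remaining tail (or predicated epilogue) */
    x[i] *= 3;
}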


Andre Vieira (3):
[vect] Add main vectorized loop unrolling
[vect] Consider outside costs earlier for epilogue loops
[AArch64] Implement vect_unroll backend hook



Re: [RFC] Using main loop's updated IV as base_address for epilogue vectorization

2021-06-16 Thread Andre Vieira (lists) via Gcc-patches



On 14/06/2021 11:57, Richard Biener wrote:

On Mon, 14 Jun 2021, Richard Biener wrote:


Indeed. For example a simple
int a[1024], b[1024], c[1024];

void foo(int n)
{
   for (int i = 0; i < n; ++i)
 a[i+1] += c[i+i] ? b[i+1] : 0;
}

should usually see peeling for alignment (though on x86 you need
exotic -march= since cost models generally have equal aligned and
unaligned access costs).  For example with -mavx2 -mtune=atom
we'll see an alignment peeling prologue, an AVX2 vector loop,
an SSE2 vectorized epilogue and a scalar epilogue.  It also
shows the original scalar loop being used in the scalar prologue
and epilogue.

We're not even trying to make the counting IV easily used
across loops (we're not counting scalar iterations in the
vector loops).

Specifically we see

 [local count: 94607391]:
niters_vector_mult_vf.10_62 = bnd.9_61 << 3;
_67 = niters_vector_mult_vf.10_62 + 7;
_64 = (int) niters_vector_mult_vf.10_62;
tmp.11_63 = i_43 + _64;
if (niters.8_45 == niters_vector_mult_vf.10_62)
   goto ; [12.50%]
else
   goto ; [87.50%]

after the main vect loop, recomputing the original IV (i) rather
than using the inserted canonical IV.  And then the vectorized
epilogue header check doing

 [local count: 93293400]:
# i_59 = PHI 
# _66 = PHI <_67(33), 0(18)>
_96 = (unsigned int) n_10(D);
niters.26_95 = _96 - _66;
_108 = (unsigned int) n_10(D);
_109 = _108 - _66;
_110 = _109 + 4294967295;
if (_110 <= 3)
   goto ; [10.00%]
else
   goto ; [90.00%]

re-computing everything from scratch again (also notice how
the main vect loop guard jumps around the alignment prologue
as well and lands here - and the vectorized epilogue using
unaligned accesses - good!).

That is, I'd expect _much_ easier jobs if we'd manage to
track the number of performed scalar iterations (or the
number of scalar iterations remaining) using the canonical
IV we add to all loops across all of the involved loops.

Richard.



So I am now looking at using an IV that counts scalar iterations rather 
than vector iterations and reusing that through all loops (prologue, 
main loop, vect_epilogue and scalar epilogue). The first is easy, since 
that's what we already do for partial vectors or non-constant VFs. The 
latter requires some plumbing and removing a lot of the code in there 
that creates new IV's going from [0, niters - previous iterations]. I 
don't yet have a clear-cut view of how to do this. I first thought of 
keeping track of the 'control' IV in the loop_vinfo, but the prologue 
and scalar epilogues won't have one. 'loop' keeps a control_ivs struct, 
but that is used for overflow detection and only keeps track of what 
looks like a constant 'base' and 'step'. Not quite sure how all that 
works, but intuitively doesn't seem like the right thing to reuse.
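
As a plain-C sketch of the counting scheme described above (names and 
factors purely illustrative):

/* One scalar-iteration counter i is carried through all loops, instead
   of recomputing "niters minus iterations already done" at each
   boundary; vf and epi_vf stand in for the two vectorization factors.  */
void axpy (float *x, float *y, int n, int vf, int epi_vf)
{
  int i = 0;
  for (; i + vf <= n; i += vf)          /* main vector loop */
    for (int j = 0; j < vf; ++j)
      y[i + j] += x[i + j];
  for (; i + epi_vf <= n; i += epi_vf)  /* vectorized epilogue */
    for (int j = 0; j < epi_vf; ++j)
      y[i + j] += x[i + j];
  for (; i < n; ++i)                    /* scalar epilogue reuses the same i */
    y[i] += x[i];
}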


I'll go hack around and keep you posted on progress.

Regards,
Andre



Re: [RFC] Using main loop's updated IV as base_address for epilogue vectorization

2021-06-14 Thread Andre Vieira (lists) via Gcc-patches

Hi,


On 20/05/2021 11:22, Richard Biener wrote:

On Mon, 17 May 2021, Andre Vieira (lists) wrote:


Hi,

So this is my second attempt at finding a way to improve how we generate the
vector IV's and teach the vectorizer to share them between main loop and
epilogues. On IRC we discussed my idea to use the loop's control_iv, but that
was a terrible idea and I quickly threw it in the bin. The main problem, that
for some reason I failed to see, was that the control_iv increases by 's' and
the datarefs by 's' * NELEMENTS where 's' is usually 1 and NELEMENTs the
amount of elements we handle per iteration. That means the epilogue loops
would have to start from the last loop's IV * the last loop's NELEMENT's and
that would just cause a mess.

Instead I started to think about creating IV's for the datarefs and what I
thought worked best was to create these in scalar before peeling. That way the
peeling mechanisms takes care of the duplication of these for the vector and
scalar epilogues and it also takes care of adding phi-nodes for the
skip_vector paths.

How does this work for if-converted loops where we use the
non-if-converted scalar loop for (scalar) peeling but the
if-converted scalar loop for vectorized epilogues?  I suppose
you're only adjusting the if-converted copy.

True hadn't thought about this :(



These new IV's have two functions:
1) 'vect_create_data_ref_ptr' can use them to:
  a) if it's the main loop: replace the values of the 'initial' value of the
main loop's IV and the initial values in the skip_vector phi-nodes
  b) Update the skip_vector phi-nodes argument for the non-skip path with
the updated vector ptr.

b) means the prologue IV will not be dead there so we actually need
to compute it?  I suppose IVOPTs could be taught to replace an
IV with its final value (based on some other IV) when it's unused?
Or does it already magically do good?

It does not and ...



2) They are used for the scalar epilogue ensuring they share the same
datareference ptr.

There are still a variety of 'hacky' elements here and a lot of testing to be
done, but I hope to be able to clean them away. One of the main issues I had
was that I had to skip a couple of checks and things for the added phi-nodes
and update statements as these do not have stmt_vec_info representation.
Though I'm not sure adding this representation at their creation was much
cleaner... It is something I could play around with but I thought this was a
good moment to ask you for input. For instance, maybe we could do this
transformation before analysis?

Also be aware that because I create an IV for each dataref this leads to
regressions with SVE codegen for instance. NEON is able to use the post-index
addressing mode to increase each dr IV at access time, but SVE can't do this.
For this I don't know if maybe we could try to be smart and create shared
IV's. So rather than make them based on the actual vector ptr, use a shared
sizetype IV that can be shared among dr IV's with the same step. Or maybe this
is something for IVOPTs?

Certainly IVOPTs could decide to use the newly created IVs in the
scalar loops for the DRs therein as well.  But since IVOPTs only
considers a single loop at a time it will probably not pay too
much attention and is only influenced by the out-of-loop uses of
the final values of the IVs.

My gut feeling tells me that whatever we do we'll have to look
into improving IVOPTs to consider multiple loops.


So I redid the IV-sharing and it's looking a lot simpler and neater, 
however it only shares IVs between vectorized loops and not scalar pro- 
or epilogues. I am not certain IVOPTs will be able to deal with these, 
as it has no knowledge of the number of iterations of each different 
loop. So take for instance a prologue peeling for alignment loop and a 
first main vectorization loop. To be able to reuse the IV's from the 
prologue in the main vectorization loop it would need to know that the 
initial start address + PEELING_NITERS == base address for main 
vectorization loop.
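
A toy scalar analogue of the relation in question (everything here is 
illustrative):

/* After an alignment prologue peeling peel_niters scalar iterations,
   reusing its pointer IV in the main loop amounts to knowing that on
   entry p == start + peel_niters.  */
void zero_bytes (char *start, char *end, unsigned long peel_niters)
{
  char *p = start;
  for (unsigned long k = 0; k < peel_niters && p < end; ++k)
    *p++ = 0;                   /* prologue peeled for alignment */
  /* here p == start + peel_niters, provided the prologue ran fully */
  for (; p < end; ++p)
    *p = 0;                     /* main loop reusing the same pointer IV */
}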


I'll start testing this approach for correctness if there are no 
major concerns. Though I suspect we will only want to turn this into a 
patch once we have the IVOPTs work done to a point where it at least 
doesn't regress codegen because of shared IVs and eventually we can look 
at how to solve the sharing between vectorized and scalar loops.


A small nitpick on my own RFC. I will probably move the 'skip_e' to 
outside of the map, as we only need one per loop_vinfo and not one per 
DR. Initially I didn't even have this skip_e in, but was using the 
creation of a dummy PHI node and then replacing it with the real thing 
later. Though this made the code simpler, especially when inserting the 
'init's stmt_list.


Kind regards,
Andre
diff --git a/gcc/tree

[RFC][ivopts] Generate better code for IVs with uses outside the loop

2021-06-10 Thread Andre Vieira (lists) via Gcc-patches



On 08/06/2021 16:00, Andre Simoes Dias Vieira via Gcc-patches wrote:

Hi Bin,

Thank you for the reply, I have some questions, see below.

On 07/06/2021 12:28, Bin.Cheng wrote:

On Fri, Jun 4, 2021 at 12:35 AM Andre Vieira (lists) via Gcc-patches
 wrote:

Hi Andre,
I didn't look into the details of the IV sharing RFC.  It seems to me
costing outside uses is trying to generate better code for later code
(epilogue loop here).  The only problem is IVOPTs doesn't know that
the outside use is not in the final form - which will be transformed
by IVOPTs again.

I think this example is not good at describing your problem because it
shows exactly that considering outside use results in better code,
compared to the other two approaches.
I don't quite understand what you are saying here :( What do you mean 
by final form? It seems to me that costing uses inside and outside 
the loop the same way is wrong because calculating the IV inside the loop 
has to be done every iteration, whereas if you can resolve it to a 
single update (without an IV) then you can sink it outside the loop. 
This is why I think this example shows why we need to cost these uses 
differently.
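
(For illustration, a toy case of the asymmetry I mean; the names are 
made up:)

/* The pointer IV must be updated on every iteration for the in-loop
   store, but its single use after the loop can be resolved to one
   computation and sunk out of the loop, so costing the two uses
   identically over-charges the outside one.  */
char *fill (char *base, char *end)
{
  char *p = base;
  for (; p < end; ++p)
    *p = 0;     /* use inside the loop: paid on every iteration */
  return p;     /* use outside the loop: equal to end whenever base <= end */
}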

2) Is there a cleaner way to generate the optimal 'post-increment' use
for the outside-use variable? I first thought the position in the
candidate might be something I could use or even the var_at_stmt
functionality, but the outside IV has the actual increment of the
variable as it's use, rather than the outside uses. This is this RFC's
main weakness I find.

To answer why IVOPTs behaves like this w/o your two patches. The main
problem is the point IVOPTs rewrites outside use IV - I don't remember
the exact point - but looks like at the end of loop while before
incrementing instruction of main IV.  It's a known issue that outside
use should be costed/re-written on the exit edge along which its value
flows out of loop.  I had a patch a long time ago but discarded it,
because it didn't bring obvious improvement and is complicated in case
of multi-exit edges.
Yeah I haven't looked at multi-exit edges and I understand that 
complicates things. But for now we could disable the special casing of 
outside uses when dealing with multi-exit loops and keep the current 
behavior.


But in general, I am less convinced that any of the two patches is the
right direction solving IV sharing issue between vectorized loop and
epilogue loop.  I would need to read the previous RFC before giving
further comments though.


The previous RFC still has a lot of unanswered questions too, but 
regardless of that, take the following (non-vectorizer) example:


#include <arm_sve.h>
#include <stdint.h>

void bar (char  * __restrict__ a, char * __restrict__ b, char * 
__restrict__ c, unsigned long long n)

{
    svbool_t all_true = svptrue_b8 ();
  unsigned long long i = 0;
    for (; i < (n & ~(svcntb() - 1)); i += svcntb()) {
  svuint8_t va = svld1 (all_true, (uint8_t*)a);
  svuint8_t vb = svld1 (all_true, (uint8_t*)b);
  svst1 (all_true, (uint8_t *)c, svadd_z (all_true, va,vb));
  a += svcntb();
  b += svcntb();
  c += svcntb();
  }
  svbool_t pred;
  for (; i < (n); i += svcntb()) {
  pred = svwhilelt_b8 (i, n);
  svuint8_t va = svld1 (pred, (uint8_t*)a);
  svuint8_t vb = svld1 (pred, (uint8_t*)b);
  svst1 (pred, (uint8_t *)c, svadd_z (pred, va,vb));
  a += svcntb();
  b += svcntb();
  c += svcntb();
  }


Current IVOPTs will use 4 iterators for the first loop, when it could 
do with just 1. In fact, if you use my patches it will create just a 
single IV and sink the uses and it is then able to merge them with 
loads & stores of the next loop.
I mixed things up here, I think an earlier version of my patch (with 
even more hacks) managed to rewrite these properly, but it looks like 
the current ones are messing things up.
I'll continue to try to understand how this works as I do still think 
IVOPTs should be able to do better.


You mentioned you had a patch you thought might help earlier, but you 
dropped it. Do you still have it lying around anywhere?


I am not saying setting outside costs to 0 is the right thing to do by 
the way. It is absolutely not! It will break cost considerations for 
other cases. Like I said above I've been playing around with using 
'!use->outside' as a multiplier for the cost. Unfortunately it won't 
help with the case above, because this seems to choose 'infinite_cost' 
because the candidate IV has a lower precision than the use IV. I 
don't quite understand yet how candidates are created, but something 
I'm going to try to look at. Just wanted to show this as an example of 
how IVOPTs would not improve code with multiple loops that don't 
involve the vectorizer.


BR,
Andre




Thanks,
bin


[RFC][ivopts] Generate better code for IVs with uses outside the loop (was Re: [RFC] Implementing detection of saturation and rounding arithmetic)

2021-06-03 Thread Andre Vieira (lists) via Gcc-patches

Streams got crossed there and used the wrong subject ...

On 03/06/2021 17:34, Andre Vieira (lists) via Gcc-patches wrote:

Hi,

This RFC is motivated by the IV sharing RFC in 
https://gcc.gnu.org/pipermail/gcc-patches/2021-May/569502.html and the 
need to have the IVOPTS pass be able to clean up IV's shared between 
multiple loops. When creating a similar problem with C code I noticed 
IVOPTs treated IV's with uses outside the loop differently, this 
didn't even required multiple loops, take for instance the following 
example using SVE intrinsics:


#include <arm_sve.h>
#include <stdint.h>
extern void use (char *);
void bar (char  * __restrict__ a, char * __restrict__ b, char * 
__restrict__ c, unsigned n)

{
    svbool_t all_true = svptrue_b8 ();
  unsigned i = 0;
  if (n < (UINT_MAX - svcntb() - 1))
    {
    for (; i < n; i += svcntb())
    {
    svuint8_t va = svld1 (all_true, (uint8_t*)a);
    svuint8_t vb = svld1 (all_true, (uint8_t*)b);
    svst1 (all_true, (uint8_t *)c, svadd_z (all_true, 
va,vb));

    a += svcntb();
    b += svcntb();
    c += svcntb();
    }
    }
  use (a);
}

IVOPTs tends to generate a shared IV for SVE memory accesses, as we 
don't have a post-increment for SVE load/stores. If we had not 
included 'use (a);' in this example, IVOPTs would have replaced the 
IV's for a, b and c with a single one, (also used for the 
loop-control). See:


   [local count: 955630225]:
  # ivtmp.7_8 = PHI 
  va_14 = MEM  [(unsigned char *)a_10(D) + ivtmp.7_8 * 1];
  vb_15 = MEM  [(unsigned char *)b_11(D) + ivtmp.7_8 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_14, vb_15);
  MEM <__SVUint8_t> [(unsigned char *)c_12(D) + ivtmp.7_8 * 1] = _2;
  ivtmp.7_25 = ivtmp.7_8 + POLY_INT_CST [16, 16];
  i_23 = (unsigned int) ivtmp.7_25;
  if (n_9(D) > i_23)
    goto ; [89.00%]
  else
    goto ; [11.00%]

 However, due to the 'use (a);' it will create two IVs one for 
loop-control, b and c and one for a. See:


  [local count: 955630225]:
  # a_28 = PHI 
  # ivtmp.7_25 = PHI 
  va_15 = MEM  [(unsigned char *)a_28];
  vb_16 = MEM  [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
  MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
  a_18 = a_28 + POLY_INT_CST [16, 16];
  ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
  i_8 = (unsigned int) ivtmp.7_24;
  if (n_10(D) > i_8)
    goto ; [89.00%]
  else
    goto ; [11.00%]

With the first patch attached in this RFC 'no_cost.patch', I tell 
IVOPTs to not cost uses outside of the loop. This makes IVOPTs 
generate a single IV, but unfortunately it decides to create the 
variable for the use inside the loop and it also seems to use the 
pre-increment value of the shared-IV and add the [16,16] to it. See:


   [local count: 955630225]:
  # ivtmp.7_25 = PHI 
  va_15 = MEM  [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
  vb_16 = MEM  [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
  MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
  _8 = (unsigned long) a_11(D);
  _7 = _8 + ivtmp.7_25;
  _6 = _7 + POLY_INT_CST [16, 16];
  a_18 = (char * restrict) _6;
  ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
  i_5 = (unsigned int) ivtmp.7_24;
  if (n_10(D) > i_5)
    goto ; [89.00%]
  else
    goto ; [11.00%]

With the patch 'var_after.patch' I make get_computation_aff_1 use 
'cand->var_after' for outside uses thus using the post-increment var 
of the candidate IV. This means I have to insert it in a different 
place and make sure to delete the old use->stmt. I'm sure there is a 
better way to do this using IVOPTs current framework, but I didn't 
find one yet. See the result:


  [local count: 955630225]:
  # ivtmp.7_25 = PHI 
  va_15 = MEM  [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
  vb_16 = MEM  [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
  MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
  ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
  _8 = (unsigned long) a_11(D);
  _7 = _8 + ivtmp.7_24;
  a_18 = (char * restrict) _7;
  i_6 = (unsigned int) ivtmp.7_24;
  if (n_10(D) > i_6)
    goto ; [89.00%]
  else
    goto ; [11.00%]


This is still not optimal as we are still doing the update inside the 
loop and there is absolutely no need for that. I found that running 
sink would solve it and it seems someone has added a second sink pass, 
so that saves me a third patch :) see after sink2:


   [local count: 955630225]:
  # ivtmp.7_25 = PHI 
  va_15 = MEM  [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
  vb_16 = MEM  [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
  MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
  ivtmp.7_24

[RFC] Implementing detection of saturation and rounding arithmetic

2021-06-03 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This RFC is motivated by the IV sharing RFC in 
https://gcc.gnu.org/pipermail/gcc-patches/2021-May/569502.html and the 
need to have the IVOPTS pass be able to clean up IV's shared between 
multiple loops. When creating a similar problem with C code I noticed 
IVOPTs treated IV's with uses outside the loop differently; this didn't 
even require multiple loops. Take for instance the following example 
using SVE intrinsics:


#include <arm_sve.h>
#include <stdint.h>
extern void use (char *);
void bar (char  * __restrict__ a, char * __restrict__ b, char * 
__restrict__ c, unsigned n)

{
    svbool_t all_true = svptrue_b8 ();
  unsigned i = 0;
  if (n < (UINT_MAX - svcntb() - 1))
    {
    for (; i < n; i += svcntb())
    {
    svuint8_t va = svld1 (all_true, (uint8_t*)a);
    svuint8_t vb = svld1 (all_true, (uint8_t*)b);
    svst1 (all_true, (uint8_t *)c, svadd_z (all_true, va,vb));
    a += svcntb();
    b += svcntb();
    c += svcntb();
    }
    }
  use (a);
}

IVOPTs tends to generate a shared IV for SVE memory accesses, as we 
don't have a post-increment for SVE load/stores. If we had not included 
'use (a);' in this example, IVOPTs would have replaced the IV's for a, b 
and c with a single one, (also used for the loop-control). See:


   [local count: 955630225]:
  # ivtmp.7_8 = PHI 
  va_14 = MEM  [(unsigned char *)a_10(D) + ivtmp.7_8 * 1];
  vb_15 = MEM  [(unsigned char *)b_11(D) + ivtmp.7_8 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_14, vb_15);
  MEM <__SVUint8_t> [(unsigned char *)c_12(D) + ivtmp.7_8 * 1] = _2;
  ivtmp.7_25 = ivtmp.7_8 + POLY_INT_CST [16, 16];
  i_23 = (unsigned int) ivtmp.7_25;
  if (n_9(D) > i_23)
    goto ; [89.00%]
  else
    goto ; [11.00%]

 However, due to the 'use (a);' it will create two IVs one for 
loop-control, b and c and one for a. See:


  [local count: 955630225]:
  # a_28 = PHI 
  # ivtmp.7_25 = PHI 
  va_15 = MEM  [(unsigned char *)a_28];
  vb_16 = MEM  [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
  MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
  a_18 = a_28 + POLY_INT_CST [16, 16];
  ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
  i_8 = (unsigned int) ivtmp.7_24;
  if (n_10(D) > i_8)
    goto ; [89.00%]
  else
    goto ; [11.00%]

With the first patch attached in this RFC 'no_cost.patch', I tell IVOPTs 
to not cost uses outside of the loop. This makes IVOPTs generate a 
single IV, but unfortunately it decides to create the variable for the 
use inside the loop and it also seems to use the pre-increment value of 
the shared-IV and add the [16,16] to it. See:


   [local count: 955630225]:
  # ivtmp.7_25 = PHI 
  va_15 = MEM  [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
  vb_16 = MEM  [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
  MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
  _8 = (unsigned long) a_11(D);
  _7 = _8 + ivtmp.7_25;
  _6 = _7 + POLY_INT_CST [16, 16];
  a_18 = (char * restrict) _6;
  ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
  i_5 = (unsigned int) ivtmp.7_24;
  if (n_10(D) > i_5)
    goto ; [89.00%]
  else
    goto ; [11.00%]

With the patch 'var_after.patch' I make get_computation_aff_1 use 
'cand->var_after' for outside uses thus using the post-increment var of 
the candidate IV. This means I have to insert it in a different place 
and make sure to delete the old use->stmt. I'm sure there is a better 
way to do this using IVOPTs current framework, but I didn't find one 
yet. See the result:


  [local count: 955630225]:
  # ivtmp.7_25 = PHI 
  va_15 = MEM  [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
  vb_16 = MEM  [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
  MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
  ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
  _8 = (unsigned long) a_11(D);
  _7 = _8 + ivtmp.7_24;
  a_18 = (char * restrict) _7;
  i_6 = (unsigned int) ivtmp.7_24;
  if (n_10(D) > i_6)
    goto ; [89.00%]
  else
    goto ; [11.00%]


This is still not optimal as we are still doing the update inside the 
loop and there is absolutely no need for that. I found that running sink 
would solve it and it seems someone has added a second sink pass, so 
that saves me a third patch :) see after sink2:


   [local count: 955630225]:
  # ivtmp.7_25 = PHI 
  va_15 = MEM  [(unsigned char *)a_11(D) + ivtmp.7_25 * 1];
  vb_16 = MEM  [(unsigned char *)b_12(D) + ivtmp.7_25 * 1];
  _2 = svadd_u8_z ({ -1, ... }, va_15, vb_16);
  MEM <__SVUint8_t> [(unsigned char *)c_13(D) + ivtmp.7_25 * 1] = _2;
  ivtmp.7_24 = ivtmp.7_25 + POLY_INT_CST [16, 16];
  i_6 = (unsigned int) ivtmp.7_24;
  if (i_6 < n_10(D))
    goto ; [89.00%]
  else
    goto ; [11.00%]

   [local count: 105119324]:
  _8 = (unsigned long) a_11(D);
  _7 = _8 + ivtmp.7_24;
  a_18 = (char * restrict) _7;
  goto ; [

Re: [PATCH][vect] Use main loop's thresholds and vectorization factor to narrow upper_bound of epilogue

2021-06-03 Thread Andre Vieira (lists) via Gcc-patches

Thank you Kewen!!

I will apply this now.

BR,
Andre

On 25/05/2021 09:42, Kewen.Lin wrote:

on 2021/5/24 下午3:21, Kewen.Lin via Gcc-patches wrote:

Hi Andre,

on 2021/5/24 下午2:17, Andre Vieira (lists) via Gcc-patches wrote:

Hi,

When vectorizing with --param vect-partial-vector-usage=1 the vectorizer uses 
an unpredicated (all-true predicate for SVE) main loop and a predicated tail 
loop. The way this was implemented seems to mean it re-uses the same 
vector-mode for both loops, which means the tail loop isn't an actual loop but 
only executes one iteration.

This patch uses the knowledge of the conditions to enter an epilogue loop to 
help come up with a potentially more restrictive upper bound.

Regression tested on aarch64-linux-gnu and also ran the testsuite using 
'--param vect-partial-vector-usage=1' detecting no ICEs and no execution 
failures.

Would be good to have this tested for PPC too as I believe they are the main 
users of the --param vect-partial-vector-usage=1 option. Can someone help me 
test (and maybe even benchmark?) this on a PPC target?



Thanks for doing this!  I can test it on Power10 which enables this parameter
by default, also evaluate its impact on SPEC2017 Ofast/unroll.


Bootstrapped/regtested on powerpc64le-linux-gnu Power10.
SPEC2017 run didn't show any remarkable improvement/degradation.

BR,
Kewen


[PATCH][vect] Use main loop's thresholds and vectorization factor to narrow upper_bound of epilogue

2021-05-23 Thread Andre Vieira (lists) via Gcc-patches

Hi,

When vectorizing with --param vect-partial-vector-usage=1 the vectorizer 
uses an unpredicated (all-true predicate for SVE) main loop and a 
predicated tail loop. The way this was implemented seems to mean it 
re-uses the same vector-mode for both loops, which means the tail loop 
isn't an actual loop but only executes one iteration.


This patch uses the knowledge of the conditions to enter an epilogue 
loop to help come up with a potentially more restrictive upper bound.
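
As a rough model of the bound computation in the patch below (numbers 
and names are purely illustrative):

#include <stdio.h>

/* Example: main loop VF of 16, no cost-model or versioning threshold,
   epilogue reusing the same 16-element mode with partial vectors, so
   the predicated tail collapses to a single iteration.  */
int main (void)
{
  unsigned main_vf = 16, cost_threshold = 0, versioning_threshold = 0;
  unsigned epilogue_vf = 16;
  unsigned main_iters = main_vf;
  if (cost_threshold > main_iters)
    main_iters = cost_threshold;
  if (versioning_threshold > main_iters)
    main_iters = versioning_threshold;
  unsigned bound = (main_iters + epilogue_vf - 1) / epilogue_vf; /* ceiling */
  printf ("epilogue upper bound (latch count): %u\n", bound - 1); /* prints 0 */
  return 0;
}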


Regression tested on aarch64-linux-gnu and also ran the testsuite using 
'--param vect-partial-vector-usage=1' detecting no ICEs and no execution 
failures.


Would be good to have this tested for PPC too as I believe they are the 
main users of the --param vect-partial-vector-usage=1 option. Can 
someone help me test (and maybe even benchmark?) this on a PPC target?


Kind regards,
Andre

gcc/ChangeLog:

    * tree-vect-loop.c (vect_transform_loop): Use main loop's various
    thresholds to narrow the upper bound on epilogue iterations.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/sve/part_vect_single_iter_epilog.c: New test.

diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/part_vect_single_iter_epilog.c 
b/gcc/testsuite/gcc.target/aarch64/sve/part_vect_single_iter_epilog.c
new file mode 100644
index 
..a03229eb55585f637ebd5288fb4c00f8f921d44c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/part_vect_single_iter_epilog.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+void
+foo (short * __restrict__ a, short * __restrict__ b, short * __restrict__ c, 
int n)
+{
+  for (int i = 0; i < n; ++i)
+c[i] = a[i] + b[i];
+}
+
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]+.h, wzr, [xw][0-9]+} 1 
} } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
3e973e774af8f9205be893e01ad9263281116885..81e9c5cc42415a0a92b765bc46640105670c4e6b
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -9723,12 +9723,31 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
*loop_vectorized_call)
   /* In these calculations the "- 1" converts loop iteration counts
  back to latch counts.  */
   if (loop->any_upper_bound)
-loop->nb_iterations_upper_bound
-  = (final_iter_may_be_partial
-? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
- lowest_vf) - 1
-: wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
-  lowest_vf) - 1);
+{
+  loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+  loop->nb_iterations_upper_bound
+   = (final_iter_may_be_partial
+  ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
+   lowest_vf) - 1
+  : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
+lowest_vf) - 1);
+  if (main_vinfo)
+   {
+ unsigned int bound;
+ poly_uint64 main_iters
+   = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
+  LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
+ main_iters
+   = upper_bound (main_iters,
+  LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
+ if (can_div_away_from_zero_p (main_iters,
+   LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+   &bound))
+   loop->nb_iterations_upper_bound
+ = wi::umin ((widest_int) (bound - 1),
+ loop->nb_iterations_upper_bound);
+  }
+  }
   if (loop->any_likely_upper_bound)
 loop->nb_iterations_likely_upper_bound
   = (final_iter_may_be_partial


Re: [PATCH][AArch64]: Use UNSPEC_LD1_SVE for all LD1 loads

2021-05-18 Thread Andre Vieira (lists) via Gcc-patches

Hi,

Using aarch64_pred_mov for these was tricky as it did both store and 
load. Furthermore, there was some concern it might allow a predicated 
mov to end up as a mem -> mem move, or a predicated load to be wrongfully 
reloaded into a full load to a register. So instead we decided to let the 
extending aarch64_load_* patterns accept both UNSPEC_LD1_SVE and 
UNSPEC_PRED_X.


Is this OK for trunk?

Kind regards,
Andre Vieira


gcc/ChangeLog:
2021-05-18  Andre Vieira  

    * config/aarch64/iterators.md (SVE_PRED_LOAD): New iterator.
    (pred_load): New int attribute.
    * config/aarch64/aarch64-sve.md 
(aarch64_load_): 
Use SVE_PRED_LOAD

    enum iterator and corresponding pred_load attribute.
    * config/aarch64/aarch64-sve-builtins-base.cc (expand): Update 
call to code_for_aarch64_load.


gcc/testsuite/ChangeLog:
2021-05-18  Andre Vieira  

    * gcc.target/aarch64/sve/logical_unpacked_and_2.c: Change
    scan-assembler-times to scan-assembler-not for superfluous uxtb.

    * gcc.target/aarch64/sve/logical_unpacked_and_3.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_and_4.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_and_6.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_and_7.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_2.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_3.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_4.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_6.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_7.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_2.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_3.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_4.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_6.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_7.c: Likewise.
    * gcc.target/aarch64/sve/ld1_extend.c: New test.
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 
dfdf0e2fd186389cbddcff51ef52f8778d7fdb24..8fd6d3fb3171f56b4ceacaf7ea812bc696117210
 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1123,7 +1123,7 @@ public:
   rtx
   expand (function_expander &e) const OVERRIDE
   {
-insn_code icode = code_for_aarch64_load (extend_rtx_code (),
+insn_code icode = code_for_aarch64_load (UNSPEC_LD1_SVE, extend_rtx_code 
(),
 e.vector_mode (0),
 e.memory_vector_mode ());
 return e.use_contiguous_load_insn (icode);
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
7db2938bb84e04d066a7b07574e5cf344a3a8fb6..a5663200d51b95684b4dc0caefd527a525aebd52
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1287,7 +1287,7 @@ (define_insn "vec_mask_load_lanes"
 ;; -
 
 ;; Predicated load and extend, with 8 elements per 128-bit block.
-(define_insn_and_rewrite 
"@aarch64_load_"
+(define_insn_and_rewrite 
"@aarch64_load_"
   [(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
(unspec:SVE_HSDI
  [(match_operand: 3 "general_operand" "UplDnm")
@@ -1295,7 +1295,7 @@ (define_insn_and_rewrite 
"@aarch64_load_ 2 "register_operand" "Upl")
(match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
-  UNSPEC_LD1_SVE))]
+  SVE_PRED_LOAD))]
  UNSPEC_PRED_X))]
   "TARGET_SVE && (~ & ) == 0"
   "ld1\t%0., %2/z, %1"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 
fb6e228651eae6a2db8c1ac755885ae7ad9225d6..8c17929cea4c83cc9f80b4cde950407ba4eb0416
 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2509,6 +2509,10 @@ (define_int_iterator SVE_SHIFT_WIDE [UNSPEC_ASHIFT_WIDE
 
 (define_int_iterator SVE_LDFF1_LDNF1 [UNSPEC_LDFF1 UNSPEC_LDNF1])
 
+(define_int_iterator SVE_PRED_LOAD [UNSPEC_PRED_X UNSPEC_LD1_SVE])
+
+(define_int_attr pred_load [(UNSPEC_PRED_X "_x") (UNSPEC_LD1_SVE "")])
+
 (define_int_iterator SVE2_U32_UNARY [UNSPEC_URECPE UNSPEC_RSQRTE])
 
 (define_int_iterator SVE2_INT_UNARY_NARROWB [UNSPEC_SQXTNB
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ld1_extend.c 
b/gcc/testsuite/gcc.target/aarch64/sve/ld1_extend.c
new file mode 100644
index 
..7f78cb4b3e4445c4da93b00ae78d6ef6fec1b2de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/ld1_extend.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --

Re: [RFC] Using main loop's updated IV as base_address for epilogue vectorization

2021-05-17 Thread Andre Vieira (lists) via Gcc-patches

Hi,

So this is my second attempt at finding a way to improve how we generate 
the vector IV's and teach the vectorizer to share them between main loop 
and epilogues. On IRC we discussed my idea to use the loop's control_iv, 
but that was a terrible idea and I quickly threw it in the bin. The main 
problem, that for some reason I failed to see, was that the control_iv 
increases by 's' and the datarefs by 's' * NELEMENTS where 's' is 
usually 1 and NELEMENTs the amount of elements we handle per iteration. 
That means the epilogue loops would have to start from the last loop's 
IV * the last loop's NELEMENT's and that would just cause a mess.


Instead I started to think about creating IV's for the datarefs and what 
I thought worked best was to create these in scalar before peeling. That 
way the peeling mechanisms takes care of the duplication of these for 
the vector and scalar epilogues and it also takes care of adding 
phi-nodes for the skip_vector paths.

These new IV's have two functions:
1) 'vect_create_data_ref_ptr' can use them to:
 a) if it's the main loop: replace the values of the 'initial' value of 
the main loop's IV and the initial values in the skip_vector phi-nodes
 b) Update the skip_vector phi-nodes argument for the non-skip path 
with the updated vector ptr.


2) They are used for the scalar epilogue ensuring they share the same 
datareference ptr.


There are still a variety of 'hacky' elements here and a lot of testing 
to be done, but I hope to be able to clean them away. One of the main 
issues I had was that I had to skip a couple of checks and things for 
the added phi-nodes and update statements as these do not have 
stmt_vec_info representation.  Though I'm not sure adding this 
representation at their creation was much cleaner... It is something I 
could play around with but I thought this was a good moment to ask you 
for input. For instance, maybe we could do this transformation before 
analysis?


Also be aware that because I create an IV for each dataref this leads to 
regressions with SVE codegen for instance. NEON is able to use the 
post-index addressing mode to increase each dr IV at access time, but 
SVE can't do this.  For this I don't know if maybe we could try to be 
smart and create shared IV's. So rather than make them based on the 
actual vector ptr, use a shared sizetype IV that can be shared among dr 
IV's with the same step. Or maybe this is something for IVOPTs?


Let me know what ya think!

Kind regards,
Andre
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index 
8001cc54f518d9d9d1a0fcfe5790d22dae109fb2..939c0a7fefd4355dd75d7646ac2ae63ce23a0e14
 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -174,6 +174,8 @@ struct data_reference
 
   /* Alias information for the data reference.  */
   struct dr_alias alias;
+
+  hash_map *iv_bases;
 };
 
 #define DR_STMT(DR)(DR)->stmt
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index 
124a7bea6a94161556a6622fa7b113b3cef98bcf..f638bb3e0aa007e0bf7ad8f75fb767d3484b02ce
 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -1475,6 +1475,7 @@ void
 free_data_ref (data_reference_p dr)
 {
   DR_ACCESS_FNS (dr).release ();
+  delete dr->iv_bases;
   free (dr);
 }
 
@@ -1506,6 +1507,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, 
gimple *stmt,
   DR_REF (dr) = memref;
   DR_IS_READ (dr) = is_read;
   DR_IS_CONDITIONAL_IN_STMT (dr) = is_conditional_in_stmt;
+  dr->iv_bases = new hash_map ();
 
   dr_analyze_innermost (&DR_INNERMOST (dr), memref,
nest != NULL ? loop : NULL, stmt);
diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
index 
86fc118b6befb06233e5e86a01454fd7075075e1..93e14d09763da5034ba97d09b07c94c20fe25a28
 100644
--- a/gcc/tree-ssa-loop-manip.h
+++ b/gcc/tree-ssa-loop-manip.h
@@ -24,6 +24,8 @@ typedef void (*transform_callback)(class loop *, void *);
 
 extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
   bool, tree *, tree *);
+extern void create_or_update_iv (tree, tree, tree, class loop *, 
gimple_stmt_iterator *,
+ bool, tree *, tree *, gphi *, bool);
 extern void rewrite_into_loop_closed_ssa_1 (bitmap, unsigned, int,
class loop *);
 extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
index 
28ae1316fa0eb6939a45d15e893b7386622ba60c..1709e175c382ef5d74c2f628a61c9fffe26f726d
 100644
--- a/gcc/tree-ssa-loop-manip.c
+++ b/gcc/tree-ssa-loop-manip.c
@@ -57,9 +57,10 @@ static bitmap_obstack loop_renamer_obstack;
VAR_AFTER (unless they are NULL).  */
 
 void
-create_iv (tree base, tree step, tree var, class loop *loop,
-  gimple_stmt_iterator *incr_pos, bool after,
-  tree *var_before, tree *var_after)
+create_or_update_iv (tree base, tree step, tree var, class loop *loop,
+

[PATCH][AArch64]: Use UNSPEC_LD1_SVE for all LD1 loads

2021-05-14 Thread Andre Vieira (lists) via Gcc-patches

Hi,

I noticed we were missing out on LD1 + UXT combinations in some cases 
and found it was because of inconsistent use of the unspec enum 
UNSPEC_LD1_SVE. The combine pattern for LD1[S][BHWD] uses UNSPEC_LD1_SVE 
whereas one of the LD1 expanders was using UNSPEC_PRED_X. I wasn't sure 
whether to change the UNSPEC_LD1_SVE into UNSPEC_PRED_X as the enum 
doesn't seem to be used for anything in particular, though I decided 
against it for now as it is easier to rename UNSPEC_LD1_SVE to 
UNSPEC_PRED_X if there is no use for it than it is to rename only 
specific instances of UNSPEC_PRED_X.


If there is a firm belief the UNSPEC_LD1_SVE will not be used for 
anything I am also happy to refactor it out.


Bootstrapped and regression tested aarch64-none-linux-gnu.

Is this OK for trunk?

Kind regards,
Andre Vieira

gcc/ChangeLog:
2021-05-14  Andre Vieira  

    * config/aarch64/aarch64-sve.md: Use UNSPEC_LD1_SVE instead of 
UNSPEC_PRED_X.


gcc/testsuite/ChangeLog:
2021-05-14  Andre Vieira  

    * gcc.target/aarch64/sve/logical_unpacked_and_2.c: Remove 
superfluous uxtb.

    * gcc.target/aarch64/sve/logical_unpacked_and_3.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_and_4.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_and_6.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_and_7.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_2.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_3.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_4.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_6.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_eor_7.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_2.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_4.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_6.c: Likewise.
    * gcc.target/aarch64/sve/logical_unpacked_orr_7.c: Likewise.
    * gcc.target/aarch64/sve/ld1_extend.c: New test.

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
7db2938bb84e04d066a7b07574e5cf344a3a8fb6..5fd74fcf3e0a984b5b40b8128ad9354fb899ce5f
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -747,7 +747,7 @@ (define_insn_and_split "@aarch64_pred_mov"
(unspec:SVE_ALL
  [(match_operand: 1 "register_operand" "Upl, Upl, Upl")
   (match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")]
- UNSPEC_PRED_X))]
+ UNSPEC_LD1_SVE))]
   "TARGET_SVE
&& (register_operand (operands[0], mode)
|| register_operand (operands[2], mode))"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ld1_extend.c 
b/gcc/testsuite/gcc.target/aarch64/sve/ld1_extend.c
new file mode 100644
index 
..7f78cb4b3e4445c4da93b00ae78d6ef6fec1b2de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/ld1_extend.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */
+
+void foo (signed char * __restrict__ a, signed char * __restrict__ b, short * 
__restrict__ c, int n)
+{
+for (int i = 0; i < n; ++i)
+  c[i] = a[i] + b[i];
+}
+
+/* { dg-final { scan-assembler-times {\tld1sb\t} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/logical_unpacked_and_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/logical_unpacked_and_2.c
index 
08b274512e1c6ce8f5845084a664b2fa0456dafe..cb6029e90ffc815e75092624f611c4631cbd9fd6
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/logical_unpacked_and_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/logical_unpacked_and_2.c
@@ -11,7 +11,6 @@ f (uint64_t *restrict dst, uint16_t *restrict src1, uint8_t 
*restrict src2)
 
 /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d,} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d,} 2 } } */
-/* { dg-final { scan-assembler-times {\tuxtb\tz[0-9]+\.h,} 1 } } */
 /* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d,} 2 } } */
 /* { dg-final { scan-assembler-times {\tuxth\tz[0-9]+\.d,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/logical_unpacked_and_3.c 
b/gcc/testsuite/gcc.target/aarch64/sve/logical_unpacked_and_3.c
index 
c823470ca925ee66929475f74fa8d94bc4735594..02fc5460e5ce89c8a3fef611aac561145ddd0f39
 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/logical_unpacked_and_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/logical_unpacked_and_3.c
@@ -11,7 +11,6 @@ f (uint64_t *restrict dst, uint32_t *restrict src1, uint8_t 
*restrict src2)
 
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d,} 2 } } */
-/* { dg-final { scan-assembler-times {\tuxtb\tz[0-9]+\.s,} 1 } } */
 /* { dg

Re: [RFC] Using main loop's updated IV as base_address for epilogue vectorization

2021-05-05 Thread Andre Vieira (lists) via Gcc-patches



On 05/05/2021 13:34, Richard Biener wrote:

On Wed, 5 May 2021, Andre Vieira (lists) wrote:


I tried to see what IVOPTs would make of this and it is able to analyze the
IVs but it doesn't realize (not even sure it tries) that one IV's end (loop 1)
could be used as the base for the other (loop 2). I don't know if this is
where you'd want such optimizations to be made, on one side I think it would
be great as it would also help with non-vectorized loops as you alluded to.

Hmm, OK.  So there's the first loop that has a looparound jump and thus
we do not always enter the 2nd loop with the first loop final value of the
IV.  But yes, IVOPTs does not try to allocate IVs across multiple loops.
And for a followup transform to catch this it would need to compute
the final value of the IV and then match this up with the initial
value computation.  I suppose FRE could be taught to do this, at
least for very simple cases.
I will admit I am not at all familiar with how FRE works, I know it 
exists as the occlusion of running it often breaks my vector patches :P 
But that's about all I know.
I will have a look and see if it makes sense from my perspective to 
address it there, because ...



Anyway I diverge. Back to the main question of this patch. How do you suggest
I go about this? Is there a way to make IVOPTS aware of the 'iterate-once' IVs
in the epilogue(s) (both vector and scalar!) and then teach it to merge IV's
if one ends where the other begins?

I don't think we will make that work easily.  So indeed attacking this
in the vectorizer sounds most promising.


The problem I found with my approach is that it only tackles the 
vectorized epilogues, and that leads to regressions. I don't have the 
example at hand, but what I saw happening was that increased register 
pressure led to a spill in the hot path. I believe 
this was caused by the epilogue loop using the update pointers as the 
base for their DR's, in this case there were three DR's (2 loads one 
store), but the scalar epilogue still using the original base + niters, 
since this data_reference approach only changes the vectorized epilogues.




  I'll note there's also
the issue of epilogue vectorization and reductions where we seem
to not re-use partially reduced reduction vectors but instead
reduce to a scalar in each step.  That's a related issue - we're
not able to carry forward a (reduction) IV we generated for the
main vector loop to the epilogue loops.  Like for

double foo (double *a, int n)
{
   double sum = 0.;
   for (int i = 0; i < n; ++i)
 sum += a[i];
   return sum;
}

with AVX512 we get three reductions to scalars instead of
a partial reduction from zmm to ymm before the first vectorized
epilogue followed by a reduction from ymm to xmm before the second
(the jump around for the epilogues need to jump to the further
reduction piece obviously).

So I think we want to record IVs we generate (the reduction IVs
are already nicely associated with the stmt-infos), one might
consider to refer to them from the dr_vec_info for example.

It's just going to be "interesting" to wire everything up
correctly with all the jump-arounds we have ...
I have a downstream hack for the reductions, but it only worked for 
partial-vector-usage as there you have the guarantee it's the same 
vector-mode, so you don't need to pfaff around with half and full 
vectors. Obviously what you are suggesting has much wider applications 
and not surprisingly I think Richard Sandiford also pointed out to me 
that these are somewhat related and we might be able to reuse the 
IV-creation to manage it all. But I feel like I am currently light years 
away from that.


I had started to look at removing the data_reference updating we have 
now and dealing with this in the 'create_iv' calls from 
'vect_create_data_ref_ptr' inside 'vectorizable_{load,store}' but then I 
thought it would be good to discuss it with you first. This will require 
keeping track of the 'end-value' of the IV, which for loops where we can 
skip the previous loop means we will need to construct a phi-node 
containing the updated pointer and the initial base. But I'm not 
entirely sure where to keep track of all this. Also I don't know if I 
can replace the base address of the data_reference right there at the 
'create_iv' call; can a data_reference be used multiple times in the 
same loop?
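
(As a toy scalar model of the value I need to track, with illustrative 
names:)

/* The epilogue needs either the pointer the main vectorized loop
   finished on, or the original base when the main loop was skipped; in
   GIMPLE this would be the phi-node merging the skip edge with the
   main-loop exit edge.  */
static inline char *
epilogue_base (int skipped_main_loop, char *initial_base, char *updated_ptr)
{
  return skipped_main_loop ? initial_base : updated_ptr;
}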


I'll go do a bit more nosing around this idea and the ivmap you 
mentioned before. Let me know if you have any ideas on how this all 
should look like, even if its a 'in an ideal world'.


Andre



On 04/05/2021 10:56, Richard Biener wrote:

On Fri, 30 Apr 2021, Andre Vieira (lists) wrote:


Hi,

The aim of this RFC is to explore a way of cleaning up the codegen around
data_references.  To be specific, I

Re: [RFC] Using main loop's updated IV as base_address for epilogue vectorization

2021-05-05 Thread Andre Vieira (lists) via Gcc-patches

Hi Richi,

So I'm trying to look at what IVOPTs does right now and how it might be 
able to help us. Looking at these two code examples:

#include <stddef.h>
#if 0
int foo(short * a, short * b, unsigned int n)
{
    int sum = 0;
    for (unsigned int i = 0; i < n; ++i)
    sum += a[i] + b[i];

    return sum;
}


#else

int bar (short * a, short *b, unsigned int n)
{
    int sum = 0;
    unsigned int i = 0;
    for (; i < (n / 16); i += 1)
    {
    // Iterates [0, 16, .., (n/16 * 16) * 16]
    // Example n = 127,
    // iterates [0, 16, 32, 48, 64, 80, 96, 112]
    sum += a[i*16] + b[i*16];
    }
    for (size_t j =  (size_t) ((n / 16) * 16); j < n; ++j)
    {
    // Iterates [(n/16 * 16) * 16 , (((n/16 * 16) + 1) * 16)... ,n*16]
    // Example n = 127,
    // j starts at (127/16) * 16 = 7 * 16 = 112,
    // So iterates over [112, 113, 114, 115, ..., 127]
    sum += a[j] + b[j];
    }
    return sum;
}
#endif

Compiled the bottom one (#if 0) with 'aarch64-linux-gnu' with the 
following options '-O3 -march=armv8-a -fno-tree-vectorize 
-fdump-tree-ivopts-all -fno-unroll-loops'. See godbolt link here: 
https://godbolt.org/z/MEf6j6ebM


I tried to see what IVOPTs would make of this and it is able to analyze 
the IVs but it doesn't realize (not even sure it tries) that one IV's 
end (loop 1) could be used as the base for the other (loop 2). I don't 
know if this is where you'd want such optimizations to be made, on one 
side I think it would be great as it would also help with non-vectorized 
loops as you alluded to.


However, if you compile the top test case (#if 1) and let the 
tree-vectorizer have a go, you will see different behaviours for 
different vectorization approaches. With '-O3 -march=armv8-a', using 
NEON and epilogue vectorization, it seems IVOPTs only picks up on one 
loop. If you use '-O3 -march=armv8-a+sve --param vect-partial-vector-usage=1' 
it will detect two loops. This may well be because epilogue 
vectorization in fact 'un-loops' it, since it knows it will only have to 
do one iteration of the vectorized epilogue. vect-partial-vector-usage=1 
could have done the same, but because we are dealing with polymorphic 
vector modes it fails to. I have a hack that works for 
vect-partial-vector-usage to avoid it, but I think we can probably do 
better and try to reason about boundaries in poly_int's rather than 
integers (TBC).


Anyway I diverge. Back to the main question of this patch. How do you 
suggest I go about this? Is there a way to make IVOPTS aware of the 
'iterate-once' IVs in the epilogue(s) (both vector and scalar!) and then 
teach it to merge IV's if one ends where the other begins?


On 04/05/2021 10:56, Richard Biener wrote:

On Fri, 30 Apr 2021, Andre Vieira (lists) wrote:


Hi,

The aim of this RFC is to explore a way of cleaning up the codegen around
data_references.  To be specific, I'd like to reuse the main-loop's updated
data_reference as the base_address for the epilogue's corresponding
data_reference, rather than use the niters.  We have found this leads to
better codegen in the vectorized epilogue loops.

The approach in this RFC creates a map if iv_updates which always contain an
updated pointer that is caputed in vectorizable_{load,store}, an iv_update may
also contain a skip_edge in case we decide the vectorization can be skipped in
'vect_do_peeling'. During the epilogue update this map of iv_updates is then
checked to see if it contains an entry for a data_reference and it is used
accordingly and if not it reverts back to the old behavior of using the niters
to advance the data_reference.

The motivation for this work is to improve codegen for the option `--param
vect-partial-vector-usage=1` for SVE. We found that one of the main problems
for the codegen here was coming from unnecessary conversions caused by the way
we update the data_references in the epilogue.

This patch passes regression tests in aarch64-linux-gnu, but the codegen is
still not optimal in some cases. Specifically those where we have a scalar
epilogue, as this does not use the data_reference's and will rely on the
gimple scalar code, thus constructing again a memory access using the niters.
This is a limitation for which I haven't quite worked out a solution yet and
does cause some minor regressions due to unfortunate spills.

Let me know what you think and if you have ideas of how we can better achieve
this.

Hmm, so the patch adds a kludge to improve the kludge we have in place ;)

I think it might be interesting to create a C testcase mimicing the
update problem without involving the vectorizer.  That way we can
see how the various components involved behave (FRE + ivopts most
specifically).

That said, a cleaner approach to dealing with this would be to
explicitely track the IVs we generate for vectorized DRs, eventually

Re: [PATCH 9/9] arm: Auto-vectorization for MVE: vld4/vst4

2021-05-04 Thread Andre Vieira (lists) via Gcc-patches

Hi Christophe,

The series LGTM but you'll need the approval of an arm port maintainer 
before committing. I only did code-review, did not try to build/run tests.


Kind regards,
Andre

On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:

This patch enables MVE vld4/vst4 instructions for auto-vectorization.
We move the existing expanders from neon.md and enable them for MVE,
calling the respective emitter.

2021-03-12  Christophe Lyon  

gcc/
* config/arm/neon.md (vec_load_lanesxi)
(vec_store_lanesxi): Move ...
* config/arm/vec-common.md: here.

gcc/testsuite/
* gcc.target/arm/simd/mve-vld4.c: New test, derived from
slp-perm-3.c
---
  gcc/config/arm/neon.md   |  20 
  gcc/config/arm/vec-common.md |  26 +
  gcc/testsuite/gcc.target/arm/simd/mve-vld4.c | 140 +++
  3 files changed, 166 insertions(+), 20 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vld4.c

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index bc8775c..fb58baf 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5617,16 +5617,6 @@ (define_insn "neon_vld4"
  (const_string "neon_load4_4reg")))]
  )
  
-(define_expand "vec_load_lanesxi"

-  [(match_operand:XI 0 "s_register_operand")
-   (match_operand:XI 1 "neon_struct_operand")
-   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
-  "TARGET_NEON"
-{
-  emit_insn (gen_neon_vld4 (operands[0], operands[1]));
-  DONE;
-})
-
  (define_expand "neon_vld4"
[(match_operand:XI 0 "s_register_operand")
 (match_operand:XI 1 "neon_struct_operand")
@@ -5818,16 +5808,6 @@ (define_insn "neon_vst4"
  (const_string "neon_store4_4reg")))]
  )
  
-(define_expand "vec_store_lanesxi"

-  [(match_operand:XI 0 "neon_struct_operand")
-   (match_operand:XI 1 "s_register_operand")
-   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
-  "TARGET_NEON"
-{
-  emit_insn (gen_neon_vst4 (operands[0], operands[1]));
-  DONE;
-})
-
  (define_expand "neon_vst4"
[(match_operand:XI 0 "neon_struct_operand")
 (match_operand:XI 1 "s_register_operand")
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 7abefea..d46b78d 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -512,3 +512,29 @@ (define_expand "vec_store_lanesoi"
  emit_insn (gen_mve_vst2q (operands[0], operands[1]));
DONE;
  })
+
+(define_expand "vec_load_lanesxi"
+  [(match_operand:XI 0 "s_register_operand")
+   (match_operand:XI 1 "neon_struct_operand")
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_NEON || TARGET_HAVE_MVE"
+{
+  if (TARGET_NEON)
+emit_insn (gen_neon_vld4 (operands[0], operands[1]));
+  else
+emit_insn (gen_mve_vld4q (operands[0], operands[1]));
+  DONE;
+})
+
+(define_expand "vec_store_lanesxi"
+  [(match_operand:XI 0 "neon_struct_operand")
+   (match_operand:XI 1 "s_register_operand")
+   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_NEON || TARGET_HAVE_MVE"
+{
+  if (TARGET_NEON)
+emit_insn (gen_neon_vst4 (operands[0], operands[1]));
+  else
+emit_insn (gen_mve_vst4q (operands[0], operands[1]));
+  DONE;
+})
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c 
b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
new file mode 100644
index 000..ce3e755
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vld4.c
@@ -0,0 +1,140 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3" } */
+
+#include 
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M30 237
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M31 2280
+#define M02 74
+#define M12 191
+#define M22 500
+#define M32 111
+#define M03 134
+#define M13 117
+#define M23 11
+#define M33 771
+
+#define N 128
+
+/* Integer tests.  */
+#define FUNC(SIGN, TYPE, BITS) \
+  void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,   \
+   TYPE##BITS##_t *__restrict__ pOutput)   \
+  {\
+unsigned int i;\
+TYPE##BITS##_t  a, b, c, d;
\
+   \
+for (i = 0; i < N / BITS; i++)  \
+  {
\
+   a = *pInput++;  \
+   b = *pInput++;  \
+   c = *pInput++;  \
+   d = *pInput++;  \
+   

Re: [PATCH 7/9] arm: Auto-vectorization for MVE: add __fp16 support to VCMP

2021-05-04 Thread Andre Vieira (lists) via Gcc-patches
It would be good to also add tests for NEON as you also enable auto-vec 
for it. I checked and I do think the necessary 'neon_vc' patterns exist 
for 'VH', so we should be OK there.
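
Something along these lines (an assumed shape, not one of the committed 
tests) would exercise the __fp16 compare path for both MVE and Neon:

  void
  f16_cmp (__fp16 *restrict dst, __fp16 *restrict a,
           __fp16 *restrict b, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = (a[i] == b[i]) ? 1.0 : 2.0; /* selects on an fp16 comparison  */
  }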


On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:

This patch adds __fp16 support to the previous patch that added vcmp
support with MVE. For this we update existing expanders to use VDQWH
iterator, and add a new expander vcond.  In the
process we need to create suitable iterators, and update v_cmp_result
as needed.

2021-04-26  Christophe Lyon  

gcc/
* config/arm/iterators.md (V16): New iterator.
(VH_cvtto): New iterator.
(v_cmp_result): Added V4HF and V8HF support.
* config/arm/vec-common.md (vec_cmp): Use VDQWH.
(vcond): Likewise.
(vcond_mask_): Likewise.
(vcond): New expander.

gcc/testsuite/
* gcc.target/arm/simd/mve-compare-3.c: New test with GCC vectors.
* gcc.target/arm/simd/mve-vcmp-f16.c: New test for
auto-vectorization.
---
  gcc/config/arm/iterators.md   |  6 
  gcc/config/arm/vec-common.md  | 40 ---
  gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c | 38 +
  gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c  | 30 +
  4 files changed, 102 insertions(+), 12 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-3.c
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f16.c

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a128465..3042baf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -231,6 +231,9 @@ (define_mode_iterator VU [V16QI V8HI V4SI])
  ;; Vector modes for 16-bit floating-point support.
  (define_mode_iterator VH [V8HF V4HF])
  
+;; Modes with 16-bit elements only.

+(define_mode_iterator V16 [V4HI V4HF V8HI V8HF])
+
  ;; 16-bit floating-point vector modes suitable for moving (includes BFmode).
  (define_mode_iterator VHFBF [V8HF V4HF V4BF V8BF])
  
@@ -571,6 +574,8 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si")

  ;; (Opposite) mode to convert to/from for vector-half mode conversions.
  (define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI")
(V8HI "V8HF") (V8HF "V8HI")])
+(define_mode_attr VH_cvtto [(V4HI "v4hf") (V4HF "v4hi")
+   (V8HI "v8hf") (V8HF "v8hi")])
  
  ;; Define element mode for each vector mode.

  (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
@@ -720,6 +725,7 @@ (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI 
"V16QI")
  (define_mode_attr v_cmp_result [(V8QI "v8qi") (V16QI "v16qi")
(V4HI "v4hi") (V8HI  "v8hi")
(V2SI "v2si") (V4SI  "v4si")
+   (V4HF "v4hi") (V8HF  "v8hi")
(DI   "di")   (V2DI  "v2di")
(V2SF "v2si") (V4SF  "v4si")])
  
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md

index 034b48b..3fd341c 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -366,8 +366,8 @@ (define_expand "vlshr3"
  (define_expand "vec_cmp"
[(set (match_operand: 0 "s_register_operand")
(match_operator: 1 "comparison_operator"
- [(match_operand:VDQW 2 "s_register_operand")
-  (match_operand:VDQW 3 "reg_or_zero_operand")]))]
+ [(match_operand:VDQWH 2 "s_register_operand")
+  (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
"ARM_HAVE__ARITH
 && !TARGET_REALLY_IWMMXT
 && (! || flag_unsafe_math_optimizations)"
@@ -399,13 +399,13 @@ (define_expand "vec_cmpu"
  ;; element-wise.
  
  (define_expand "vcond"

-  [(set (match_operand:VDQW 0 "s_register_operand")
-   (if_then_else:VDQW
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+   (if_then_else:VDQWH
  (match_operator 3 "comparison_operator"
-   [(match_operand:VDQW 4 "s_register_operand")
-(match_operand:VDQW 5 "reg_or_zero_operand")])
- (match_operand:VDQW 1 "s_register_operand")
- (match_operand:VDQW 2 "s_register_operand")))]
+   [(match_operand:VDQWH 4 "s_register_operand")
+(match_operand:VDQWH 5 "reg_or_zero_operand")])
+ (match_operand:VDQWH 1 "s_register_operand")
+ (match_operand:VDQWH 2 "s_register_operand")))]
"ARM_HAVE__ARITH
 && !TARGET_REALLY_IWMMXT
 && (! || flag_unsafe_math_optimizations)"
@@ -430,6 +430,22 @@ (define_expand "vcond"
DONE;
  })
  
+(define_expand "vcond"

+  [(set (match_operand: 0 "s_register_operand")
+   (if_then_else:
+ (match_operator 3 "comparison_operator"
+   [(match_operand:V16 4 "s_register_operand")
+(match_operand:V16 5 "reg_or_zero_operand")])
+ (match_operand: 1 "s_register_operand")
+ (match_operand: 2 "s_register_operand")))]
+  "ARM

Re: [PATCH 6/9] arm: Auto-vectorization for MVE: vcmp

2021-05-04 Thread Andre Vieira (lists) via Gcc-patches

Hi Christophe,

On 30/04/2021 15:09, Christophe Lyon via Gcc-patches wrote:

Since MVE has a different set of vector comparison operators from
Neon, we have to update the expansion to take into account the new
ones, for instance 'NE', for which MVE does not require using 'EQ'
with the inverted condition.

Conversely, Neon supports comparisons with #0; MVE does not.

For:
typedef long int vs32 __attribute__((vector_size(16)));
vs32 cmp_eq_vs32_reg (vs32 a, vs32 b) { return a == b; }

we now generate:
cmp_eq_vs32_reg:
vldr.64 d4, .L123   @ 8 [c=8 l=4]  *mve_movv4si/8
vldr.64 d5, .L123+8
vldr.64 d6, .L123+16@ 9 [c=8 l=4]  *mve_movv4si/8
vldr.64 d7, .L123+24
vcmp.i32  eq, q0, q1@ 7 [c=16 l=4]  mve_vcmpeqq_v4si
vpsel q0, q3, q2@ 15[c=8 l=4]  mve_vpselq_sv4si
bx  lr  @ 26[c=8 l=4]  *thumb2_return
.L124:
.align  3
.L123:
.word   0
.word   0
.word   0
.word   0
.word   1
.word   1
.word   1
.word   1

For some reason emit_move_insn (zero, CONST0_RTX (cmp_mode)) produces
a pair of vldr instead of a vmov.i32 qX, #0.

I think ideally we would even want:
vpte  eq, q0, q1
vmovt.i32 q0, #0
vmove.i32 q0, #1

But we don't have a way to generate VPT blocks with multiple 
instructions yet, unfortunately, so I guess VPSEL will have to do for now.




2021-03-01  Christophe Lyon  

gcc/
* config/arm/arm-protos.h (arm_expand_vector_compare): Update
prototype.
* config/arm/arm.c (arm_expand_vector_compare): Add support for
MVE.
(arm_expand_vcond): Likewise.
* config/arm/iterators.md (supf): Remove VCMPNEQ_S, VCMPEQQ_S,
VCMPEQQ_N_S, VCMPNEQ_N_S.
(VCMPNEQ, VCMPEQQ, VCMPEQQ_N, VCMPNEQ_N): Remove.
* config/arm/mve.md (@mve_vcmpq_): Add '@' prefix.
(@mve_vcmpq_f): Likewise.
(@mve_vcmpq_n_f): Likewise.
(@mve_vpselq_): Likewise.
(@mve_vpselq_f"): Likewise.
* config/arm/neon.md (vec_cmp): Likewise.
(vcond): Likewise.
(vcond): Likewise.
(vcondu): Likewise.
(vcond_mask_): Likewise.
* config/arm/unspecs.md (VCMPNEQ_U, VCMPNEQ_S, VCMPEQQ_S)
(VCMPEQQ_N_S, VCMPNEQ_N_S, VCMPEQQ_U, CMPEQQ_N_U, VCMPNEQ_N_U)
(VCMPGEQ_N_S, VCMPGEQ_S, VCMPGTQ_N_S, VCMPGTQ_S, VCMPLEQ_N_S)
(VCMPLEQ_S, VCMPLTQ_N_S, VCMPLTQ_S, VCMPCSQ_N_U, VCMPCSQ_U)
(VCMPHIQ_N_U, VCMPHIQ_U): Remove.
* config/arm/vec-common.md (vec_cmp): Likewise.
(vcond): Likewise.
(vcond): Likewise.
(vcondu): Likewise.
(vcond_mask_): Likewise.

gcc/testsuite
* gcc.target/arm/simd/mve-compare-1.c: New test with GCC vectors.
* gcc.target/arm/simd/mve-compare-2.c: New test with GCC vectors.
* gcc.target/arm/simd/mve-compare-scalar-1.c: New test with GCC
vectors.
* gcc.target/arm/simd/mve-vcmp-f32.c: New test for
auto-vectorization.
* gcc.target/arm/simd/mve-vcmp.c: New test for auto-vectorization.

add gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
---
  gcc/config/arm/arm-protos.h|   2 +-
  gcc/config/arm/arm.c   | 211 -
  gcc/config/arm/iterators.md|   9 +-
  gcc/config/arm/mve.md  |  10 +-
  gcc/config/arm/neon.md |  87 -
  gcc/config/arm/unspecs.md  |  20 --
  gcc/config/arm/vec-common.md   | 107 +++
  gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c  |  80 
  gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c  |  38 
  .../gcc.target/arm/simd/mve-compare-scalar-1.c |  69 +++
  gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c   |  30 +++
  gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c   |  50 +
  12 files changed, 547 insertions(+), 166 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-1.c
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-2.c
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-compare-scalar-1.c
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp-f32.c
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vcmp.c

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2521541..ffccaa7 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -373,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, 
rtx, rtx, rtx, rtx,
  extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
  extern bool arm_valid_symbolic_address_p (rtx);
  extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
-extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool);
+extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool);
  #endif /* RTX_C

[RFC] Using main loop's updated IV as base_address for epilogue vectorization

2021-04-30 Thread Andre Vieira (lists) via Gcc-patches

Hi,

The aim of this RFC is to explore a way of cleaning up the codegen 
around data_references.  To be specific, I'd like to reuse the 
main-loop's updated data_reference as the base_address for the 
epilogue's corresponding data_reference, rather than use the niters.  We 
have found this leads to better codegen in the vectorized epilogue loops.


The approach in this RFC creates a map of iv_updates, each of which always 
contains an updated pointer that is captured in vectorizable_{load,store}; 
an iv_update may also contain a skip_edge in case we decide the 
vectorization can be skipped in 'vect_do_peeling'. During the epilogue 
update this map of iv_updates is then checked for an entry for a 
data_reference; if one is found it is used, and if not we revert to the 
old behaviour of using the niters to advance the data_reference.


The motivation for this work is to improve codegen for the option 
`--param vect-partial-vector-usage=1` for SVE. We found that one of the 
main problems for the codegen here was coming from unnecessary 
conversions caused by the way we update the data_references in the epilogue.


This patch passes regression tests on aarch64-linux-gnu, but the codegen 
is still not optimal in some cases, specifically those where we have a 
scalar epilogue, as that does not use the data_references and will rely 
on the gimple scalar code, thus constructing the memory access again using 
the niters.  This is a limitation for which I haven't quite worked out a 
solution yet, and it does cause some minor regressions due to unfortunate 
spills.


Let me know what you think and if you have ideas of how we can better 
achieve this.


Kind regards,
Andre Vieira

diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 
c1d6e02194b251f7c940784c291d58c754f07454..ebb71948abe4ca27d495a2707254beb27e385a0d
 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1928,6 +1928,15 @@ vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
   return iters_name;
 }
 
+static bool
+maybe_not_zero (tree t)
+{
+  if (!t)
+return false;
+  if (TREE_CODE (t) != INTEGER_CST)
+return true;
+  return !tree_int_cst_equal (t, build_zero_cst (TREE_TYPE (t)));
+}
 
 /* Function vect_update_init_of_dr
 
@@ -1954,6 +1963,76 @@ vect_update_init_of_dr (dr_vec_info *dr_info, tree 
niters, tree_code code)
   dr_info->offset = offset;
 }
 
+static void
+vect_update_base_of_dr (struct data_reference * dr,
+   loop_vec_info epilogue_vinfo, iv_update *iv_update)
+{
+  tree new_base_addr = iv_update->new_base_addr;
+  edge skip_e = iv_update->skip_edge;
+  if (skip_e)
+{
+  /* If we have SKIP_E we need to use the phi-node that joins the IV coming
+from the main loop and the initial IV.  */
+  gimple_seq stmts;
+  tree base_addr = DR_BASE_ADDRESS (dr);
+  tree type = TREE_TYPE (base_addr);
+  gphi *new_phi;
+
+  edge e = EDGE_PRED (skip_e->dest, 0);
+  e = e != skip_e ? e : EDGE_PRED (skip_e->dest, 1);
+
+  base_addr = force_gimple_operand (base_addr, &stmts, true,
+   NULL_TREE);
+  gimple_stmt_iterator gsi = gsi_last_bb (skip_e->src);
+  if (is_gimple_assign (gsi_stmt (gsi))
+ || is_gimple_call (gsi_stmt (gsi)))
+   gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
+  else
+   gsi_insert_seq_before (&gsi, stmts, GSI_NEW_STMT);
+
+  /* Make sure NEW_BASE_ADDR and the initial base address use the same
+type.  Not sure why I chose to use DR_BASE_ADDR's type here, probably
+makes more sense to use the NEW_BASE_ADDR's type.  */
+  stmts = NULL;
+  new_base_addr = fold_convert (type, new_base_addr);
+  new_base_addr = force_gimple_operand (new_base_addr, &stmts, true, 
NULL_TREE);
+  gsi = gsi_last_bb (e->src);
+  if (is_gimple_assign (gsi_stmt (gsi))
+ || is_gimple_call (gsi_stmt (gsi)))
+   gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
+  else
+   gsi_insert_seq_before (&gsi, stmts, GSI_NEW_STMT);
+
+  new_phi = create_phi_node (make_ssa_name (type), skip_e->dest);
+  add_phi_arg (new_phi, new_base_addr, e, UNKNOWN_LOCATION);
+  add_phi_arg (new_phi, base_addr, skip_e, UNKNOWN_LOCATION);
+
+  new_base_addr = gimple_phi_result (new_phi);
+}
+  else
+{
+  gimple_seq stmts;
+  class loop *loop = LOOP_VINFO_LOOP (epilogue_vinfo);
+  tree type = TREE_TYPE (DR_BASE_ADDRESS (dr));
+  new_base_addr = fold_convert (type, new_base_addr);
+  new_base_addr = force_gimple_operand (new_base_addr, &stmts, true,
+   NULL_TREE);
+  gimple_stmt_iterator gsi
+   = gsi_last_bb (loop_preheader_edge (loop)->src);
+  if (!gsi_stmt (gsi)
+ || is_gimple_assign (gsi_stmt (gsi))
+ || is

Re: [PATCH][PR98791]: IRA: Make sure allocno copy mode's are ordered

2021-03-10 Thread Andre Vieira (lists) via Gcc-patches



On 19/02/2021 15:05, Vladimir Makarov wrote:


On 2021-02-19 5:53 a.m., Andre Vieira (lists) wrote:

Hi,

This patch makes sure that allocno copies are not created for 
unordered modes. The testcases in the PR highlighted a case where an 
allocno copy was being created for:

(insn 121 120 123 11 (parallel [
    (set (reg:VNx2QI 217)
    (vec_duplicate:VNx2QI (subreg/s/v:QI (reg:SI 93 [ _2 
]) 0)))

    (clobber (scratch:VNx16BI))
    ]) 4750 {*vec_duplicatevnx2qi_reg}
 (expr_list:REG_DEAD (reg:SI 93 [ _2 ])
    (nil)))

As the compiler detected that the vec_duplicate_reg pattern 
allowed the input and output operand to be of the same register 
class, it tried to create an allocno copy for these two operands, 
stripping subregs in the process. However, this meant that the copy 
was between VNx2QI and SI, which have unordered mode precisions.


So at compile time we do not know which of the two modes is smaller, 
and knowing that is a requirement when updating allocno copy costs.


Regression tested on aarch64-linux-gnu.

Is this OK for trunk (and after a week backport to gcc-10) ?

OK.  Yes, it is wise to wait a bit and see how the patch behaves on 
the trunk before submitting it to gcc-10 branch.  Sometimes such 
changes can have quite unexpected consequences.  But I guess not in 
this case.



Is it OK to backport now? The committed patch applies cleanly and I 
regression tested it on gcc-10 branch for aarch64-linux-gnu.


Kind regards,

Andre



Re: [PATCH][PR98791]: IRA: Make sure allocno copy mode's are ordered

2021-02-22 Thread Andre Vieira (lists) via Gcc-patches

Hi Alex,

On 22/02/2021 10:20, Alex Coplan wrote:

For the testcase, you might want to use the one I posted most recently:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98791#c3
which avoids the dependency on the aarch64-autovec-preference param
(which is in GCC 11 only) as this will simplify backporting.

But if it's preferable to have a testcase without SVE intrinsics for GCC
11 then we should stick with that.
I don't see any problem with having SVE intrinsics in the testcase; 
I committed it with your other test, as I agree it makes the backport 
easier eventually.


Thanks for pointing that out.
diff --git a/gcc/ira-conflicts.c b/gcc/ira-conflicts.c
index 
2c2234734c3166872d94d94c5960045cb89ff2a8..d83cfc1c1a708ba04f5e01a395721540e31173f0
 100644
--- a/gcc/ira-conflicts.c
+++ b/gcc/ira-conflicts.c
@@ -275,7 +275,10 @@ process_regs_for_copy (rtx reg1, rtx reg2, bool 
constraint_p,
   ira_allocno_t a1 = ira_curr_regno_allocno_map[REGNO (reg1)];
   ira_allocno_t a2 = ira_curr_regno_allocno_map[REGNO (reg2)];
 
-  if (!allocnos_conflict_for_copy_p (a1, a2) && offset1 == offset2)
+  if (!allocnos_conflict_for_copy_p (a1, a2)
+ && offset1 == offset2
+ && ordered_p (GET_MODE_PRECISION (ALLOCNO_MODE (a1)),
+   GET_MODE_PRECISION (ALLOCNO_MODE (a2
{
  cp = ira_add_allocno_copy (a1, a2, freq, constraint_p, insn,
 ira_curr_loop_tree_node);
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr98791.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr98791.c
new file mode 100644
index 
..cc1f1831afb68ba70016cbe26f8f9273cfceafa8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr98791.c
@@ -0,0 +1,12 @@
+/* PR rtl-optimization/98791  */
+/* { dg-do compile } */
+/* { dg-options "-O -ftree-vectorize" } */
+#include 
+extern char a[11];
+extern long b[];
+void f() {
+  for (int d; d < 10; d++) {
+a[d] = svaddv(svptrue_b8(), svdup_u8(0));
+b[d] = 0;
+  }
+}


[PATCH][PR98791]: IRA: Make sure allocno copy mode's are ordered

2021-02-19 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch makes sure that allocno copies are not created for unordered 
modes. The testcases in the PR highlighted a case where an allocno copy 
was being created for:

(insn 121 120 123 11 (parallel [
    (set (reg:VNx2QI 217)
    (vec_duplicate:VNx2QI (subreg/s/v:QI (reg:SI 93 [ _2 ]) 
0)))

    (clobber (scratch:VNx16BI))
    ]) 4750 {*vec_duplicatevnx2qi_reg}
 (expr_list:REG_DEAD (reg:SI 93 [ _2 ])
    (nil)))

As the compiler detected that the vec_duplicate_reg pattern 
allowed the input and output operand to be of the same register class, 
it tried to create an allocno copy for these two operands, stripping 
subregs in the process. However, this meant that the copy was between 
VNx2QI and SI, which have unordered mode precisions.


So at compile time we do not know which of the two modes is smaller, 
and knowing that is a requirement when updating allocno copy costs.


Regression tested on aarch64-linux-gnu.

Is this OK for trunk (and after a week backport to gcc-10) ?

Regards,
Andre


gcc/ChangeLog:
2021-02-19  Andre Vieira  

    PR rtl-optimization/98791
    * ira-conflicts.c (process_regs_for_copy): Don't create allocno 
copies for unordered modes.


gcc/testsuite/ChangeLog:
2021-02-19  Andre Vieira  

    PR rtl-optimization/98791
    * gcc.target/aarch64/sve/pr98791.c: New test.

diff --git a/gcc/ira-conflicts.c b/gcc/ira-conflicts.c
index 
2c2234734c3166872d94d94c5960045cb89ff2a8..d83cfc1c1a708ba04f5e01a395721540e31173f0
 100644
--- a/gcc/ira-conflicts.c
+++ b/gcc/ira-conflicts.c
@@ -275,7 +275,10 @@ process_regs_for_copy (rtx reg1, rtx reg2, bool 
constraint_p,
   ira_allocno_t a1 = ira_curr_regno_allocno_map[REGNO (reg1)];
   ira_allocno_t a2 = ira_curr_regno_allocno_map[REGNO (reg2)];
 
-  if (!allocnos_conflict_for_copy_p (a1, a2) && offset1 == offset2)
+  if (!allocnos_conflict_for_copy_p (a1, a2)
+ && offset1 == offset2
+ && ordered_p (GET_MODE_PRECISION (ALLOCNO_MODE (a1)),
+   GET_MODE_PRECISION (ALLOCNO_MODE (a2
{
  cp = ira_add_allocno_copy (a1, a2, freq, constraint_p, insn,
 ira_curr_loop_tree_node);
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr98791.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr98791.c
new file mode 100644
index 
..ee0c7b51602cacd45f9e33acecb1eaa9f9edebf2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr98791.c
@@ -0,0 +1,12 @@
+/* PR rtl-optimization/98791  */
+/* { dg-do compile } */
+/* { dg-options "-O -ftree-vectorize --param=aarch64-autovec-preference=3" } */
+extern char a[], b[];
+short c, d;
+long *e;
+void f() {
+  for (int g; g < c; g += 1) {
+a[g] = d;
+b[g] = e[g];
+  }
+}


[AArch64] PR98657: Fix vec_duplicate creation in SVE's 3

2021-02-17 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch prevents generating a vec_duplicate with an illegal predicate.

Regression tested on aarch64-linux-gnu.

OK for trunk?

gcc/ChangeLog:
2021-02-17  Andre Vieira  

    PR target/98657
    * config/aarch64/aarch64-sve.md: Use 'expand_vector_broadcast' 
to emit vec_duplicate's

    in '3' pattern.

gcc/testsuite/ChangeLog:
2021-02-17  Andre Vieira  

    PR target/98657
    * gcc.target/aarch64/sve/pr98657.c: New test.
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
608319600318974b414e47285ee1474b041f0e05..7db2938bb84e04d066a7b07574e5cf344a3a8fb6
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -4549,10 +4549,8 @@ (define_expand "3"
   }
 else
   {
-   amount = gen_reg_rtx (mode);
-   emit_insn (gen_vec_duplicate (amount,
-   convert_to_mode (mode,
-operands[2], 0)));
+   amount = convert_to_mode (mode, operands[2], 0);
+   amount = expand_vector_broadcast (mode, amount);
   }
 emit_insn (gen_v3 (operands[0], operands[1], amount));
 DONE;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr98657.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr98657.c
new file mode 100644
index 
..592af25d7bbc69bc05823d27358f07cd741dbe20
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr98657.c
@@ -0,0 +1,9 @@
+/* PR target/98657  */
+/* { dg-do compile } */
+/* { dg-options "-O3 -msve-vector-bits=256" } */
+extern char a[];
+void b(_Bool c[][18]) {
+  int d;
+  for (int e = 0; e < 23; e++)
+a[e] = 6 >> c[1][d];
+}


Re: PR98974: Fix vectorizable_condition after STMT_VINFO_VEC_STMTS

2021-02-05 Thread Andre Vieira (lists) via Gcc-patches



On 05/02/2021 12:47, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:

Hi,

As mentioned in the PR, this patch fixes up the nvectors parameter passed to 
vect_get_loop_mask in vectorizable_condition.
Before the STMT_VINFO_VEC_STMTS rework we used to handle each ncopy separately; 
now we gather them all at the same time and don't need to multiply vec_num by 
ncopies.

The reduced testcase I used to illustrate the issue in the PR gives a warning; 
if someone knows how to get rid of that (it's Fortran), I'd include it as a 
testcase for this.

Looks like Richi's since posted one.

Included it.

diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 
0bc1cb1c5b4f6c1f0447241b4d31434bf7cca1a4..d07602f6d38f9c51936ac09942599fc5a14f46ab
 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -10237,8 +10237,7 @@ vectorizable_condition (vec_info *vinfo,
{
  unsigned vec_num = vec_oprnds0.length ();
  tree loop_mask
-   = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
- vectype, i);
+   = vect_get_loop_mask (gsi, masks, vec_num, vectype, i);
  tree tmp2 = make_ssa_name (vec_cmp_type);
  gassign *g
= gimple_build_assign (tmp2, BIT_AND_EXPR, vec_compare,

Does removing the shadowed vec_num work?  I think that would be less
confusing, and means that the calculation stays in sync with the

Yeah that works too.

Here's a reworked patch.


gcc/ChangeLog:
2021-02-05  Andre Vieira  

    PR middle-end/98974
    * tree-vect-stmts.c (vectorizable_condition): Fix nvectors 
parameter

    for vect_get_loop_mask call.

gcc/testsuite/ChangeLog:
2021-02-05  Andre Vieira  

    * gfortran.dg/pr98974.F90: New test.
diff --git a/gcc/testsuite/gfortran.dg/pr98974.F90 
b/gcc/testsuite/gfortran.dg/pr98974.F90
new file mode 100644
index 
..b3db6a6654a0b36bc567405c70429a5dbe168d1e
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr98974.F90
@@ -0,0 +1,20 @@
+! PR middle-end/98974
+! { dg-do compile { target { aarch64*-*-* } } }
+! { dg-options "-Ofast -mcpu=neoverse-v1" }
+
+module module_foobar
+  integer,parameter :: fp_kind = selected_real_kind(15)
+contains
+ subroutine foobar( foo, ix ,jx ,kx,iy,ky)
+   real, dimension( ix, kx, jx )  :: foo
+   real(fp_kind), dimension( iy, ky, 3 ) :: bar, baz
+   do k=1,ky
+  do i=1,iy
+if ( baz(i,k,1) > 0. ) then
+  bar(i,k,1) = 0
+endif
+foo(i,nk,j) = baz0 *  bar(i,k,1)
+  enddo
+   enddo
+ end
+end
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 
0bc1cb1c5b4f6c1f0447241b4d31434bf7cca1a4..064e5d138ce9a151287662a0caefd9925b0a2920
 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -10235,7 +10235,6 @@ vectorizable_condition (vec_info *vinfo,
 
  if (masks)
{
- unsigned vec_num = vec_oprnds0.length ();
  tree loop_mask
= vect_get_loop_mask (gsi, masks, vec_num * ncopies,
  vectype, i);


PR98974: Fix vectorizable_condition after STMT_VINFO_VEC_STMTS

2021-02-05 Thread Andre Vieira (lists) via Gcc-patches

Hi,

As mentioned in the PR, this patch fixes up the nvectors parameter passed to 
vect_get_loop_mask in vectorizable_condition.
Before the STMT_VINFO_VEC_STMTS rework we used to handle each ncopy separately; 
now we gather them all at the same time and don't need to multiply vec_num by 
ncopies.

The reduced testcase I used to illustrate the issue in the PR gives a warning; 
if someone knows how to get rid of that (it's Fortran), I'd include it as a 
testcase for this.

Bootstrapped and regression tested on aarch64-none-linux-gnu. I don't believe 
that code triggers for other targets, so I'm not sure it makes sense to test on 
others?

Is this OK for trunk? Would you rather wait for the testcase?

gcc/ChangeLog:
2021-02-05  Andre Vieira  

PR middle-end/98974
* tree-vect-stmts.c (vectorizable_condition): Fix nvectors parameter
for vect_get_loop_mask call.

diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 
0bc1cb1c5b4f6c1f0447241b4d31434bf7cca1a4..d07602f6d38f9c51936ac09942599fc5a14f46ab
 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -10237,8 +10237,7 @@ vectorizable_condition (vec_info *vinfo,
{
  unsigned vec_num = vec_oprnds0.length ();
  tree loop_mask
-   = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
- vectype, i);
+   = vect_get_loop_mask (gsi, masks, vec_num, vectype, i);
  tree tmp2 = make_ssa_name (vec_cmp_type);
  gassign *g
= gimple_build_assign (tmp2, BIT_AND_EXPR, vec_compare,


[AArch64] Fix vector multiplication costs

2021-02-03 Thread Andre Vieira (lists) via Gcc-patches
This patch introduces a vect.mul RTX cost and decouples the vector 
multiplication costing from the scalar one.


After Wilco's "AArch64: Add cost table for Cortex-A76" patch we saw a 
regression in vector codegen, reproducible with the small test added in 
this patch.
Upon further investigation we noticed 'aarch64_rtx_mult_cost' was using 
scalar costs to calculate the cost of vector multiplication, which was 
now lower and prevented 'choose_mult_variant' from making the right 
choice to expand such vector multiplications by constants as shifts and 
subs. I also added a special case for SSRA to use the default vector 
cost rather than mult; SSRA seems to be costed using 
'aarch64_rtx_mult_cost', which, to be fair, is quite curious. I believe we 
should have a better look at 'aarch64_rtx_costs' altogether and 
completely decouple vector and scalar costs, though that is something 
that requires more rewriting than I believe should be done in Stage 4.
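
To illustrate the kind of code affected (a sketch of the idea only, not the 
new asimd-mul-to-shl-sub.c test itself), a vector multiply by a constant 
such as 7 should again be expanded as a shift and subtract once the vector 
multiply cost is modelled separately:

  void
  mul_by_7 (int *restrict out, int *restrict in, int n)
  {
    for (int i = 0; i < n; i++)
      out[i] = in[i] * 7; /* expect (x << 3) - x in the vector loop  */
  }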


I gave all targets a vect.mult cost of 4x the vect.alu cost, with the 
exception of targets with a vect.alu cost of 0; those I gave a cost of 4.


Bootstrapped on aarch64.

Is this OK for trunk?

gcc/ChangeLog:

    * config/aarch64/aarch64-cost-tables.h: Add entries for vect.mul.
    * config/aarch64/aarch64.c (aarch64_rtx_mult_cost): Use 
vect.mul for

    vector multiplies and vect.alu for SSRA.
    * config/arm/aarch-common-protos.h (struct vector_cost_table): 
Define

    vect.mul cost field.
    * config/arm/aarch-cost-tables.h: Add entries for vect.mul.
    * config/arm/arm.c: Likewise.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/asimd-mul-to-shl-sub.c: New test.

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h 
b/gcc/config/aarch64/aarch64-cost-tables.h
index 
c309f88cbd56f0d2347996d860c982a3a6744492..dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29
 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -123,7 +123,8 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   },
   /* Vector */
   {
-COSTS_N_INSNS (1)  /* alu.  */
+COSTS_N_INSNS (1),  /* alu.  */
+COSTS_N_INSNS (4)   /* mult.  */
   }
 };
 
@@ -227,7 +228,8 @@ const struct cpu_cost_table thunderx_extra_costs =
   },
   /* Vector */
   {
-COSTS_N_INSNS (1)  /* Alu.  */
+COSTS_N_INSNS (1), /* Alu.  */
+COSTS_N_INSNS (4)  /* mult.  */
   }
 };
 
@@ -330,7 +332,8 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   },
   /* Vector */
   {
-COSTS_N_INSNS (1)  /* Alu.  */
+COSTS_N_INSNS (1), /* Alu.  */
+COSTS_N_INSNS (4)  /* Mult.  */
   }
 };
 
@@ -433,7 +436,8 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   },
   /* Vector */
   {
-COSTS_N_INSNS (1)  /* Alu.  */
+COSTS_N_INSNS (1), /* Alu.  */
+COSTS_N_INSNS (4)  /* Mult.  */
   }
 };
 
@@ -537,7 +541,8 @@ const struct cpu_cost_table tsv110_extra_costs =
   },
   /* Vector */
   {
-COSTS_N_INSNS (1)  /* alu.  */
+COSTS_N_INSNS (1),  /* alu.  */
+COSTS_N_INSNS (4)   /* mult.  */
   }
 };
 
@@ -640,7 +645,8 @@ const struct cpu_cost_table a64fx_extra_costs =
   },
   /* Vector */
   {
-COSTS_N_INSNS (1)  /* alu.  */
+COSTS_N_INSNS (1),  /* alu.  */
+COSTS_N_INSNS (4)   /* mult.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
b6192e55521004ae70cd13acbdb4dab142216845..146ed8c1b693d7204a754bc4e6d17025e0af544b
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11568,7 +11568,6 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int 
outer, bool speed)
   if (VECTOR_MODE_P (mode))
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
-  mode = GET_MODE_INNER (mode);
   if (vec_flags & VEC_ADVSIMD)
{
  /* The by-element versions of the instruction have the same costs as
@@ -11582,6 +11581,17 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int 
outer, bool speed)
  else if (GET_CODE (op1) == VEC_DUPLICATE)
op1 = XEXP (op1, 0);
}
+  cost += rtx_cost (op0, mode, MULT, 0, speed);
+  cost += rtx_cost (op1, mode, MULT, 1, speed);
+  if (speed)
+   {
+ if (GET_CODE (x) == MULT)
+   cost += extra_cost->vect.mult;
+ /* This is to catch the SSRA costing currently flowing here.  */
+ else
+   cost += extra_cost->vect.alu;
+   }
+  return cost;
 }
 
   /* Integer multiply/fma.  */
diff --git a/gcc/config/arm/aarch-common-protos.h 
b/gcc/config/arm/aarch-common-protos.h
index 
251de3d61a833a2bb4b77e9211cac7fbc17c0b75..7a9cf3d324c103de74af741abe9ef30b76fea5ce
 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -132,6 +132,7 @@ struct fp_cost_table
 struct vector_cost_table
 {
   const int alu;
+  const int mult;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h 
b/gcc/config/arm/aarch-cost-table

Re: [PATCH] arm: Fix up neon_vector_mem_operand [PR97528]

2021-02-03 Thread Andre Vieira (lists) via Gcc-patches
The same patch applies cleanly on gcc-8; I bootstrapped it on 
arm-none-linux-gnueabihf and the regression run was also clean.


Can I also commit it to gcc-8?

Thanks,
Andre

On 02/02/2021 17:36, Kyrylo Tkachov wrote:



-Original Message-
From: Andre Vieira (lists) 
Sent: 02 February 2021 17:27
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; ja...@redhat.com
Subject: Re: [PATCH] arm: Fix up neon_vector_mem_operand [PR97528]

Hi,

This is a gcc-9 backport of the PR97528 fix that has been applied to
trunk and gcc-10.
Bootstrapped on arm-linux-gnueabihf and regression tested.

OK for gcc-9 branch?

Ok.
Thanks,
Kyrill


2021-02-02  Andre Vieira  

      Backport from mainline
      2020-11-20  Jakub Jelinek  

      PR target/97528
      * config/arm/arm.c (neon_vector_mem_operand): For POST_MODIFY,
require
      first POST_MODIFY operand is a REG and is equal to the first operand
      of PLUS.

      * gcc.target/arm/pr97528.c: New test.

On 20/11/2020 11:25, Kyrylo Tkachov via Gcc-patches wrote:

-Original Message-
From: Jakub Jelinek 
Sent: 19 November 2020 18:57
To: Richard Earnshaw ; Ramana
Radhakrishnan ; Kyrylo Tkachov

Cc: gcc-patches@gcc.gnu.org
Subject: [PATCH] arm: Fix up neon_vector_mem_operand [PR97528]

Hi!

The documentation for POST_MODIFY says:
 Currently, the compiler can only handle second operands of the
 form (plus (reg) (reg)) and (plus (reg) (const_int)), where
 the first operand of the PLUS has to be the same register as
 the first operand of the *_MODIFY.
The following testcase ICEs, because combine just attempts to simplify
things and ends up with
(post_modify (reg1) (plus (mult (reg2) (const_int 4)) (reg1))
but the target predicates accept it, because they only verify
that POST_MODIFY's second operand is PLUS and the second operand
of the PLUS is a REG.

The following patch fixes this by performing further verification that
the POST_MODIFY is in the form it should be.

Bootstrapped/regtested on armv7hl-linux-gnueabi, ok for trunk
and release branches after a while?

Ok.
Thanks,
Kyrill


2020-11-19  Jakub Jelinek  

PR target/97528
* config/arm/arm.c (neon_vector_mem_operand): For
POST_MODIFY, require
first POST_MODIFY operand is a REG and is equal to the first operand
of PLUS.

* gcc.target/arm/pr97528.c: New test.

--- gcc/config/arm/arm.c.jj 2020-11-13 19:00:46.729620560 +0100
+++ gcc/config/arm/arm.c2020-11-18 17:05:44.656867343 +0100
@@ -13429,7 +13429,9 @@ neon_vector_mem_operand (rtx op, int typ
 /* Allow post-increment by register for VLDn */
 if (type == 2 && GET_CODE (ind) == POST_MODIFY
 && GET_CODE (XEXP (ind, 1)) == PLUS
-  && REG_P (XEXP (XEXP (ind, 1), 1)))
+  && REG_P (XEXP (XEXP (ind, 1), 1))
+  && REG_P (XEXP (ind, 0))
+  && rtx_equal_p (XEXP (ind, 0), XEXP (XEXP (ind, 1), 0)))
return true;

 /* Match:
--- gcc/testsuite/gcc.target/arm/pr97528.c.jj   2020-11-18
17:09:58.195053288 +0100
+++ gcc/testsuite/gcc.target/arm/pr97528.c  2020-11-18
17:09:47.839168237 +0100
@@ -0,0 +1,28 @@
+/* PR target/97528 */
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O1" }  */
+/* { dg-add-options arm_neon } */
+
+#include 
+
+typedef __simd64_int16_t T;
+typedef __simd64_uint16_t U;
+unsigned short c;
+int d;
+U e;
+
+void
+foo (void)
+{
+  unsigned short *dst = &c;
+  int g = d, b = 4;
+  U dc = e;
+  for (int h = 0; h < b; h++)
+{
+  unsigned short *i = dst;
+  U j = dc;
+  vst1_s16 ((int16_t *) i, (T) j);
+  dst += g;
+}
+}


Jakub


Re: [PATCH] arm: Fix up neon_vector_mem_operand [PR97528]

2021-02-02 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This is a gcc-9 backport of the PR97528 fix that has been applied to 
trunk and gcc-10.

Bootstrapped on arm-linux-gnueabihf and regression tested.

OK for gcc-9 branch?

2021-02-02  Andre Vieira  

    Backport from mainline
    2020-11-20  Jakub Jelinek  

    PR target/97528
    * config/arm/arm.c (neon_vector_mem_operand): For POST_MODIFY, require
    first POST_MODIFY operand is a REG and is equal to the first operand
    of PLUS.

    * gcc.target/arm/pr97528.c: New test.

On 20/11/2020 11:25, Kyrylo Tkachov via Gcc-patches wrote:



-Original Message-
From: Jakub Jelinek 
Sent: 19 November 2020 18:57
To: Richard Earnshaw ; Ramana
Radhakrishnan ; Kyrylo Tkachov

Cc: gcc-patches@gcc.gnu.org
Subject: [PATCH] arm: Fix up neon_vector_mem_operand [PR97528]

Hi!

The documentation for POST_MODIFY says:
Currently, the compiler can only handle second operands of the
form (plus (reg) (reg)) and (plus (reg) (const_int)), where
the first operand of the PLUS has to be the same register as
the first operand of the *_MODIFY.
The following testcase ICEs, because combine just attempts to simplify
things and ends up with
(post_modify (reg1) (plus (mult (reg2) (const_int 4)) (reg1))
but the target predicates accept it, because they only verify
that POST_MODIFY's second operand is PLUS and the second operand
of the PLUS is a REG.

The following patch fixes this by performing further verification that
the POST_MODIFY is in the form it should be.

Bootstrapped/regtested on armv7hl-linux-gnueabi, ok for trunk
and release branches after a while?

Ok.
Thanks,
Kyrill


2020-11-19  Jakub Jelinek  

PR target/97528
* config/arm/arm.c (neon_vector_mem_operand): For
POST_MODIFY, require
first POST_MODIFY operand is a REG and is equal to the first operand
of PLUS.

* gcc.target/arm/pr97528.c: New test.

--- gcc/config/arm/arm.c.jj 2020-11-13 19:00:46.729620560 +0100
+++ gcc/config/arm/arm.c2020-11-18 17:05:44.656867343 +0100
@@ -13429,7 +13429,9 @@ neon_vector_mem_operand (rtx op, int typ
/* Allow post-increment by register for VLDn */
if (type == 2 && GET_CODE (ind) == POST_MODIFY
&& GET_CODE (XEXP (ind, 1)) == PLUS
-  && REG_P (XEXP (XEXP (ind, 1), 1)))
+  && REG_P (XEXP (XEXP (ind, 1), 1))
+  && REG_P (XEXP (ind, 0))
+  && rtx_equal_p (XEXP (ind, 0), XEXP (XEXP (ind, 1), 0)))
   return true;

/* Match:
--- gcc/testsuite/gcc.target/arm/pr97528.c.jj   2020-11-18
17:09:58.195053288 +0100
+++ gcc/testsuite/gcc.target/arm/pr97528.c  2020-11-18
17:09:47.839168237 +0100
@@ -0,0 +1,28 @@
+/* PR target/97528 */
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O1" }  */
+/* { dg-add-options arm_neon } */
+
+#include 
+
+typedef __simd64_int16_t T;
+typedef __simd64_uint16_t U;
+unsigned short c;
+int d;
+U e;
+
+void
+foo (void)
+{
+  unsigned short *dst = &c;
+  int g = d, b = 4;
+  U dc = e;
+  for (int h = 0; h < b; h++)
+{
+  unsigned short *i = dst;
+  U j = dc;
+  vst1_s16 ((int16_t *) i, (T) j);
+  dst += g;
+}
+}


Jakub
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
04edd637d43198ad801bb5ada8f1807faae7001e..4679da75dd823778d5a3e37c497ee10793e9c7d7
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12730,7 +12730,9 @@ neon_vector_mem_operand (rtx op, int type, bool strict)
   /* Allow post-increment by register for VLDn */
   if (type == 2 && GET_CODE (ind) == POST_MODIFY
   && GET_CODE (XEXP (ind, 1)) == PLUS
-  && REG_P (XEXP (XEXP (ind, 1), 1)))
+  && REG_P (XEXP (XEXP (ind, 1), 1))
+  && REG_P (XEXP (ind, 0))
+  && rtx_equal_p (XEXP (ind, 0), XEXP (XEXP (ind, 1), 0)))
  return true;
 
   /* Match:
diff --git a/gcc/testsuite/gcc.target/arm/pr97528.c 
b/gcc/testsuite/gcc.target/arm/pr97528.c
new file mode 100644
index 
..6cc59f2158c5f8c8dd78e5083ca7ebc4e5f63a44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr97528.c
@@ -0,0 +1,28 @@
+/* PR target/97528 */
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O1" }  */
+/* { dg-add-options arm_neon } */
+
+#include 
+
+typedef __simd64_int16_t T;
+typedef __simd64_uint16_t U;
+unsigned short c;
+int d;
+U e;
+
+void
+foo (void)
+{
+  unsigned short *dst = &c;
+  int g = d, b = 4;
+  U dc = e;
+  for (int h = 0; h < b; h++)
+{
+  unsigned short *i = dst;
+  U j = dc;
+  vst1_s16 ((int16_t *) i, (T) j);
+  dst += g;
+}
+}


Re: RFC: ARM MVE and Neon auto-vectorization

2020-12-09 Thread Andre Vieira (lists) via Gcc-patches



On 08/12/2020 13:50, Christophe Lyon via Gcc-patches wrote:

Hi,


My 'vand' patch changes the definition of VDQ so that the relevant
modes are enabled only when !TARGET_HAVE_MVE (V8QI, ...), and this
helps writing a simpler expander.

However, vneg is used by vshr (right-shifts by register are
implemented as left-shift by negation of that register), so the
expander uses something like:

   emit_insn (gen_neg2 (neg, operands[2]));
   if (TARGET_NEON)
   emit_insn (gen_ashl3_signed (operands[0], operands[1], neg));
   else
   emit_insn (gen_mve_vshlq_s (operands[0], operands[1], neg));

which does not work if the iterator has conditional members: the
'else' part is still generated for modes unsupported by MVE.

So I guess my question is:  do we want to enforce implementation
of Neon / MVE common parts? There are already lots of partly
overlapping/duplicate iterators. I have tried to split iterators into
eg VDQ_COMMON_TO_NEON_AND_MVE and VDQ_NEON_ONLY but this means we have
to basically duplicate the expanders which defeats the point...
Ideally I think we'd want a minimal number of iterators and defines, which 
was the idea behind the conditional iterators disabling 64-bit modes for 
MVE.


Obviously that then breaks the code above. For this specific case I 
would suggest unifying the define_insns ashl3_{signed,unsigned} and 
mve_vshlq_, as they are very much the same patterns; I also 
don't understand why ashl's signed and unsigned variants are separate. For 
instance, create an 'ashl3__' or something like that, and make 
sure the calls to gen_ashl33_{unsigned,signed} now call 
gen_ashl3__, and that arm_mve_builtins.def uses 
ashl3__ instead of this;  needs to be at the end of 
the name for the builtin construct. Whether this 'form' would work 
everywhere, I don't know, and I suspect you might find more issues like 
this. If there are more than you are willing to change right now then 
maybe the easier step forward is to tackle them one at a time, 
and use a new conditional iterator where you've been able to merge NEON 
and MVE patterns.


As a general strategy I think we should try to clean the mess up, but I 
don't think we should try to clean it all up in one go as that will 
probably lead to it not getting done at all. I'm not the maintainer, so 
I'd be curious to see how Kyrill feels about this, but in my opinion we 
should take patches that don't make it less maintainable, so if you can 
clean it up as much as possible, great! Otherwise, if it's not making the 
mess bigger and it's enabling auto-vec, then I personally don't see why it 
shouldn't be accepted.

Or we can keep different expanders for Neon and MVE? But we already have
quite a few in vec-common.md.
We can't keep different expanders if they expand the same optab with the 
same modes in the same backend. So we will always have to make NEON and 
MVE work together.


Re: [PATCH 1/7] arm: Auto-vectorization for MVE: vand

2020-11-27 Thread Andre Vieira (lists) via Gcc-patches

Hi Christophe,

On 26/11/2020 15:31, Christophe Lyon wrote:

Hi Andre,

Thanks for the quick feedback.

On Wed, 25 Nov 2020 at 18:17, Andre Simoes Dias Vieira
 wrote:

Hi Christophe,

Thanks for these! See some inline comments.

On 25/11/2020 13:54, Christophe Lyon via Gcc-patches wrote:

This patch enables MVE vandq instructions for auto-vectorization.  MVE
vandq insns in mve.md are modified to use 'and' instead of unspec
expression to support and3.  The and3 expander is added to
vec-common.md

2020-11-12  Christophe Lyon  

   gcc/
   * gcc/config/arm/iterators.md (supf): Remove VANDQ_S and VANDQ_U.
   (VANQ): Remove.
   * config/arm/mve.md (mve_vandq_u): New entry for vand
   instruction using expression and.
   (mve_vandq_s): New expander.
   * config/arm/neon.md (and3): Renamed into and3_neon.
   * config/arm/unspecs.md (VANDQ_S, VANDQ_U): Remove.
   * config/arm/vec-common.md (and3): New expander.

   gcc/testsuite/
   * gcc.target/arm/simd/mve-vand.c: New test.
---
   gcc/config/arm/iterators.md  |  4 +---
   gcc/config/arm/mve.md| 20 
   gcc/config/arm/neon.md   |  2 +-
   gcc/config/arm/unspecs.md|  2 --
   gcc/config/arm/vec-common.md | 15 
   gcc/testsuite/gcc.target/arm/simd/mve-vand.c | 34 

   6 files changed, 66 insertions(+), 11 deletions(-)
   create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vand.c

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 592af35..72039e4 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1232,8 +1232,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") 
(VREV16Q_S "s")
  (VADDLVQ_P_U "u") (VCMPNEQ_U "u") (VCMPNEQ_S "s")
  (VABDQ_M_S "s") (VABDQ_M_U "u") (VABDQ_S "s")
  (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
-(VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s")
-(VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
+(VADDVQ_P_S "s") (VADDVQ_P_U "u") (VBICQ_S "s") (VBICQ_U 
"u")
  (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
  (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
  (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
@@ -1501,7 +1500,6 @@ (define_int_iterator VABDQ [VABDQ_S VABDQ_U])
   (define_int_iterator VADDQ_N [VADDQ_N_S VADDQ_N_U])
   (define_int_iterator VADDVAQ [VADDVAQ_S VADDVAQ_U])
   (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
-(define_int_iterator VANDQ [VANDQ_U VANDQ_S])
   (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
   (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
   (define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index ecbaaa9..975eb7d 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -894,17 +894,27 @@ (define_insn "mve_vaddvq_p_"
   ;;
   ;; [vandq_u, vandq_s])
   ;;
-(define_insn "mve_vandq_"
+;; signed and unsigned versions are the same: define the unsigned
+;; insn, and use an expander for the signed one as we still reference
+;; both names from arm_mve.h.
+(define_insn "mve_vandq_u"
 [
  (set (match_operand:MVE_2 0 "s_register_operand" "=w")
- (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
-(match_operand:MVE_2 2 "s_register_operand" "w")]
-  VANDQ))
+ (and:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
+(match_operand:MVE_2 2 "s_register_operand" "w")))

The predicate on the second operand is more restrictive than the one in
the expander ('neon_inv_logic_op2'). This should still work with immediates;
well, I checked for integers, and it generates a loop like this:


Right, thanks for catching it.


  vldrw.32  q3, [r0]
  vldr.64   d4, .L8
  vldr.64   d5, .L8+8
  vand      q3, q3, q2
  vstrw.32  q3, [r2]

MVE does support vand with immediates, just like Neon; I suspect you
could just copy the way Neon handles these, which is possibly worth the
little extra effort. You can use dest[i] = a[i] & ~1 as a testcase.
If you don't, it might still be worth expanding the test to make sure
other immediate/type combinations don't trigger an ICE?
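
A minimal sketch of such a testcase (assumed shape, not a committed test) 
would be:

  void
  foo (int *restrict dest, int *restrict a, int n)
  {
    for (int i = 0; i < n; i++)
      dest[i] = a[i] & ~1; /* the constant operand should hit the immediate alternative  */
  }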

I'm not sure I understand why it loads the constant in two 64-bit chunks
rather than doing a single load, or just something like a vmov or vbic
immediate. Anyhow, that's a worry for another day, I guess...

Do you mean something like the attached (on top of this patch)?
I dislike the code duplication in mve_vandq_u which would
become a copy of and3_neon.

Hi Christophe,

Yeah, that's what I meant. I agree about the code duplication concern. The reason 
we still use separate ones is the difference in supported 
modes. Maybe the right way around tha

Re: [PATCH 3/7] arm: Auto-vectorization for MVE: veor

2020-11-26 Thread Andre Vieira (lists) via Gcc-patches

LGTM,  but please wait for maintainer review.

On 25/11/2020 13:54, Christophe Lyon via Gcc-patches wrote:

This patch enables MVE veorq instructions for auto-vectorization.  MVE
veorq insns in mve.md are modified to use xor instead of unspec
expression to support xor3.  The xor3 expander is added to
vec-common.md

2020-11-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (supf): Remove VEORQ_S and VEORQ_U.
(VEORQ): Remove.
* config/arm/mve.md (mve_veorq_u): New entry for veor
instruction using expression xor.
(mve_veorq_s): New expander.
* config/arm/neon.md (xor3): Renamed into xor3_neon.
* config/arm/unspscs.md (VEORQ_S, VEORQ_U): Remove.
* config/arm/vec-common.md (xor3): New expander.

gcc/testsuite/
* gcc.target/arm/simd/mve-veor.c: Add tests for veor.
---
  gcc/config/arm/iterators.md  |  3 +--
  gcc/config/arm/mve.md| 17 ++
  gcc/config/arm/neon.md   |  2 +-
  gcc/config/arm/unspecs.md|  2 --
  gcc/config/arm/vec-common.md | 15 
  gcc/testsuite/gcc.target/arm/simd/mve-veor.c | 34 
  6 files changed, 63 insertions(+), 10 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-veor.c

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 5fcb7af..0195275 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1237,7 +1237,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") 
(VREV16Q_S "s")
   (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
   (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
   (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s")
-  (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u")
+  (VCMPNEQ_N_U "u")
   (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
   (VHADDQ_U "u") (VHSUBQ_N_S "s")  (VHSUBQ_N_U "u")
   (VHSUBQ_S "s") (VMAXQ_S "s") (VMAXQ_U "u") (VHSUBQ_U "u")
@@ -1507,7 +1507,6 @@ (define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U 
VCADDQ_ROT90_S])
  (define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S])
  (define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U])
  (define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S])
-(define_int_iterator VEORQ [VEORQ_U VEORQ_S])
  (define_int_iterator VHADDQ [VHADDQ_S VHADDQ_U])
  (define_int_iterator VHADDQ_N [VHADDQ_N_U VHADDQ_N_S])
  (define_int_iterator VHSUBQ [VHSUBQ_S VHSUBQ_U])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 0f04044..a5f5d75 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1204,17 +1204,24 @@ (define_insn "mve_vcmpneq_n_"
  ;;
  ;; [veorq_u, veorq_s])
  ;;
-(define_insn "mve_veorq_"
+(define_insn "mve_veorq_u"
[
 (set (match_operand:MVE_2 0 "s_register_operand" "=w")
-   (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
-  (match_operand:MVE_2 2 "s_register_operand" "w")]
-VEORQ))
+   (xor:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
+  (match_operand:MVE_2 2 "s_register_operand" "w")))
]
"TARGET_HAVE_MVE"
-  "veor %q0, %q1, %q2"
+  "veor\t%q0, %q1, %q2"
[(set_attr "type" "mve_move")
  ])
+(define_expand "mve_veorq_s"
+  [
+   (set (match_operand:MVE_2 0 "s_register_operand")
+   (xor:MVE_2 (match_operand:MVE_2 1 "s_register_operand")
+  (match_operand:MVE_2 2 "s_register_operand")))
+  ]
+  "TARGET_HAVE_MVE"
+)
  
  ;;

  ;; [vhaddq_n_u, vhaddq_n_s])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 669c34d..e1263b0 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -747,7 +747,7 @@ (define_insn "bic3_neon"
[(set_attr "type" "neon_logic")]
  )
  
-(define_insn "xor3"

+(define_insn "xor3_neon"
[(set (match_operand:VDQ 0 "s_register_operand" "=w")
(xor:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
 (match_operand:VDQ 2 "s_register_operand" "w")))]
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index f111ad8..78313ea 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -608,7 +608,6 @@ (define_c_enum "unspec" [
VCMPEQQ_S
VCMPEQQ_N_S
VCMPNEQ_N_S
-  VEORQ_S
VHADDQ_S
VHADDQ_N_S
VHSUBQ_S
@@ -653,7 +652,6 @@ (define_c_enum "unspec" [
VCMPEQQ_U
VCMPEQQ_N_U
VCMPNEQ_N_U
-  VEORQ_U
VHADDQ_U
VHADDQ_N_U
VHSUBQ_U
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 413fb07..687134a 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -202,3 +202,18 @@ (define_expand "ior3"
  (match_operand:VNINOTM1 2 "neon_logic_op2" "")))]
"TARGET_NEON"
  )
+
+(define_expand "xor3"
+  [(set (

Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-07-20 Thread Andre Vieira (lists)



On 08/07/2020 09:04, Andre Simoes Dias Vieira wrote:


On 07/07/2020 13:43, Christophe Lyon wrote:

Hi,


On Mon, 6 Jul 2020 at 16:31, Andre Vieira (lists)
 wrote:


On 30/06/2020 14:50, Andre Vieira (lists) wrote:

On 29/06/2020 11:15, Christophe Lyon wrote:

On Mon, 29 Jun 2020 at 10:56, Andre Vieira (lists)
 wrote:

On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-Original Message-
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber
callee saved
registers with CMSE

Hi,

As reported in bugzilla, when the -mcmse option is used while compiling
for size (-Os) with a Thumb-1 target, the generated code will clear the
registers r7-r10. These, however, are callee-saved and should be preserved
across ABI boundaries. This happens because these registers are made
"fixed" when optimising for size with Thumb-1, in a way to make sure they
are not used, as pushing and popping hi-registers requires extra moves to
and from LO_REGS.
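
For illustration (an assumed shape, not the committed pr95646.c test), the
epilogue of an entry function like the following must not clear r7-r10 when
built with -mcmse -Os for such a target:

  int __attribute__ ((cmse_nonsecure_entry))
  secure_add (int a, int b)
  {
    return a + b; /* register clearing on return must skip callee-saved regs  */
  }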

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call-used
registers into account, which aren't callee-saved in my opinion, so it
is rather a misnomer; it works to our advantage here though, as it does
exactly what we need.
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous
versions if
stable.)

Ok.
Thanks,
Kyrill

As I was getting ready to push this I noticed I didn't add any
skip-ifs
to prevent this failing with specific target options. So here's 
a new

version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built 
with

non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with 
selected FPU

cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os

Resending as I don't think my earlier one made it to the lists
(sorry if
you are receiving this double!)

I'm not following this, before I go off and try to reproduce it,
what do
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse
-Os'?
These are the options you are seeing in the log file? Surely they
should
override the default options? Only thing I can think of is this 
might

need an extra -mfloat-abi=soft to make sure it overrides the default
float-abi.  Could you give that a try?

No it doesn't make a difference alone.

I also had to add:
-mfpu=auto (that clears the above warning)
-mthumb otherwise we now get cc1: error: target CPU does not support
ARM mode

Looks like some effective-target machinery is needed

So I had a look at this; I was pretty sure that -mfloat-abi=soft
overrode -mfpu=<>, which it largely does, in that no FP instructions
will be generated, but the error you see only checks for the right
number of FP registers and doesn't check whether
'TARGET_HARD_FLOAT' is set. I'll fix this too and use the
check-effective-target for armv8-m.base for this test as it is indeed
a better approach than my bag of skip-ifs. I'm testing it locally to
make sure my changes don't break anything.

Cheers,
Andre

Hi,

Sorry for the delay. So I changed the test to use the effective-target
machinery as you suggested and I also made sure that you don't get the
"ARMv8-M Security Extensions incompatible with selected FPU" when
-mfloat-abi=soft.
Further changed 'asm' to '__asm__' to avoid failures with '-std=' 
options.


Regression tested on arm-none-eabi.
@Christophe: could you test this for your configuration, shouldn't fail
anymore!


Indeed with your patch I don't see any failure with pr95646.c

Note that it is still unsupported with arm-eabi when running the tests
with -mcpu=cortex-mXX
because the compiler complains that -mcpu=cortex-mXX conflicts with
-march=armv8-m.base,
thus the effective-target test fails.

BTW, is that warning useful/practical? Wouldn't it be more convenient
if the last -mcpu/-march
on the command line was the only one taken into account? (I had a
similar issue when
running tests (libstdc++) getting -march=armv8-m.main+fp from their
multilib environment
and forcing -mcpu=cortex-m33 because it also means '+dsp' and produces
a warning;
I had to use -mcpu=cortex-m33 -march=armv8-m.main+fp+dsp to 
work around this)
Yeah I've been annoyed by that before, also in the context of testing 
multilibs.


Even

Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-07-06 Thread Andre Vieira (lists)


On 30/06/2020 14:50, Andre Vieira (lists) wrote:


On 29/06/2020 11:15, Christophe Lyon wrote:

On Mon, 29 Jun 2020 at 10:56, Andre Vieira (lists)
 wrote:


On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-Original Message-
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber 
callee saved

registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while 
compiling
for size (-Os) with a thumb-1 target the generated code will 
clear the
registers r7-r10. These however are callee saved and should be 
preserved

across ABI boundaries. The reason this happens is because these
registers are made "fixed" when optimising for size with Thumb-1 
in a
way to make sure they are not used, as pushing and popping 
hi-registers

requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which 
accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be 
aware of

'callee_saved_reg_p''s definition, as it does still take call used
registers into account, which aren't callee_saved in my opinion, 
so it
is rather a misnomer; it works to our advantage here though, as it
does

exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous 
versions if

stable.)

Ok.
Thanks,
Kyrill
As I was getting ready to push this I noticed I didn't add any 
skip-ifs

to prevent this failing with specific target options. So here's a new
version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built with
non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with selected FPU
cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os
Resending as I don't think my earlier one made it to the lists 
(sorry if

you are receiving this double!)

I'm not following this, before I go off and try to reproduce it, 
what do
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse 
-Os'?
These are the options you are seeing in the log file? Surely they 
should

override the default options? Only thing I can think of is this might
need an extra -mfloat-abi=soft to make sure it overrides the default
float-abi.  Could you give that a try?

No it doesn't make a difference alone.

I also had to add:
-mfpu=auto (that clears the above warning)
-mthumb otherwise we now get cc1: error: target CPU does not support 
ARM mode


Looks like some effective-target machinery is needed
So I had a look at this; I was pretty sure that -mfloat-abi=soft
overrode -mfpu=<>, which it largely does, in that no FP instructions
will be generated, but the error you see only checks for the right
number of FP registers and doesn't check whether
'TARGET_HARD_FLOAT' is set. I'll fix this too and use the
check-effective-target for armv8-m.base for this test as it is indeed 
a better approach than my bag of skip-ifs. I'm testing it locally to 
make sure my changes don't break anything.


Cheers,
Andre

Hi,

Sorry for the delay. So I changed the test to use the effective-target 
machinery as you suggested and I also made sure that you don't get the 
"ARMv8-M Security Extensions incompatible with selected FPU" when 
-mfloat-abi=soft.

Further changed 'asm' to '__asm__' to avoid failures with '-std=' options.

Regression tested on arm-none-eabi.
@Christophe: could you test this for your configuration, shouldn't fail 
anymore!


Is this OK for trunk?

Cheers,
Andre

gcc/ChangeLog:
2020-07-06  Andre Vieira  

    * config/arm/arm.c (arm_options_perform_arch_sanity_checks): Do not
    check +D32 for CMSE if -mfloat-abi=soft

gcc/testsuite/ChangeLog:
2020-07-06  Andre Vieira  

    * gcc.target/arm/pr95646.c: Fix testism.


Christophe



Cheers,
Andre

Christophe


Cheers,
Andre

Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira 

    PR target/95646
    * config/arm/arm.c: 
(cmse_nonsecure_entry_clear_before_return):

Use 'callee_saved_reg_p' instead of
    'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira 

    PR target/95646
    * gcc.target/arm/pr95646.c: New test.
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
dac9a6fb5c41ce42cd7a278b417eab25239a043c..38500220bfb2a1ddbbff15eb552451701f7256d5
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@

Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-30 Thread Andre Vieira (lists)



On 29/06/2020 11:15, Christophe Lyon wrote:

On Mon, 29 Jun 2020 at 10:56, Andre Vieira (lists)
 wrote:


On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-Original Message-
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved
registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while compiling
for size (-Os) with a thumb-1 target the generated code will clear the
registers r7-r10. These however are callee saved and should be preserved
across ABI boundaries. The reason this happens is because these
registers are made "fixed" when optimising for size with Thumb-1 in a
way to make sure they are not used, as pushing and popping hi-registers
requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call used
registers into account, which aren't callee_saved in my opinion, so it
is rather a misnomer; it works to our advantage here though, as it does
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if
stable.)

Ok.
Thanks,
Kyrill

As I was getting ready to push this I noticed I didn't add any skip-ifs
to prevent this failing with specific target options. So here's a new
version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built with
non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with selected FPU
cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os

Resending as I don't think my earlier one made it to the lists (sorry if
you are receiving this double!)

I'm not following this, before I go off and try to reproduce it, what do
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os'?
These are the options you are seeing in the log file? Surely they should
override the default options? Only thing I can think of is this might
need an extra -mfloat-abi=soft to make sure it overrides the default
float-abi.  Could you give that a try?

No it doesn't make a difference alone.

I also had to add:
-mfpu=auto (that clears the above warning)
-mthumb otherwise we now get cc1: error: target CPU does not support ARM mode

Looks like some effective-target machinery is needed
So I had a look at this; I was pretty sure that -mfloat-abi=soft
overrode -mfpu=<>, which it largely does, in that no FP instructions
will be generated, but the error you see only checks for the right number
of FP registers and doesn't check whether 'TARGET_HARD_FLOAT' is set.
I'll fix this too and use the check-effective-target for
armv8-m.base for this test as it is indeed a better approach than my bag 
of skip-ifs. I'm testing it locally to make sure my changes don't break 
anything.


Cheers,
Andre


Christophe



Cheers,
Andre

Christophe


Cheers,
Andre

Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

PR target/95646
* config/arm/arm.c: (cmse_nonsecure_entry_clear_before_return):
Use 'callee_saved_reg_p' instead of
'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

PR target/95646
* gcc.target/arm/pr95646.c: New test.


Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-29 Thread Andre Vieira (lists)



On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-Original Message-
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved
registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while compiling
for size (-Os) with a thumb-1 target the generated code will clear the
registers r7-r10. These however are callee saved and should be preserved
across ABI boundaries. The reason this happens is because these
registers are made "fixed" when optimising for size with Thumb-1 in a
way to make sure they are not used, as pushing and popping hi-registers
requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call used
registers into account, which aren't callee_saved in my opinion, so it
is rather a misnomer; it works to our advantage here though, as it does
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if
stable.)

Ok.
Thanks,
Kyrill

As I was getting ready to push this I noticed I didn't add any skip-ifs
to prevent this failing with specific target options. So here's a new
version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built with
non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with selected FPU
cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os
I'm not following this, before I go off and try to reproduce it, what do 
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os'? 
These are the options you are seeing in the log file? Surely they should 
override the default options? Only thing I can think of is this might 
need an extra -mfloat-abi=soft to make sure it overrides the default 
float-abi.  Could you give that a try?


Cheers,
Andre


Christophe


Cheers,
Andre

Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

   PR target/95646
   * config/arm/arm.c: (cmse_nonsecure_entry_clear_before_return):
Use 'callee_saved_reg_p' instead of
   'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

   PR target/95646
   * gcc.target/arm/pr95646.c: New test.


Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-29 Thread Andre Vieira (lists)



On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-Original Message-
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved
registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while compiling
for size (-Os) with a thumb-1 target the generated code will clear the
registers r7-r10. These however are callee saved and should be preserved
across ABI boundaries. The reason this happens is because these
registers are made "fixed" when optimising for size with Thumb-1 in a
way to make sure they are not used, as pushing and popping hi-registers
requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call used
registers into account, which aren't callee_saved in my opinion, so it
is rather a misnomer; it works to our advantage here though, as it does
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if
stable.)

Ok.
Thanks,
Kyrill

As I was getting ready to push this I noticed I didn't add any skip-ifs
to prevent this failing with specific target options. So here's a new
version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built with
non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with selected FPU
cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os
Resending as I don't think my earlier one made it to the lists (sorry if 
you are receiving this double!)


I'm not following this, before I go off and try to reproduce it, what do 
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os'? 
These are the options you are seeing in the log file? Surely they should 
override the default options? Only thing I can think of is this might 
need an extra -mfloat-abi=soft to make sure it overrides the default 
float-abi.  Could you give that a try?


Cheers,
Andre


Christophe


Cheers,
Andre

Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

   PR target/95646
   * config/arm/arm.c: (cmse_nonsecure_entry_clear_before_return):
Use 'callee_saved_reg_p' instead of
   'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

   PR target/95646
   * gcc.target/arm/pr95646.c: New test.


Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-23 Thread Andre Vieira (lists)

On 23/06/2020 13:10, Kyrylo Tkachov wrote:



-Original Message-
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved
registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while compiling
for size (-Os) with a thumb-1 target the generated code will clear the
registers r7-r10. These however are callee saved and should be preserved
across ABI boundaries. The reason this happens is because these
registers are made "fixed" when optimising for size with Thumb-1 in a
way to make sure they are not used, as pushing and popping hi-registers
requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call used
registers into account, which aren't callee_saved in my opinion, so it
is rather a misnomer; it works to our advantage here though, as it does
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if
stable.)

Ok.
Thanks,
Kyrill
As I was getting ready to push this I noticed I didn't add any skip-ifs 
to prevent this failing with specific target options. So here's a new 
version with those.


Still OK?

Cheers,
Andre



Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

      PR target/95646
      * config/arm/arm.c: (cmse_nonsecure_entry_clear_before_return):
Use 'callee_saved_reg_p' instead of
      'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

      PR target/95646
      * gcc.target/arm/pr95646.c: New test.
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
6b7ca829f1c8cbe3d427da474b079882dc522e1a..dac9a6fb5c41ce42cd7a278b417eab25239a043c
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -26960,7 +26960,7 @@ cmse_nonsecure_entry_clear_before_return (void)
continue;
   if (IN_RANGE (regno, IP_REGNUM, PC_REGNUM))
continue;
-  if (call_used_or_fixed_reg_p (regno)
+  if (!callee_saved_reg_p (regno)
  && (!IN_RANGE (regno, FIRST_VFP_REGNUM, LAST_VFP_REGNUM)
  || TARGET_HARD_FLOAT))
bitmap_set_bit (to_clear_bitmap, regno);
diff --git a/gcc/testsuite/gcc.target/arm/pr95646.c 
b/gcc/testsuite/gcc.target/arm/pr95646.c
new file mode 100644
index 
..12d06a0c8c1ed7de1f8d4d15130432259e613a32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr95646.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-march=*" } { "-march=armv8-m.base" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-mcpu=*" } { "-mcpu=cortex-m23" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-mfpu=*" } { } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=soft" } } */
+/* { dg-options "-mcpu=cortex-m23 -mcmse" } */
+/* { dg-additional-options "-Os" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+int __attribute__ ((cmse_nonsecure_entry))
+foo (void)
+{
+  return 1;
+}
+/* { dg-final { scan-assembler-not "mov\tr9, r0" } } */
+
+/*
+** __acle_se_bar:
+** mov (r[0-3]), r9
+** push{\1}
+** ...
+** pop {(r[0-3])}
+** mov r9, \2
+** ...
+** bxnslr
+*/
+int __attribute__ ((cmse_nonsecure_entry))
+bar (void)
+{
+  asm ("": : : "r9");
+  return 1;
+}


[PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-22 Thread Andre Vieira (lists)

Hi,

As reported in bugzilla when the -mcmse option is used while compiling 
for size (-Os) with a thumb-1 target the generated code will clear the 
registers r7-r10. These however are callee saved and should be preserved 
across ABI boundaries. The reason this happens is because these
registers are made "fixed" when optimising for size with Thumb-1 in a 
way to make sure they are not used, as pushing and popping hi-registers 
requires extra moves to and from LO_REGS.


To fix this, this patch uses 'callee_saved_reg_p', which accounts for 
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of 
'callee_saved_reg_p''s definition, as it does still take call used 
registers into account, which aren't callee_saved in my opinion, so it 
is rather a misnomer; it works to our advantage here though, as it
exactly what we need.


Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if 
stable.)


Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

    PR target/95646
    * config/arm/arm.c: (cmse_nonsecure_entry_clear_before_return): 
Use 'callee_saved_reg_p' instead of

    'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

    PR target/95646
    * gcc.target/arm/pr95646.c: New test.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
6b7ca829f1c8cbe3d427da474b079882dc522e1a..dac9a6fb5c41ce42cd7a278b417eab25239a043c
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -26960,7 +26960,7 @@ cmse_nonsecure_entry_clear_before_return (void)
continue;
   if (IN_RANGE (regno, IP_REGNUM, PC_REGNUM))
continue;
-  if (call_used_or_fixed_reg_p (regno)
+  if (!callee_saved_reg_p (regno)
  && (!IN_RANGE (regno, FIRST_VFP_REGNUM, LAST_VFP_REGNUM)
  || TARGET_HARD_FLOAT))
bitmap_set_bit (to_clear_bitmap, regno);
diff --git a/gcc/testsuite/gcc.target/arm/pr95646.c 
b/gcc/testsuite/gcc.target/arm/pr95646.c
new file mode 100644
index 
..c9fdc37618ccaddcdb597647c7076054af17789a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr95646.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -Os -mcpu=cortex-m23" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+int __attribute__ ((cmse_nonsecure_entry))
+foo (void)
+{
+  return 1;
+}
+/* { dg-final { scan-assembler-not "mov\tr9, r0" } } */
+
+/*
+** __acle_se_bar:
+** mov (r[0-3]), r9
+** push{\1}
+** ...
+** pop {(r[0-3])}
+** mov r9, \2
+** ...
+** bxnslr
+*/
+int __attribute__ ((cmse_nonsecure_entry))
+bar (void)
+{
+  asm ("": : : "r9");
+  return 1;
+}


Re: [RFC][vect] BB SLP reduction prototype

2020-06-09 Thread Andre Vieira (lists)
The 'you' here is Richi, which Richi is probably aware of, but maybe not the
rest of the list :')


On 09/06/2020 15:29, Andre Vieira (lists) wrote:

Hi,

So this is my rework of the code you sent me, I have not included the 
'permute' code you included as I can't figure out what it is meant to 
be doing. Maybe something to look at later.


I have also included three tests that show it working for some simple 
cases and even a nested one.


Unfortunately it will not handle other simple cases where reassoc 
doesn't put the reduction in the form of :

sum0 = a + b;
sum1 = c + sum0;
...

For instance a testcase I have been looking at is:
unsigned int u32_single_abs_sum (unsigned int * a, unsigned int * b)
{
  unsigned int sub0 = a[0] - b[0];
  unsigned int sub1 = a[1] - b[1];
  unsigned int sub2 = a[2] - b[2];
  unsigned int sub3 = a[3] - b[3];
  unsigned int sum = sub0 + sub1;
  sum += sub2;
  sum += sub3;
  return sum;
}

Unfortunately, the code that reaches slp will look like:
  _1 = *a_10(D);
  _2 = *b_11(D);
  _3 = MEM[(unsigned int *)a_10(D) + 4B];
  _4 = MEM[(unsigned int *)b_11(D) + 4B];
  _5 = MEM[(unsigned int *)a_10(D) + 8B];
  _6 = MEM[(unsigned int *)b_11(D) + 8B];
  _7 = MEM[(unsigned int *)a_10(D) + 12B];
  _8 = MEM[(unsigned int *)b_11(D) + 12B];
  _28 = _1 - _2;
  _29 = _3 + _28;
  _30 = _29 - _4;
  _31 = _5 + _30;
  _32 = _31 - _6;
  _33 = _7 + _32;
  sum_18 = _33 - _8;
  return sum_18;

Which doesn't have the format expected as I described above... I am 
wondering how to teach it to support this. Maybe starting with your 
suggestion of making plus_expr and minus_expr have the same hash, so 
it groups all these statements together might be a start, but you'd 
still need to 'rebalance' the tree somehow. I need to give this a
bit more thought, but I wanted to share what I have so far.


The code is severely lacking in comments for now btw...

Cheers,
Andre



[RFC][vect] BB SLP reduction prototype

2020-06-09 Thread Andre Vieira (lists)

Hi,

So this is my rework of the code you sent me, I have not included the 
'permute' code you included as I can't figure out what it is meant to be 
doing. Maybe something to look at later.


I have also included three tests that show it working for some simple 
cases and even a nested one.


Unfortunately it will not handle other simple cases where reassoc 
doesn't put the reduction in the form of :

sum0 = a + b;
sum1 = c + sum0;
...

For instance a testcase I have been looking at is:
unsigned int u32_single_abs_sum (unsigned int * a, unsigned int * b)
{
  unsigned int sub0 = a[0] - b[0];
  unsigned int sub1 = a[1] - b[1];
  unsigned int sub2 = a[2] - b[2];
  unsigned int sub3 = a[3] - b[3];
  unsigned int sum = sub0 + sub1;
  sum += sub2;
  sum += sub3;
  return sum;
}

Unfortunately, the code that reaches slp will look like:
  _1 = *a_10(D);
  _2 = *b_11(D);
  _3 = MEM[(unsigned int *)a_10(D) + 4B];
  _4 = MEM[(unsigned int *)b_11(D) + 4B];
  _5 = MEM[(unsigned int *)a_10(D) + 8B];
  _6 = MEM[(unsigned int *)b_11(D) + 8B];
  _7 = MEM[(unsigned int *)a_10(D) + 12B];
  _8 = MEM[(unsigned int *)b_11(D) + 12B];
  _28 = _1 - _2;
  _29 = _3 + _28;
  _30 = _29 - _4;
  _31 = _5 + _30;
  _32 = _31 - _6;
  _33 = _7 + _32;
  sum_18 = _33 - _8;
  return sum_18;

Which doesn't have the format expected as I described above... I am 
wondering how to teach it to support this. Maybe starting with your 
suggestion of making plus_expr and minus_expr have the same hash, so it 
groups all these statements together might be a start, but you'd still 
need to 'rebalance' the tree somehow. I need to give this a bit more
thought, but I wanted to share what I have so far.


The code is severely lacking in comments for now btw...

Cheers,
Andre

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-1.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-1.c
new file mode 100644
index 
..66b53ff9bb1e77414e7493c07ab87d46f4d33651
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-1.c
@@ -0,0 +1,66 @@
+/* { dg-require-effective-target vect_int } */
+#include 
+#include "tree-vect.h"
+extern int abs (int);
+
+#define ABS4(N)\
+  sum += abs (a[N]);   \
+  sum += abs (a[N+1]); \
+  sum += abs (a[N+2]); \
+  sum += abs (a[N+3]);
+
+#define ABS8(N)  \
+  ABS4(N)\
+  ABS4(N+4)
+
+#define ABS16(N)  \
+  ABS8(N)\
+  ABS8(N+8)
+
+__attribute__ ((noipa)) unsigned char
+u8_single_abs_sum (signed char * a)
+{
+  unsigned char sum = 0;
+  ABS16(0)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned short
+u16_single_abs_sum (signed short * a)
+{
+  unsigned short sum = 0;
+  ABS8(0)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned int
+u32_single_abs_sum (signed int * a)
+{
+  unsigned int sum = 0;
+  ABS4(0)
+  return sum;
+}
+
+signed char u8[16] = {0, 1, 2, 3, 4, 5, 6, -7, -8, -9, -10, -11, -12, -13,
+   -14, -15};
+signed short u16[8] = {0, 1, 2, 3, 4, -5, -6, -7};
+signed int u32[4] = {-10, -20, 30, 40};
+
+
+int main (void)
+{
+  check_vect ();
+
+  if (u8_single_abs_sum (&(u8[0])) != 120)
+abort ();
+
+  if (u16_single_abs_sum (&(u16[0])) != 28)
+abort ();
+
+  if (u32_single_abs_sum (&(u32[0])) != 100)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 3 "slp2" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-2.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-2.c
new file mode 100644
index 
..298a22cfef687f6634d61bf808a41942c3ce4a85
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-2.c
@@ -0,0 +1,82 @@
+/* { dg-require-effective-target vect_int } */
+#include 
+#include "tree-vect.h"
+extern int abs (int);
+
+#define ABS4(N)\
+  sum += abs (a[N]);   \
+  sum += abs (a[N+1]); \
+  sum += abs (a[N+2]); \
+  sum += abs (a[N+3]);
+
+#define ABS8(N)  \
+  ABS4(N)\
+  ABS4(N+4)
+
+#define ABS16(N)  \
+  ABS8(N)\
+  ABS8(N+8)
+
+__attribute__ ((noipa)) unsigned char
+u8_double_abs_sum (signed char * a)
+{
+  unsigned char sum = 0;
+  ABS16(0)
+  ABS16(16)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned short
+u16_double_abs_sum (signed short * a)
+{
+  unsigned short sum = 0;
+  ABS16(0)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned int
+u32_double_abs_sum (signed int * a)
+{
+  unsigned int sum = 0;
+  ABS8(0)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned int
+u32_triple_abs_sum (signed int * a)
+{
+  unsigned int sum = 0;
+  ABS8(0)
+  ABS4(8)
+  return sum;
+}
+
+signed char u8[32] = {0, 1, 2, 3, 4, 5, 6, -7, -8, -9, -10, -11, -12, -13,
+ -14, -15, 0, 1, 2, 3, 4, 5, 6, -7, -8, -9, -10, -11, -12, 
-13,
+ -14, -30};
+
+signed short u16[16] = {0, 1, 2, 3, 4, -5, -6, -7, 10, 20, 30, 40, -10, -20,
+  -30, -40};
+signed int u32[16] = {-10, -20, 30, 40, 100, 200, -300, -500, -600, -700, 1000,
+2000};
+
+i

[AArch64][GCC-8][GCC-9] Use __getauxval instead of getauxval in LSE detection code in libgcc

2020-05-28 Thread Andre Vieira (lists)

The patch applies cleanly on gcc-9 and gcc-8.
I bootstrapped this on aarch64-none-linux-gnu and tested 
aarch64-none-elf for both.


Is this OK for those backports?

libgcc/ChangeLog:
2020-05-28  Andre Vieira  

    Backport from mainline.
    2020-05-06  Kyrylo Tkachov  

    * config/aarch64/lse-init.c (init_have_lse_atomics): Use __getauxval
    instead of getauxval.
    (AT_HWCAP): Define.
    (HWCAP_ATOMICS): Define.
    Guard detection on __gnu_linux__.
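
For context, a simplified sketch of the runtime detection this backport
touches (illustrative only, not the actual libgcc code: the whole point of
the patch is that lse-init.c calls the internal __getauxval symbol rather
than the public getauxval used below, and defines AT_HWCAP and HWCAP_ATOMICS
itself, guarded on __gnu_linux__):

/* Hedged sketch of init_have_lse_atomics, assuming a glibc target where
   sys/auxv.h and getauxval are available.  */
#include <sys/auxv.h>

#ifndef HWCAP_ATOMICS
#define HWCAP_ATOMICS (1 << 8)	/* LSE bit in the AArch64 AT_HWCAP.  */
#endif

static _Bool have_lse_atomics;

static void __attribute__ ((constructor))
init_have_lse_atomics (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  have_lse_atomics = (hwcap & HWCAP_ATOMICS) != 0;
}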

On 06/05/2020 16:24, Kyrylo Tkachov wrote:



-Original Message-
From: Joseph Myers 
Sent: 06 May 2020 15:46
To: Richard Biener 
Cc: Kyrylo Tkachov ; Florian Weimer
; Szabolcs Nagy ; gcc-
patc...@gcc.gnu.org; Jakub Jelinek 
Subject: Re: [PATCH][AArch64] Use __getauxval instead of getauxval in LSE
detection code in libgcc

On Wed, 6 May 2020, Richard Biener wrote:


Here is the updated patch for the record.
Jakub, richi, is this ok for the GCC 10 branch?

I'll defer to Joseph who is release manager as well.

This version is OK with me.

Thank you Joseph,
I've committed this version to trunk and the gcc-10 branch.
Kyrill


--
Joseph S. Myers
jos...@codesourcery.com


[PATCH][GCC-8][Aarch64]: Backport Force TImode values into even registers

2020-04-29 Thread Andre Vieira (lists)

Hi,

This is a backport from trunk/gcc-9 that I think we need now that we 
have backported the casp LSE instructions.


Bootstrapped and regression tested on aarch64.

Is this OK for gcc-8?

Cheers,
Andre

The LSE CASP instruction requires values to be placed in even
register pairs.  A solution involving two additional register
classes was rejected in favor of the much simpler solution of
simply requiring all TImode values to be aligned.
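
As a point of reference (not part of the patch, and hedged: whether this is
expanded inline depends on the -march and atomics configuration), the kind
of operation that ends up as a CASP instruction is a 16-byte
compare-and-swap, and CASP requires each of its two register pairs to start
at an even-numbered register, which is why TImode values are forced into
even GP registers:

#include <stdbool.h>

/* Illustrative 16-byte compare-and-swap; with LSE this can map onto casp,
   whose register pairs must begin at even register numbers.  */
bool
cas16 (__int128 *ptr, __int128 *expected, __int128 desired)
{
  return __atomic_compare_exchange_n (ptr, expected, desired, false,
				      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}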

gcc/ChangeLog:
2020-04-29  Andre Vieira  

    Backport from mainline.
    2018-10-31  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_hard_regno_mode_ok): Force
    16-byte modes held in GP registers to use an even regno.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
5eec1aae54abe04b8320deaf8202621c8e193c01..525deba56ea363a621cccec1a923da241908dd06
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1369,10 +1369,14 @@ aarch64_hard_regno_mode_ok (unsigned regno, 
machine_mode mode)
   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
 return mode == Pmode;
 
-  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
-return true;
-
-  if (FP_REGNUM_P (regno))
+  if (GP_REGNUM_P (regno))
+{
+  if (known_le (GET_MODE_SIZE (mode), 8))
+   return true;
+  else if (known_le (GET_MODE_SIZE (mode), 16))
+   return (regno & 1) == 0;
+}
+  else if (FP_REGNUM_P (regno))
 {
   if (vec_flags & VEC_STRUCT)
return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;


[PATCH][GCC-8][Aarch64]: Fix for PR target/94814

2020-04-28 Thread Andre Vieira (lists)

Hi,

Backport of PR target/94518: Fix memmodel index in 
aarch64_store_exclusive_pair


This fixes bootstrap with --enable-checking=yes,rtl for aarch64.

OK for gcc-8?

Cheers,
Andre

gcc/ChangeLog:
2020-04-28  Andre Vieira  

    PR target/94814
    Backport from gcc-9.
    2020-04-07  Kyrylo Tkachov  

    PR target/94518
    2019-09-23  Richard Sandiford 

    * config/aarch64/atomics.md (aarch64_store_exclusive_pair): Fix
    memmodel index.

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 
1005462ae23aa13dbc3013a255aa189096e33366..0e0b03731922d8e50e8468de94e0ff345d10c32f
 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -752,7 +752,7 @@
  UNSPECV_SX))]
   ""
   {
-enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+enum memmodel model = memmodel_from_int (INTVAL (operands[4]));
 if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire 
(model))
   return "stxp\t%w0, %x2, %x3, %1";
 else


[PATCH][GCC][Arm]: Fix bootstrap failure with rtl-checking

2020-04-27 Thread Andre Vieira (lists)

Hi,

The code change that caused this regression was not meant to affect neon 
code-gen; however, I missed the REG fall-through.  This patch makes sure 
we only take the left-hand operand of the PLUS if it is indeed a PLUS expr.


I suggest that in gcc-11 this code is cleaned up, as I do not think we 
even need the overlap checks: NEON only loads from or stores to FP 
registers, and these can't be used in its addressing modes.


Bootstrapped arm-linux-gnueabihf with '--enable-checking=yes,rtl' for 
armv7-a and armv8-a.


Is this OK for trunk?

gcc/ChangeLog:
2020-04-27  Andre Vieira  

    * config/arm/arm.c (output_move_neon): Only get the first operand,
    if addr is PLUS.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
0151bda90d961ae1a001c61cd5e94d6ec67e3aea..74454dddbb948a5d37f502e8e2146a81cb83d58b
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -20145,7 +20145,8 @@ output_move_neon (rtx *operands)
}
   /* Fall through.  */
 case PLUS:
-  addr = XEXP (addr, 0);
+  if (GET_CODE (addr) == PLUS)
+   addr = XEXP (addr, 0);
   /* Fall through.  */
 case LABEL_REF:
   {


[PATCH][wwwdocs] Add -moutline-atomics for AArch64 on gcc-9 and gcc-8

2020-04-24 Thread Andre Vieira (lists)
Add the backported functionality of -moutline-atomics for AArch64 to the 
gcc-9 and gcc-8 changes.html


Validates. Is this OK?
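
For illustration only (not part of the wwwdocs change): with the option
enabled, a standard atomic operation is compiled into a call to a libgcc
helper that picks LSE or load/store-exclusive at run time, along the lines
of the sketch below (the helper name is indicative, not something the user
writes):

/* Built with something like:
     aarch64-linux-gnu-gcc -O2 -march=armv8-a -moutline-atomics atomic.c
   the fetch-add becomes a call to a helper such as __aarch64_ldadd4_acq_rel
   rather than an inline LSE or load/store-exclusive sequence.  */
int
fetch_add (int *p, int v)
{
  return __atomic_fetch_add (p, v, __ATOMIC_ACQ_REL);
}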
diff --git a/htdocs/gcc-8/changes.html b/htdocs/gcc-8/changes.html
index 
83dd1bc010a6e4debb76790b3fe62275bf0e0657..83e57db181294110f71a5d59960fb4d3fed7be98
 100644
--- a/htdocs/gcc-8/changes.html
+++ b/htdocs/gcc-8/changes.html
@@ -1394,5 +1394,22 @@ known to be fixed in the 8.4 release. This list might 
not be
 complete (that is, it is possible that some PRs that have been fixed
 are not listed here).
 
+
+GCC 8.5
+
+ Target Specific Changes
+
+AArch64
+  
+
+  The option -moutline-atomics has been added to aid
+  deployment of the Large System Extensions (LSE) on GNU/Linux systems built
+  with a baseline architecture targeting Armv8-A.  When the option is
+  specified code is emitted to detect the presence of LSE instructions at
+  runtime and use them for standard atomic operations.
+  For more information please refer to the documentation.
+
+  
+
 
 
diff --git a/htdocs/gcc-9/changes.html b/htdocs/gcc-9/changes.html
index 
74c7cde72ef5ab8ec059e20a8da3e46907ecd9a3..a2a28a9aeb851cae298e828d2c4b57c6fa414cf4
 100644
--- a/htdocs/gcc-9/changes.html
+++ b/htdocs/gcc-9/changes.html
@@ -1132,5 +1132,21 @@ complete (that is, it is possible that some PRs that 
have been fixed
 are not listed here).
 
 
+GCC 9.4
+
+ Target Specific Changes
+
+AArch64
+  
+
+  The option -moutline-atomics has been added to aid
+  deployment of the Large System Extensions (LSE) on GNU/Linux systems built
+  with a baseline architecture targeting Armv8-A.  When the option is
+  specified code is emitted to detect the presence of LSE instructions at
+  runtime and use them for standard atomic operations.
+  For more information please refer to the documentation.
+
+  
+
 
 


[committed][gcc-9] aarch64: Fix bootstrap with old binutils [PR93053]

2020-04-22 Thread Andre Vieira (lists)

Went ahead and committed the backport to gcc-9.

As reported in the PR, GCC 10 (and also 9.3.1 but not 9.3.0) fails to build
when using older binutils which lack LSE support, because those instructions
are used in libgcc.
Thanks to Kyrylo's hint, the following patches (hopefully) allow it to build
even with older binutils by using the .inst directive if LSE support isn't
available in the assembler.
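
To make the mechanism concrete, the fallback hand-encodes the LSE
instruction when the assembler cannot assemble the mnemonic; a reduced
sketch for the 32-bit relaxed CAS case (the 0x88a07c41 value below is just
the patch's 0x08a07c41 + B + M expression resolved for SIZE == 4 and the
relaxed memory model; a sketch only, not the committed lse.S code):

#ifdef HAVE_AS_LSE
	cas	w0, w1, [x2]
#else
	/* Same instruction emitted as a raw encoding, so an assembler
	   without LSE support still accepts it.  */
	.inst	0x88a07c41	/* cas w0, w1, [x2] */
#endif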

2020-04-22  Andre Vieira  

    Backport from mainline.
    2020-04-15  Jakub Jelinek  

    PR target/93053
    * configure.ac (LIBGCC_CHECK_AS_LSE): Add HAVE_AS_LSE checking.
    * config/aarch64/lse.S: Include auto-target.h, if HAVE_AS_LSE
    is not defined, use just .arch armv8-a.
    (B, M, N, OPN): Define.
    (COMMENT): New .macro.
    (CAS, CASP, SWP, LDOP): Use .inst directive if HAVE_AS_LSE is not
    defined.  Otherwise, move the operands right after the glue? and
    comment out operands where the macros are used.
    * configure: Regenerated.
    * config.in: Regenerated.

On 22/04/2020 10:59, Kyrylo Tkachov wrote:

Hi Andre,


-Original Message-----
From: Andre Vieira (lists) 
Sent: 22 April 2020 09:26
To: Kyrylo Tkachov ; gcc-patches@gcc.gnu.org
Cc: Richard Sandiford ; s...@amazon.com
Subject: Re: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics


On 20/04/2020 09:42, Kyrylo Tkachov wrote:

Hi Andre,


-Original Message-----
From: Andre Vieira (lists) 
Sent: 16 April 2020 13:24
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; Richard Sandiford
; s...@amazon.com
Subject: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics

Hi,

This series backports all the patches and fixes regarding outline
atomics to the gcc-8 branch.

Bootstrapped the series for aarch64-linux-gnu and regression tested.
Is this OK for gcc-8?

Andre Vieira (19):
aarch64: Add early clobber for aarch64_store_exclusive
aarch64: Simplify LSE cas generation
aarch64: Improve cas generation
aarch64: Improve swp generation
aarch64: Improve atomic-op lse generation
aarch64: Remove early clobber from ATOMIC_LDOP scratch
aarch64: Extend %R for integer registers
aarch64: Implement TImode compare-and-swap
aarch64: Tidy aarch64_split_compare_and_swap
aarch64: Add out-of-line functions for LSE atomics
Add visibility to libfunc constructors
aarch64: Implement -moutline-atomics
Aarch64: Fix shrinkwrapping interactions with atomics (PR92692)
aarch64: Fix store-exclusive in load-operate LSE helpers
aarch64: Configure for sys/auxv.h in libgcc for lse-init.c
aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]
aarch64: Fix bootstrap with old binutils [PR93053]

Thanks for putting these together.
Before they can go in we need to get this fix for PR93053 into GCC 9.
Can you please test it on that branch to help Jakub out?
Thanks,
Kyrill

Bootstrapped and regression tested the PR93053 fix from Jakub on gcc-9
branch and it looks good.

Thanks, can you please apply the patch to the gcc-9 branch then? (making sure 
the PR markers are there in the commit message so that Bugzilla is updated).
We can then proceed with the GCC 8 backports.

Kyrill


aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]
re PR target/90724 (ICE with __sync_bool_compare_and_swap with
-march=armv8.2-a+sve)
diff --git a/libgcc/config.in b/libgcc/config.in
index 
59a3d8daf52e72e548d3d9425d6043d5e0c663ad..5be5321d2584392bac1ec3af779cd96823212902
 100644
--- a/libgcc/config.in
+++ b/libgcc/config.in
@@ -10,6 +10,9 @@
*/
 #undef HAVE_AS_CFI_SECTIONS
 
+/* Define to 1 if the assembler supports LSE. */
+#undef HAVE_AS_LSE
+
 /* Define to 1 if the target assembler supports thread-local storage. */
 #undef HAVE_CC_TLS
 
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
c7979382ad7770b61bb1c64d32ba2395963a9d7a..f7f1c19587beaec2ccb6371378d54d50139ba1c9
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -48,8 +48,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
  * separately to minimize code size.
  */
 
+#include "auto-target.h"
+
 /* Tell the assembler to accept LSE instructions.  */
+#ifdef HAVE_AS_LSE
.arch armv8-a+lse
+#else
+   .arch armv8-a
+#endif
 
 /* Declare the symbol gating the LSE implementations.  */
.hidden __aarch64_have_lse_atomics
@@ -58,12 +64,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #if SIZE == 1
 # define S b
 # define UXT   uxtb
+# define B 0x
 #elif SIZE == 2
 # define S h
 # define UXT   uxth
+# define B 0x4000
 #elif SIZE == 4 || SIZE == 8 || SIZE == 16
 # define S
 # define UXT   mov
+# if SIZE == 4
+#  define B0x8000
+# elif SIZE == 8
+#  define B0xc000
+# endif
 #else
 # error
 #endif
@@ -72,18 +85,26 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 # define SUFF  _relax
 # define A
 # define L
+# define M 0x00
+# define N 0x00
 #elif MODEL == 2
 # define SUFF  _acq
 # define

Re: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics

2020-04-22 Thread Andre Vieira (lists)



On 20/04/2020 09:42, Kyrylo Tkachov wrote:

Hi Andre,


-Original Message-
From: Andre Vieira (lists) 
Sent: 16 April 2020 13:24
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; Richard Sandiford
; s...@amazon.com
Subject: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics

Hi,

This series backports all the patches and fixes regarding outline
atomics to the gcc-8 branch.

Bootstrapped the series for aarch64-linux-gnu and regression tested.
Is this OK for gcc-8?

Andre Vieira (19):
aarch64: Add early clobber for aarch64_store_exclusive
aarch64: Simplify LSE cas generation
aarch64: Improve cas generation
aarch64: Improve swp generation
aarch64: Improve atomic-op lse generation
aarch64: Remove early clobber from ATOMIC_LDOP scratch
aarch64: Extend %R for integer registers
aarch64: Implement TImode compare-and-swap
aarch64: Tidy aarch64_split_compare_and_swap
aarch64: Add out-of-line functions for LSE atomics
Add visibility to libfunc constructors
aarch64: Implement -moutline-atomics
Aarch64: Fix shrinkwrapping interactions with atomics (PR92692)
aarch64: Fix store-exclusive in load-operate LSE helpers
aarch64: Configure for sys/auxv.h in libgcc for lse-init.c
aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]
aarch64: Fix bootstrap with old binutils [PR93053]

Thanks for putting these together.
Before they can go in we need to get this fix for PR93053 into GCC 9.
Can you please test it on that branch to help Jakub out?
Thanks,
Kyrill
Bootstrapped and regression tested the PR93053 fix from Jakub on gcc-9 
branch and it looks good.

aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]
re PR target/90724 (ICE with __sync_bool_compare_and_swap with
-march=armv8.2-a+sve)




Re: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics

2020-04-16 Thread Andre Vieira (lists)

On 16/04/2020 13:24, Andre Vieira (lists) wrote:

Hi,

This series backports all the patches and fixes regarding outline 
atomics to the gcc-8 branch.


Bootstrapped the series for aarch64-linux-gnu and regression tested.
Is this OK for gcc-8?

Andre Vieira (19):
aarch64: Add early clobber for aarch64_store_exclusive
aarch64: Simplify LSE cas generation
aarch64: Improve cas generation
aarch64: Improve swp generation
aarch64: Improve atomic-op lse generation
aarch64: Remove early clobber from ATOMIC_LDOP scratch
aarch64: Extend %R for integer registers
aarch64: Implement TImode compare-and-swap
aarch64: Tidy aarch64_split_compare_and_swap
aarch64: Add out-of-line functions for LSE atomics
Add visibility to libfunc constructors
aarch64: Implement -moutline-atomics
Aarch64: Fix shrinkwrapping interactions with atomics (PR92692)
aarch64: Fix store-exclusive in load-operate LSE helpers
aarch64: Configure for sys/auxv.h in libgcc for lse-init.c
aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]
aarch64: Fix bootstrap with old binutils [PR93053]
aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]
re PR target/90724 (ICE with __sync_bool_compare_and_swap with 
-march=armv8.2-a+sve)


Hmm, something went wrong when sending these: I had tried to make the 
N/19 patches reply to this one but failed, and I was also pretty sure I 
had CC'ed Kyrill and Richard S.


Adding them now.



[PATCH 15/19][GCC-8] aarch64: Configure for sys/auxv.h in libgcc for lse-init.c

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline
    2019-09-25  Richard Henderson 

    PR target/91833
    * config/aarch64/lse-init.c: Include auto-target.h.  Disable
    initialization if !HAVE_SYS_AUXV_H.
    * configure.ac (AC_CHECK_HEADERS): Add sys/auxv.h.
    * config.in, configure: Rebuild.

diff --git a/libgcc/config.in b/libgcc/config.in
index 
d634af9d949741e26f5acc2606d40062d491dd8b..59a3d8daf52e72e548d3d9425d6043d5e0c663ad
 100644
--- a/libgcc/config.in
+++ b/libgcc/config.in
@@ -43,6 +43,9 @@
 /* Define to 1 if you have the  header file. */
 #undef HAVE_STRING_H
 
+/* Define to 1 if you have the  header file. */
+#undef HAVE_SYS_AUXV_H
+
 /* Define to 1 if you have the  header file. */
 #undef HAVE_SYS_STAT_H
 
@@ -82,6 +85,11 @@
 /* Define to 1 if the target use emutls for thread-local storage. */
 #undef USE_EMUTLS
 
+/* Enable large inode numbers on Mac OS X 10.5.  */
+#ifndef _DARWIN_USE_64_BIT_INODE
+# define _DARWIN_USE_64_BIT_INODE 1
+#endif
+
 /* Number of bits in a file offset, on hosts where this is settable. */
 #undef _FILE_OFFSET_BITS
 
diff --git a/libgcc/config/aarch64/lse-init.c b/libgcc/config/aarch64/lse-init.c
index 
33d2914747994a1e07dcae906f0352e64045ab02..1a8f4c55213f25c67c8bb8cdc1cc6f1bbe3255cb
 100644
--- a/libgcc/config/aarch64/lse-init.c
+++ b/libgcc/config/aarch64/lse-init.c
@@ -23,12 +23,14 @@ a copy of the GCC Runtime Library Exception along with this 
program;
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */
 
+#include "auto-target.h"
+
 /* Define the symbol gating the LSE implementations.  */
 _Bool __aarch64_have_lse_atomics
   __attribute__((visibility("hidden"), nocommon));
 
 /* Disable initialization of __aarch64_have_lse_atomics during bootstrap.  */
-#ifndef inhibit_libc
+#if !defined(inhibit_libc) && defined(HAVE_SYS_AUXV_H)
 # include 
 
 /* Disable initialization if the system headers are too old.  */
diff --git a/libgcc/configure b/libgcc/configure
old mode 100644
new mode 100755
index 
b2f3f8708441e473b8e2941c4748748b6c7c40b8..7962cd9b87e1eb67037180e110f7d0de145bb2e1
--- a/libgcc/configure
+++ b/libgcc/configure
@@ -641,6 +641,7 @@ infodir
 docdir
 oldincludedir
 includedir
+runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -729,6 +730,7 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -980,6 +982,15 @@ do
   | -silent | --silent | --silen | --sile | --sil)
 silent=yes ;;
 
+  -runstatedir | --runstatedir | --runstatedi | --runstated \
+  | --runstate | --runstat | --runsta | --runst | --runs \
+  | --run | --ru | --r)
+ac_prev=runstatedir ;;
+  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+  | --run=* | --ru=* | --r=*)
+runstatedir=$ac_optarg ;;
+
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
 ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1117,7 +1128,7 @@ fi
 for ac_var in  exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-   libdir localedir mandir
+   libdir localedir mandir runstatedir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1272,6 +1283,7 @@ Fine tuning of the installation directories:
   --sysconfdir=DIRread-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIRmodifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR modifiable single-machine data [PREFIX/var]
+  --runstatedir=DIR   modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIRobject code libraries [EPREFIX/lib]
   --includedir=DIRC header files [PREFIX/include]
   --oldincludedir=DIR C header files for non-gcc [/usr/include]
@@ -4091,7 +4103,7 @@ else
 We can't simply define LARGE_OFF_T to be 9223372036854775807,
 since some C++ compilers masquerading as C compilers
 incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
   && LARGE_OFF_T % 2147483647 == 1)
  ? 1 : -1];
@@ -4137,7 +4149,7 @@ else
 We can't simply define LARGE_OFF_T to be 9223372036854775807,
 since some C++ compilers masquerading as C compilers
 incorr

[PATCH 19/19][GCC-8] re PR target/90724 (ICE with __sync_bool_compare_and_swap with -march=armv8.2-a+sve)

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline
    2019-08-21  Prathamesh Kulkarni 

    PR target/90724
    * config/aarch64/aarch64.c (aarch64_gen_compare_reg_maybe_ze): Force y
    in reg if it fails aarch64_plus_operand predicate.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
6bac63402e508027e77a9f4557cb10c578ea7c2c..0da927be15c339295ef940d6e05a37e95135aa5a
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1574,6 +1574,9 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, 
rtx y,
}
 }
 
+  if (!aarch64_plus_operand (y, y_mode))
+y = force_reg (y_mode, y);
+
   return aarch64_gen_compare_reg (code, x, y);
 }
 


[PATCH 16/19][GCC-8] aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline
    2020-03-31  Jakub Jelinek 

    PR target/94368
    * config/aarch64/constraints.md (Uph): New constraint.
    * config/aarch64/atomics.md (cas_short_expected_imm): New mode attr.
    (aarch64_compare_and_swap): Use it instead of n in operand 2's
    constraint.

    * gcc.dg/pr94368.c: New test.

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 
0ee8d2efac05877d610981b719bd02afdf93a832..1005462ae23aa13dbc3013a255aa189096e33366
 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -38,6 +38,8 @@
 
 (define_mode_attr cas_short_expected_pred
   [(QI "aarch64_reg_or_imm") (HI "aarch64_plushi_operand")])
+(define_mode_attr cas_short_expected_imm
+  [(QI "n") (HI "Uph")])
 
 (define_insn_and_split "aarch64_compare_and_swap"
   [(set (reg:CC CC_REGNUM) ;; bool out
@@ -47,7 +49,8 @@
   (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
(set (match_dup 1)
 (unspec_volatile:SHORT
-  [(match_operand:SHORT 2 "" "rn");; 
expected
+  [(match_operand:SHORT 2 ""
+ "r")  ;; expected
(match_operand:SHORT 3 "aarch64_reg_or_zero" "rZ")  ;; desired
(match_operand:SI 4 "const_int_operand");; 
is_weak
(match_operand:SI 5 "const_int_operand");; mod_s
diff --git a/gcc/config/aarch64/constraints.md 
b/gcc/config/aarch64/constraints.md
index 
32a0fa60a198c714f7c0b8b987da6bc26992845d..03626d2faf87e0b038bf3b8602d4feb8ef7d077c
 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -213,6 +213,13 @@
   (and (match_code "const_int")
(match_test "(unsigned) exact_log2 (ival) <= 4")))
 
+(define_constraint "Uph"
+  "@internal
+  A constraint that matches HImode integers zero extendable to
+  SImode plus_operand."
+  (and (match_code "const_int")
+   (match_test "aarch64_plushi_immediate (op, VOIDmode)")))
+
 (define_memory_constraint "Q"
  "A memory address which uses a single base register with no offset."
  (and (match_code "mem")
diff --git a/gcc/testsuite/gcc.dg/pr94368.c b/gcc/testsuite/gcc.dg/pr94368.c
new file mode 100644
index 
..1267b8220983ef1477a8339bdcc6369abaeca592
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr94368.c
@@ -0,0 +1,25 @@
+/* PR target/94368 */
+/* { dg-do compile { target fpic } } */
+/* { dg-options "-fpic -O1 -fcommon" } */
+
+int b, c, d, e, f, h;
+short g;
+int foo (int) __attribute__ ((__const__));
+
+void
+bar (void)
+{
+  while (1)
+{
+  while (1)
+   {
+ __atomic_load_n (&e, 0);
+ if (foo (2))
+   __sync_val_compare_and_swap (&c, 0, f);
+ b = 1;
+ if (h == e)
+   break;
+   }
+  __sync_val_compare_and_swap (&g, -1, f);
+}
+}


[PATCH 18/19][GCC-8] aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]

2020-04-16 Thread Andre Vieira (lists)

The following testcase ICEs because aarch64_gen_compare_reg_maybe_ze emits
invalid RTL.
For y_mode [QH]Imode it expects y to be of that mode (or a CONST_INT that fits
into that mode) and x to be SImode; for non-CONST_INT y it zero-extends y
into SImode and compares that against x, and for CONST_INT y it zero-extends y
into SImode.  The problem is that when the zero-extended constant isn't
usable directly, it is forced into a REG, but in y_mode, and then
compared against y.  That is wrong, because it should be forced into an SImode
REG and compared that way.

2020-04-16  Andre Vieira 

    Backport from mainline
    2020-04-02  Jakub Jelinek 

    PR target/94435
    * config/aarch64/aarch64.c (aarch64_gen_compare_reg_maybe_ze): For
    y_mode E_[QH]Imode and y being a CONST_INT, change y_mode to SImode.

    * gcc.target/aarch64/pr94435.c: New test.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
21124b5a3479dd388eb767402e080e2181153467..6bac63402e508027e77a9f4557cb10c578ea7c2c
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1556,7 +1556,10 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, 
rtx y,
   if (y_mode == E_QImode || y_mode == E_HImode)
 {
   if (CONST_INT_P (y))
-   y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+   {
+ y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+ y_mode = SImode;
+   }
   else
{
  rtx t, cc_reg;
diff --git a/gcc/testsuite/gcc.target/aarch64/pr94435.c 
b/gcc/testsuite/gcc.target/aarch64/pr94435.c
new file mode 100644
index 
..5713c14d5f90b1d42f92d040e9030ecc03c97d51
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr94435.c
@@ -0,0 +1,25 @@
+/* PR target/94435 */
+/* { dg-do compile } */
+/* { dg-options "-march=armv8-a+nolse -moutline-atomics" } */
+
+int b, c, d, e, f, h;
+short g;
+int foo (int) __attribute__ ((__const__));
+
+void
+bar (void)
+{
+  while (1)
+{
+  while (1)
+   {
+ __atomic_load_n (&e, 0);
+ if (foo (2))
+   __sync_val_compare_and_swap (&c, 0, f);
+ b = 1;
+ if (h == e)
+   break;
+   }
+  __sync_val_compare_and_swap (&g, -1, f);
+}
+}


[PATCH 17/19][GCC-8] aarch64: Fix bootstrap with old binutils [PR93053]

2020-04-16 Thread Andre Vieira (lists)

As reported in the PR, GCC 10 (and also 9.3.1 but not 9.3.0) fails to build
when using older binutils which lack LSE support, because those instructions
are used in libgcc.
Thanks to Kyrylo's hint, the following patches (hopefully) allow it to build
even with older binutils by using the .inst directive when LSE support isn't
available in the assembler.
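For reference, the assembler capability the new check probes for boils down
to something like the snippet below (illustrative only; the real test is the
LIBGCC_CHECK_AS_LSE logic in libgcc's configure machinery, and the exact
operands here are made up):

/* If the assembler rejects the LSE mnemonic below, HAVE_AS_LSE stays
   undefined and lse.S falls back to hand-encoded .inst words.  */
int
main (void)
{
  int mem = 0, expected = 0;
  __asm__ volatile (".arch armv8-a+lse\n\t"
                    "casal %w0, %w2, %1"
                    : "+r" (expected), "+Q" (mem)
                    : "r" (1)
                    : "memory");
  return expected;
}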

2020-04-16 Andre Vieira 

    Backport from mainline
    2020-04-15  Jakub Jelinek 

    PR target/93053
    * configure.ac (LIBGCC_CHECK_AS_LSE): Add HAVE_AS_LSE checking.
    * config/aarch64/lse.S: Include auto-target.h, if HAVE_AS_LSE
    is not defined, use just .arch armv8-a.
    (B, M, N, OPN): Define.
    (COMMENT): New .macro.
    (CAS, CASP, SWP, LDOP): Use .inst directive if HAVE_AS_LSE is not
    defined.  Otherwise, move the operands right after the glue? and
    comment out operands where the macros are used.
    * configure: Regenerated.
    * config.in: Regenerated.

diff --git a/libgcc/config.in b/libgcc/config.in
index 
59a3d8daf52e72e548d3d9425d6043d5e0c663ad..5be5321d2584392bac1ec3af779cd96823212902
 100644
--- a/libgcc/config.in
+++ b/libgcc/config.in
@@ -10,6 +10,9 @@
*/
 #undef HAVE_AS_CFI_SECTIONS
 
+/* Define to 1 if the assembler supports LSE. */
+#undef HAVE_AS_LSE
+
 /* Define to 1 if the target assembler supports thread-local storage. */
 #undef HAVE_CC_TLS
 
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
c7979382ad7770b61bb1c64d32ba2395963a9d7a..f7f1c19587beaec2ccb6371378d54d50139ba1c9
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -48,8 +48,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
  * separately to minimize code size.
  */
 
+#include "auto-target.h"
+
 /* Tell the assembler to accept LSE instructions.  */
+#ifdef HAVE_AS_LSE
.arch armv8-a+lse
+#else
+   .arch armv8-a
+#endif
 
 /* Declare the symbol gating the LSE implementations.  */
.hidden __aarch64_have_lse_atomics
@@ -58,12 +64,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #if SIZE == 1
 # define S b
 # define UXT   uxtb
+# define B 0x
 #elif SIZE == 2
 # define S h
 # define UXT   uxth
+# define B 0x4000
 #elif SIZE == 4 || SIZE == 8 || SIZE == 16
 # define S
 # define UXT   mov
+# if SIZE == 4
+#  define B0x8000
+# elif SIZE == 8
+#  define B0xc000
+# endif
 #else
 # error
 #endif
@@ -72,18 +85,26 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 # define SUFF  _relax
 # define A
 # define L
+# define M 0x00
+# define N 0x00
 #elif MODEL == 2
 # define SUFF  _acq
 # define A a
 # define L
+# define M 0x40
+# define N 0x80
 #elif MODEL == 3
 # define SUFF  _rel
 # define A
 # define L l
+# define M 0x008000
+# define N 0x40
 #elif MODEL == 4
 # define SUFF  _acq_rel
 # define A a
 # define L l
+# define M 0x408000
+# define N 0xc0
 #else
 # error
 #endif
@@ -144,9 +165,13 @@ STARTFNNAME(cas)
JUMP_IF_NOT_LSE 8f
 
 #if SIZE < 16
-#define CAS    glue4(cas, A, L, S)
+#ifdef HAVE_AS_LSE
+# define CAS   glue4(cas, A, L, S) s(0), s(1), [x2]
+#else
+# define CAS   .inst 0x08a07c41 + B + M
+#endif
 
-   CAS s(0), s(1), [x2]
+   CAS /* s(0), s(1), [x2] */
ret
 
 8: UXT s(tmp0), s(0)
@@ -160,9 +185,13 @@ STARTFNNAME(cas)
 #else
 #define LDXP   glue3(ld, A, xp)
 #define STXP   glue3(st, L, xp)
-#define CASP   glue3(casp, A, L)
+#ifdef HAVE_AS_LSE
+# define CASP  glue3(casp, A, L)   x0, x1, x2, x3, [x4]
+#else
+# define CASP  .inst 0x48207c82 + M
+#endif
 
-   CASP    x0, x1, x2, x3, [x4]
+   CASP    /* x0, x1, x2, x3, [x4] */
ret
 
 8: mov x(tmp0), x0
@@ -181,12 +210,16 @@ ENDFN NAME(cas)
 #endif
 
 #ifdef L_swp
-#define SWP    glue4(swp, A, L, S)
+#ifdef HAVE_AS_LSE
+# define SWP   glue4(swp, A, L, S) s(0), s(0), [x1]
+#else
+# define SWP   .inst 0x38208020 + B + N
+#endif
 
 STARTFNNAME(swp)
JUMP_IF_NOT_LSE 8f
 
-   SWP s(0), s(0), [x1]
+   SWP /* s(0), s(0), [x1] */
ret
 
 8: mov s(tmp0), s(0)
@@ -204,24 +237,32 @@ ENDFN NAME(swp)
 #ifdef L_ldadd
 #define LDNM   ldadd
 #define OP add
+#define OPN0x
 #elif defined(L_ldclr)
 #define LDNM   ldclr
 #define OP bic
+#define OPN    0x1000
 #elif defined(L_ldeor)
 #define LDNM   ldeor
 #define OP eor
+#define OPN    0x2000
 #elif defined(L_ldset)
 #define LDNM   ldset
 #define OP orr
+#define OPN    0x3000
 #else
 #error
 #endif
-#define LDOP   glue4(LDNM, A, L, S)
+#ifdef HAVE_AS_LSE
+# define LDOP  glue4(LDNM, A, L, S)s(0), s(0), [x1]
+#else
+# define LDOP  .inst 0x38200020 + OPN + B + N
+#endif
 
 STARTFNNAME(LDNM)
JUMP_IF_NOT_LSE 8f
 
-   LDOPs(0), 

[PATCH 13/19][GCC-8] Aarch64: Fix shrinkwrapping interactions with atomics

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline
    2020-01-17  Wilco Dijkstra 

    PR target/92692
    * config/aarch64/atomics.md (aarch64_compare_and_swap<mode>):
    Use epilogue_completed rather than reload_completed.

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 
28a1dbc4231009333c2e766d9d3aead54a491631..0ee8d2efac05877d610981b719bd02afdf93a832
 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -104,7 +104,7 @@
(clobber (match_scratch:SI 7 "=&r"))]
   ""
   "#"
-  "&& reload_completed"
+  "&& epilogue_completed"
   [(const_int 0)]
   {
 aarch64_split_compare_and_swap (operands);


[PATCH 8/19][GCC-8] aarch64: Implement TImode compare-and-swap

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline.
    2019-09-19  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_gen_compare_reg): Add support
    for NE comparison of TImode values.
    (aarch64_emit_load_exclusive): Add support for TImode.
    (aarch64_emit_store_exclusive): Likewise.
    (aarch64_split_compare_and_swap): Disable strong_zero_p for TImode.
    * config/aarch64/atomics.md (atomic_compare_and_swapti):
    Change iterator from ALLI to ALLI_TI.
    (atomic_compare_and_swapti): New.
    (atomic_compare_and_swapti: New.
    (aarch64_load_exclusive_pair): New.
    (aarch64_store_exclusive_pair): New.
    * config/aarch64/iterators.md (ALLI_TI): New iterator.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
317571e018c4f96046799675e042cdfaabb5b94a..09e78313489d266daaca9eba3647f150534893f6
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1517,10 +1517,33 @@ emit_set_insn (rtx x, rtx y)
 rtx
 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 {
-  machine_mode mode = SELECT_CC_MODE (code, x, y);
-  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+  machine_mode cmp_mode = GET_MODE (x);
+  machine_mode cc_mode;
+  rtx cc_reg;
 
-  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
+  if (cmp_mode == TImode)
+{
+  gcc_assert (code == NE);
+
+  cc_mode = CCmode;
+  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+
+  rtx x_lo = operand_subword (x, 0, 0, TImode);
+  rtx y_lo = operand_subword (y, 0, 0, TImode);
+  emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
+
+  rtx x_hi = operand_subword (x, 1, 0, TImode);
+  rtx y_hi = operand_subword (y, 1, 0, TImode);
+  emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
+gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
+GEN_INT (AARCH64_EQ)));
+}
+  else
+{
+  cc_mode = SELECT_CC_MODE (code, x, y);
+  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+  emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
+}
   return cc_reg;
 }
 
@@ -14145,40 +14168,54 @@ static void
 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
 rtx mem, rtx model_rtx)
 {
-  rtx (*gen) (rtx, rtx, rtx);
-
-  switch (mode)
+  if (mode == TImode)
+emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
+   gen_highpart (DImode, rval),
+   mem, model_rtx));
+  else
 {
-case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
-case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
-case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
-case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
-default:
-  gcc_unreachable ();
-}
+  rtx (*gen) (rtx, rtx, rtx);
+
+  switch (mode)
+   {
+   case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
+   case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
+   case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
+   case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
+   default:
+ gcc_unreachable ();
+   }
 
-  emit_insn (gen (rval, mem, model_rtx));
+  emit_insn (gen (rval, mem, model_rtx));
+}
 }
 
 /* Emit store exclusive.  */
 
 static void
 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
- rtx rval, rtx mem, rtx model_rtx)
+ rtx mem, rtx rval, rtx model_rtx)
 {
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-
-  switch (mode)
+  if (mode == TImode)
+emit_insn (gen_aarch64_store_exclusive_pair
+  (bval, mem, operand_subword (rval, 0, 0, TImode),
+   operand_subword (rval, 1, 0, TImode), model_rtx));
+  else
 {
-case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
-case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
-case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
-case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
-default:
-  gcc_unreachable ();
-}
+  rtx (*gen) (rtx, rtx, rtx, rtx);
+
+  switch (mode)
+   {
+   case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
+   case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
+   case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
+   case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
+   default:
+ gcc_unreachable ();
+   }
 
-  emit_insn (gen (bval, rval, mem, model_rtx));
+  emit_insn (gen (bval, mem, rval, model_rtx));
+}
 }
 
 /* Mark the previous jump instruction as unlikely.  */
@@ -14197,16 +14234,6 @@ aarch64_expand_compare_and_swap (rtx operands[])
 {
   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
   machine_mode mode, r_mode;
-  typedef rtx (*gen_atomic_cas_fn) (rtx, rtx, rtx, rtx

[PATCH 7/19][GCC-8] aarch64: Extend %R for integer registers

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline.
    2019-09-19  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_print_operand): Allow integer
    registers with %R.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
1068cfd899a759c506e3217e1e2c19cd778b4372..317571e018c4f96046799675e042cdfaabb5b94a
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6627,7 +6627,7 @@ sizetochar (int size)
  'S/T/U/V':Print a FP/SIMD register name for a register 
list.
The register printed is the FP/SIMD register name
of X + 0/1/2/3 for S/T/U/V.
- 'R':  Print a scalar FP/SIMD register name + 1.
+ 'R':  Print a scalar Integer/FP/SIMD register name + 1.
  'X':  Print bottom 16 bits of integer constant in hex.
  'w/x':Print a general register name or the zero register
(32-bit or 64-bit).
@@ -6813,12 +6813,13 @@ aarch64_print_operand (FILE *f, rtx x, int code)
   break;
 
 case 'R':
-  if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
-   {
- output_operand_lossage ("incompatible floating point / vector 
register operand for '%%%c'", code);
- return;
-   }
-  asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+  if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
+   asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+  else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
+   asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
+  else
+   output_operand_lossage ("incompatible register operand for '%%%c'",
+   code);
   break;
 
 case 'X':


[PATCH 14/19][GCC-8] aarch64: Fix store-exclusive in load-operate LSE helpers

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline
    2019-09-25  Richard Henderson 

    PR target/91834
    * config/aarch64/lse.S (LDNM): Ensure STXR output does not
    overlap the inputs.

diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
a5f6673596c73c497156a6f128799cc43b400504..c7979382ad7770b61bb1c64d32ba2395963a9d7a
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -227,8 +227,8 @@ STARTFN NAME(LDNM)
 8: mov s(tmp0), s(0)
 0: LDXR    s(0), [x1]
    OP      s(tmp1), s(0), s(tmp0)
-   STXR    w(tmp1), s(tmp1), [x1]
-   cbnz    w(tmp1), 0b
+   STXR    w(tmp2), s(tmp1), [x1]
+   cbnz    w(tmp2), 0b
ret
 
 ENDFN  NAME(LDNM)


[PATCH 10/19][GCC-8] aarch64: Add out-of-line functions for LSE atomics

2020-04-16 Thread Andre Vieira (lists)

This is the libgcc part of the interface -- providing the functions.
Rationale is provided at the top of libgcc/config/aarch64/lse.S.
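To give a feel for what each helper does: the entry points follow the
__aarch64_<op><size>_<model> naming scheme (for example __aarch64_ldadd4_acq),
and each one is a run-time choice between the LSE instruction and an LL/SC
loop, keyed off __aarch64_have_lse_atomics.  A rough C model of one helper,
purely as a sketch (the real implementation is hand-written assembly in
lse.S, and 'have_lse' below stands in for the flag set up in lse-init.c):

static _Bool have_lse;

/* Approximate behaviour of __aarch64_ldadd4_relax: return the old value
   and atomically add 'val' to *ptr.  */
unsigned int
ldadd4_relax_model (unsigned int val, unsigned int *ptr)
{
  if (have_lse)
    /* The real helper executes a single LDADD instruction here.  */
    return __atomic_fetch_add (ptr, val, __ATOMIC_RELAXED);

  /* Otherwise an LDXR/STXR retry loop, modelled here with the generic
     compare-and-swap builtin.  */
  unsigned int old = __atomic_load_n (ptr, __ATOMIC_RELAXED);
  while (!__atomic_compare_exchange_n (ptr, &old, old + val, /*weak=*/1,
                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED))
    ;
  return old;
}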

2020-04-16  Andre Vieira 

    Backport from mainline
    2019-09-19  Richard Henderson 

    * config/aarch64/lse-init.c: New file.
    * config/aarch64/lse.S: New file.
    * config/aarch64/t-lse: New file.
    * config.host: Add t-lse to all aarch64 tuples.

diff --git a/libgcc/config.host b/libgcc/config.host
index 
b12c86267dac9da8da9e1ab4123d5171c3e07f40..e436ade1a68c6cd918d2f370b14d61682cb9fd59
 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -337,23 +337,27 @@ aarch64*-*-elf | aarch64*-*-rtems*)
extra_parts="$extra_parts crtbegin.o crtend.o crti.o crtn.o"
extra_parts="$extra_parts crtfastmath.o"
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
md_unwind_header=aarch64/aarch64-unwind.h
;;
 aarch64*-*-freebsd*)
extra_parts="$extra_parts crtfastmath.o"
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
md_unwind_header=aarch64/freebsd-unwind.h
;;
 aarch64*-*-fuchsia*)
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp"
;;
 aarch64*-*-linux*)
extra_parts="$extra_parts crtfastmath.o"
md_unwind_header=aarch64/linux-unwind.h
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
;;
 alpha*-*-linux*)
diff --git a/libgcc/config/aarch64/lse-init.c b/libgcc/config/aarch64/lse-init.c
new file mode 100644
index 
..33d2914747994a1e07dcae906f0352e64045ab02
--- /dev/null
+++ b/libgcc/config/aarch64/lse-init.c
@@ -0,0 +1,45 @@
+/* Out-of-line LSE atomics for AArch64 architecture, Init.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Linaro Ltd.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Define the symbol gating the LSE implementations.  */
+_Bool __aarch64_have_lse_atomics
+  __attribute__((visibility("hidden"), nocommon));
+
+/* Disable initialization of __aarch64_have_lse_atomics during bootstrap.  */
+#ifndef inhibit_libc
+# include 
+
+/* Disable initialization if the system headers are too old.  */
+# if defined(AT_HWCAP) && defined(HWCAP_ATOMICS)
+
+static void __attribute__((constructor))
+init_have_lse_atomics (void)
+{
+  unsigned long hwcap = getauxval (AT_HWCAP);
+  __aarch64_have_lse_atomics = (hwcap & HWCAP_ATOMICS) != 0;
+}
+
+# endif /* HWCAP */
+#endif /* inhibit_libc */
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
new file mode 100644
index 
..a5f6673596c73c497156a6f128799cc43b400504
--- /dev/null
+++ b/libgcc/config/aarch64/lse.S
@@ -0,0 +1,235 @@
+/* Out-of-line LSE atomics for AArch64 architecture.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Linaro Ltd.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted addi

[PATCH 9/19][GCC-8] aarch64: Tidy aarch64_split_compare_and_swap

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline.
    2019-09-19  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_split_compare_and_swap): Unify
    some code paths; use aarch64_gen_compare_reg instead of open-coding.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
09e78313489d266daaca9eba3647f150534893f6..2df5bf3db97d9362155c3c8d9c9d7f14c41b9520
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14359,13 +14359,11 @@ aarch64_split_compare_and_swap (rtx operands[])
   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
   gcc_assert (epilogue_completed);
 
-  rtx rval, mem, oldval, newval, scratch;
+  rtx rval, mem, oldval, newval, scratch, x, model_rtx;
   machine_mode mode;
   bool is_weak;
   rtx_code_label *label1, *label2;
-  rtx x, cond;
   enum memmodel model;
-  rtx model_rtx;
 
   rval = operands[0];
   mem = operands[1];
@@ -14386,7 +14384,7 @@ aarch64_split_compare_and_swap (rtx operands[])
CBNZscratch, .label1
 .label2:
CMP rval, 0.  */
-  bool strong_zero_p = !is_weak && oldval == const0_rtx && mode != TImode;
+  bool strong_zero_p = (!is_weak && oldval == const0_rtx && mode != TImode);
 
   label1 = NULL;
   if (!is_weak)
@@ -14399,26 +14397,20 @@ aarch64_split_compare_and_swap (rtx operands[])
   /* The initial load can be relaxed for a __sync operation since a final
  barrier will be emitted to stop code hoisting.  */
   if (is_mm_sync (model))
-aarch64_emit_load_exclusive (mode, rval, mem,
-GEN_INT (MEMMODEL_RELAXED));
+aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
   else
 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
 
   if (strong_zero_p)
-{
-  x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
-  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-   gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
-  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
-}
+x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
   else
 {
-  cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
-  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
-  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-   gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
-  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+  rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
+  x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
 }
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+   gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
 
   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
 
@@ -14430,22 +14422,16 @@ aarch64_split_compare_and_swap (rtx operands[])
   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
 }
   else
-{
-  cond = gen_rtx_REG (CCmode, CC_REGNUM);
-  x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
-  emit_insn (gen_rtx_SET (cond, x));
-}
+aarch64_gen_compare_reg (NE, scratch, const0_rtx);
 
   emit_label (label2);
+
   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
  to set the condition flags.  If this is not used it will be removed by
  later passes.  */
   if (strong_zero_p)
-{
-  cond = gen_rtx_REG (CCmode, CC_REGNUM);
-  x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
-  emit_insn (gen_rtx_SET (cond, x));
-}
+aarch64_gen_compare_reg (NE, rval, const0_rtx);
+
   /* Emit any final barrier needed for a __sync operation.  */
   if (is_mm_sync (model))
 aarch64_emit_post_barrier (model);


[PATCH 12/19][GCC-8] aarch64: Implement -moutline-atomics

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline
    2019-09-19  Richard Henderson 

    * config/aarch64/aarch64.opt (-moutline-atomics): New.
    * config/aarch64/aarch64.c (aarch64_atomic_ool_func): New.
    (aarch64_ool_cas_names, aarch64_ool_swp_names): New.
    (aarch64_ool_ldadd_names, aarch64_ool_ldset_names): New.
    (aarch64_ool_ldclr_names, aarch64_ool_ldeor_names): New.
    (aarch64_expand_compare_and_swap): Honor TARGET_OUTLINE_ATOMICS.
    * config/aarch64/atomics.md (atomic_exchange): Likewise.
    (atomic_): Likewise.
    (atomic_fetch_): Likewise.
    (atomic__fetch): Likewise.
    * doc/invoke.texi: Document -moutline-atomics.

    * gcc.target/aarch64/atomic-op-acq_rel.c: Use -mno-outline-atomics.
    * gcc.target/aarch64/atomic-comp-swap-release-acquire.c: Likewise.
    * gcc.target/aarch64/atomic-op-acquire.c: Likewise.
    * gcc.target/aarch64/atomic-op-char.c: Likewise.
    * gcc.target/aarch64/atomic-op-consume.c: Likewise.
    * gcc.target/aarch64/atomic-op-imm.c: Likewise.
    * gcc.target/aarch64/atomic-op-int.c: Likewise.
    * gcc.target/aarch64/atomic-op-long.c: Likewise.
    * gcc.target/aarch64/atomic-op-relaxed.c: Likewise.
    * gcc.target/aarch64/atomic-op-release.c: Likewise.
    * gcc.target/aarch64/atomic-op-seq_cst.c: Likewise.
    * gcc.target/aarch64/atomic-op-short.c: Likewise.
    * gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c: Likewise.
    * gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c: Likewise.
    * gcc.target/aarch64/sync-comp-swap.c: Likewise.
    * gcc.target/aarch64/sync-op-acquire.c: Likewise.
    * gcc.target/aarch64/sync-op-full.c: Likewise.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
da68ce0e7d096bf4a512c2b8ef52bf236f8f76f4..0f1dc75a27f3fdd2218e57811e208fc28139ac4a
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -548,4 +548,17 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
 poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
+struct atomic_ool_names
+{
+const char *str[5][4];
+};
+
+rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
+   const atomic_ool_names *names);
+extern const atomic_ool_names aarch64_ool_swp_names;
+extern const atomic_ool_names aarch64_ool_ldadd_names;
+extern const atomic_ool_names aarch64_ool_ldset_names;
+extern const atomic_ool_names aarch64_ool_ldclr_names;
+extern const atomic_ool_names aarch64_ool_ldeor_names;
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
2df5bf3db97d9362155c3c8d9c9d7f14c41b9520..21124b5a3479dd388eb767402e080e2181153467
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14227,6 +14227,82 @@ aarch64_emit_unlikely_jump (rtx insn)
   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
 }
 
+/* We store the names of the various atomic helpers in a 5x4 array.
+   Return the libcall function given MODE, MODEL and NAMES.  */
+
+rtx
+aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
+   const atomic_ool_names *names)
+{
+  memmodel model = memmodel_base (INTVAL (model_rtx));
+  int mode_idx, model_idx;
+
+  switch (mode)
+{
+case E_QImode:
+  mode_idx = 0;
+  break;
+case E_HImode:
+  mode_idx = 1;
+  break;
+case E_SImode:
+  mode_idx = 2;
+  break;
+case E_DImode:
+  mode_idx = 3;
+  break;
+case E_TImode:
+  mode_idx = 4;
+  break;
+default:
+  gcc_unreachable ();
+}
+
+  switch (model)
+{
+case MEMMODEL_RELAXED:
+  model_idx = 0;
+  break;
+case MEMMODEL_CONSUME:
+case MEMMODEL_ACQUIRE:
+  model_idx = 1;
+  break;
+case MEMMODEL_RELEASE:
+  model_idx = 2;
+  break;
+case MEMMODEL_ACQ_REL:
+case MEMMODEL_SEQ_CST:
+  model_idx = 3;
+  break;
+default:
+  gcc_unreachable ();
+}
+
+  return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
+ VISIBILITY_HIDDEN);
+}
+
+#define DEF0(B, N) \
+  { "__aarch64_" #B #N "_relax", \
+"__aarch64_" #B #N "_acq", \
+"__aarch64_" #B #N "_rel", \
+"__aarch64_" #B #N "_acq_rel" }
+
+#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
+{ NULL, NULL, NULL, NULL }
+#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
+
+static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
+const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
+const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
+const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
+const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
+const atomic_ool_names aarch64_ool_ldeor_n

[PATCH 11/19][GCC-8] Add visibility to libfunc constructors

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline.
    2018-10-31  Richard Henderson 

    * optabs-libfuncs.c (build_libfunc_function_visibility):
    New, split out from...
    (build_libfunc_function): ... here.
    (init_one_libfunc_visibility): New, split out from ...
    (init_one_libfunc): ... here.

diff --git a/gcc/optabs-libfuncs.h b/gcc/optabs-libfuncs.h
index 
0669ea1fdd7dc666d28fc0407a2288de86b3918b..cf39da36887516193aa789446ef0b6a7c24fb1ef
 100644
--- a/gcc/optabs-libfuncs.h
+++ b/gcc/optabs-libfuncs.h
@@ -63,7 +63,9 @@ void gen_satfract_conv_libfunc (convert_optab, const char *,
 void gen_satfractuns_conv_libfunc (convert_optab, const char *,
   machine_mode, machine_mode);
 
+tree build_libfunc_function_visibility (const char *, symbol_visibility);
 tree build_libfunc_function (const char *);
+rtx init_one_libfunc_visibility (const char *, symbol_visibility);
 rtx init_one_libfunc (const char *);
 rtx set_user_assembler_libfunc (const char *, const char *);
 
diff --git a/gcc/optabs-libfuncs.c b/gcc/optabs-libfuncs.c
index 
bd0df8baa3711febcbdf2745588d5d43519af72b..73a28e9ca7a1e5b1564861071e0923d8b8219d25
 100644
--- a/gcc/optabs-libfuncs.c
+++ b/gcc/optabs-libfuncs.c
@@ -719,10 +719,10 @@ struct libfunc_decl_hasher : ggc_ptr_hash
 /* A table of previously-created libfuncs, hashed by name.  */
 static GTY (()) hash_table *libfunc_decls;
 
-/* Build a decl for a libfunc named NAME.  */
+/* Build a decl for a libfunc named NAME with visibility VIS.  */
 
 tree
-build_libfunc_function (const char *name)
+build_libfunc_function_visibility (const char *name, symbol_visibility vis)
 {
   /* ??? We don't have any type information; pretend this is "int foo ()".  */
   tree decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
@@ -731,7 +731,7 @@ build_libfunc_function (const char *name)
   DECL_EXTERNAL (decl) = 1;
   TREE_PUBLIC (decl) = 1;
   DECL_ARTIFICIAL (decl) = 1;
-  DECL_VISIBILITY (decl) = VISIBILITY_DEFAULT;
+  DECL_VISIBILITY (decl) = vis;
   DECL_VISIBILITY_SPECIFIED (decl) = 1;
   gcc_assert (DECL_ASSEMBLER_NAME (decl));
 
@@ -742,11 +742,19 @@ build_libfunc_function (const char *name)
   return decl;
 }
 
+/* Build a decl for a libfunc named NAME.  */
+
+tree
+build_libfunc_function (const char *name)
+{
+  return build_libfunc_function_visibility (name, VISIBILITY_DEFAULT);
+}
+
 /* Return a libfunc for NAME, creating one if we don't already have one.
-   The returned rtx is a SYMBOL_REF.  */
+   The decl is given visibility VIS.  The returned rtx is a SYMBOL_REF.  */
 
 rtx
-init_one_libfunc (const char *name)
+init_one_libfunc_visibility (const char *name, symbol_visibility vis)
 {
   tree id, decl;
   hashval_t hash;
@@ -763,12 +771,18 @@ init_one_libfunc (const char *name)
 {
   /* Create a new decl, so that it can be passed to
 targetm.encode_section_info.  */
-  decl = build_libfunc_function (name);
+  decl = build_libfunc_function_visibility (name, vis);
   *slot = decl;
 }
   return XEXP (DECL_RTL (decl), 0);
 }
 
+rtx
+init_one_libfunc (const char *name)
+{
+  return init_one_libfunc_visibility (name, VISIBILITY_DEFAULT);
+}
+
 /* Adjust the assembler name of libfunc NAME to ASMSPEC.  */
 
 rtx


[PATCH 2/19][GCC-8] aarch64: Simplify LSE cas generation

2020-04-16 Thread Andre Vieira (lists)

The cas insn is a single insn, and if expanded properly need not
be split after reload.  Use the proper inputs for the insn.
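As a reference point, this is the kind of source whose expansion is affected;
with -march=armv8-a+lse the expectation (depending on how the result is used)
is a single cas instruction followed by a compare of the returned value, with
nothing left to split after reload:

/* Expands through aarch64_expand_compare_and_swap.  */
int
try_update (int *p, int expected, int desired)
{
  return __atomic_compare_exchange_n (p, &expected, desired, /*weak=*/0,
                                      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}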

2020-04-16  Andre Vieira 

    Backport from mainline.
    2018-10-31  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_expand_compare_and_swap):
    Force oldval into the rval register for TARGET_LSE; emit the compare
    during initial expansion so that it may be deleted if unused.
    (aarch64_gen_atomic_cas): Remove.
    * config/aarch64/atomics.md (aarch64_compare_and_swap_lse):
    Change =&r to +r for operand 0; use match_dup for operand 2;
    remove is_weak and mod_f operands as unused.  Drop the split
    and merge with...
    (aarch64_atomic_cas): ... this pattern's output; remove.
    (aarch64_compare_and_swap_lse): Similarly.
    (aarch64_atomic_cas): Similarly.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
cda2895d28e7496f8fd6c1b365c4bb497b54c323..a03565c3b4e13990dc1a0064f9cbbc38bb109795
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -496,7 +496,6 @@ rtx aarch64_load_tp (rtx);
 
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
-void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx);
 
 bool aarch64_atomic_ldop_supported_p (enum rtx_code);
 void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
20761578fb6051e600299cd58f245774bd457432..c83a9f7ae78d4ed3da6636fce4d1f57c27048756
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14169,17 +14169,19 @@ aarch64_expand_compare_and_swap (rtx operands[])
 {
   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
   machine_mode mode, cmp_mode;
-  typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+  typedef rtx (*gen_split_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+  typedef rtx (*gen_atomic_cas_fn) (rtx, rtx, rtx, rtx);
   int idx;
-  gen_cas_fn gen;
-  const gen_cas_fn split_cas[] =
+  gen_split_cas_fn split_gen;
+  gen_atomic_cas_fn atomic_gen;
+  const gen_split_cas_fn split_cas[] =
   {
 gen_aarch64_compare_and_swapqi,
 gen_aarch64_compare_and_swaphi,
 gen_aarch64_compare_and_swapsi,
 gen_aarch64_compare_and_swapdi
   };
-  const gen_cas_fn atomic_cas[] =
+  const gen_atomic_cas_fn atomic_cas[] =
   {
 gen_aarch64_compare_and_swapqi_lse,
 gen_aarch64_compare_and_swaphi_lse,
@@ -14238,14 +14240,29 @@ aarch64_expand_compare_and_swap (rtx operands[])
   gcc_unreachable ();
 }
   if (TARGET_LSE)
-gen = atomic_cas[idx];
+{
+  atomic_gen = atomic_cas[idx];
+  /* The CAS insn requires oldval and rval overlap, but we need to
+have a copy of oldval saved across the operation to tell if
+the operation is successful.  */
+  if (mode == QImode || mode == HImode)
+   rval = copy_to_mode_reg (SImode, gen_lowpart (SImode, oldval));
+  else if (reg_overlap_mentioned_p (rval, oldval))
+rval = copy_to_mode_reg (mode, oldval);
+  else
+   emit_move_insn (rval, oldval);
+  emit_insn (atomic_gen (rval, mem, newval, mod_s));
+  aarch64_gen_compare_reg (EQ, rval, oldval);
+}
   else
-gen = split_cas[idx];
-
-  emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
+{
+  split_gen = split_cas[idx];
+  emit_insn (split_gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
+}
 
   if (mode == QImode || mode == HImode)
-emit_move_insn (operands[1], gen_lowpart (mode, rval));
+rval = gen_lowpart (mode, rval);
+  emit_move_insn (operands[1], rval);
 
   x = gen_rtx_REG (CCmode, CC_REGNUM);
   x = gen_rtx_EQ (SImode, x, const0_rtx);
@@ -14295,42 +14312,6 @@ aarch64_emit_post_barrier (enum memmodel model)
 }
 }
 
-/* Emit an atomic compare-and-swap operation.  RVAL is the destination register
-   for the data in memory.  EXPECTED is the value expected to be in memory.
-   DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
-   is the memory ordering to use.  */
-
-void
-aarch64_gen_atomic_cas (rtx rval, rtx mem,
-   rtx expected, rtx desired,
-   rtx model)
-{
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-  machine_mode mode;
-
-  mode = GET_MODE (mem);
-
-  switch (mode)
-{
-case E_QImode: gen = gen_aarch64_atomic_casqi; break;
-case E_HImode: gen = gen_aarch64_atomic_cashi; break;
-case E_SImode: gen = gen_aarch64_atomic_cassi; break;
-case E_DImode: gen = gen_aarch64_atomic_casdi; break;
-default:
-  gcc_unreachable ();
-}
-
-  /* Move the expected value into the CAS destination register.  */
-  emit_insn (gen_rtx_SET (rval, expected));
-
-  /* Emit the CAS.  */
-  emit_insn (gen (rval, mem, desired, model));
-
-  /* Compare the expected value with the value loaded by the CAS, to establish
- wh

[PATCH 6/19][GCC-8] aarch64: Remove early clobber from ATOMIC_LDOP scratch

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline.
    2018-10-31  Richard Henderson 

    * config/aarch64/atomics.md (aarch64_atomic_<atomic_ldop><mode>_lse):
    scratch register need not be early-clobber.  Document the reason
    why we cannot use ST<OP>.

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 
47a8a40c5b82e349b2caf4e48f9f81577f4c3ed3..d740f4a100b1b624eafdb279f38ac1ce9db587dd
 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -263,6 +263,18 @@
   }
 )
 
+;; It is tempting to want to use ST for relaxed and release
+;; memory models here.  However, that is incompatible with the
+;; C++ memory model for the following case:
+;;
+;; atomic_fetch_add(ptr, 1, memory_order_relaxed);
+;; atomic_thread_fence(memory_order_acquire);
+;;
+;; The problem is that the architecture says that ST (and LD
+;; insns where the destination is XZR) are not regarded as a read.
+;; However we also implement the acquire memory barrier with DMB LD,
+;; and so the ST is not blocked by the barrier.
+
 (define_insn "aarch64_atomic__lse"
   [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
(unspec_volatile:ALLI
@@ -270,7 +282,7 @@
   (match_operand:ALLI 1 "register_operand" "r")
   (match_operand:SI 2 "const_int_operand")]
   ATOMIC_LDOP))
-   (clobber (match_scratch:ALLI 3 "=&r"))]
+   (clobber (match_scratch:ALLI 3 "=r"))]
   "TARGET_LSE"
   {
enum memmodel model = memmodel_from_int (INTVAL (operands[2]));


[PATCH 4/19][GCC-8] aarch64: Improve swp generation

2020-04-16 Thread Andre Vieira (lists)

Allow zero as an input; fix constraints; avoid unnecessary split.
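The "zero as an input" part can be seen from user code like the following;
with the rZ constraint the stored zero can come straight from wzr/xzr instead
of being forced into a scratch register first.  Illustrative only; the exact
swp/swpa/swpl/swpal form depends on the memory model:

/* Atomically read the old value and write 0.  */
int
take_flag (int *flag)
{
  return __atomic_exchange_n (flag, 0, __ATOMIC_ACQ_REL);
}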

2020-04-16  Andre Vieira 

    Backport from mainline.
    2018-10-31  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_emit_atomic_swap): Remove.
    (aarch64_gen_atomic_ldop): Don't call it.
    * config/aarch64/atomics.md (atomic_exchange):
    Use aarch64_reg_or_zero.
    (aarch64_atomic_exchange): Likewise.
    (aarch64_atomic_exchange_lse): Remove split; remove & from
    operand 0; use aarch64_reg_or_zero for input; merge ...
    (aarch64_atomic_swp): ... this and remove.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
b6a6e314153ecf4a7ae1b83cfb64e6192197edc5..bac69474598ff19161b72748505151b0d6185a9b
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14454,27 +14454,6 @@ aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, 
rtx s2, int shift)
   emit_insn (gen (dst, s2, shift_rtx, s1));
 }
 
-/* Emit an atomic swap.  */
-
-static void
-aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
- rtx mem, rtx model)
-{
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-
-  switch (mode)
-{
-case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
-case E_HImode: gen = gen_aarch64_atomic_swphi; break;
-case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
-case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
-default:
-  gcc_unreachable ();
-}
-
-  emit_insn (gen (dst, mem, value, model));
-}
-
 /* Operations supported by aarch64_emit_atomic_load_op.  */
 
 enum aarch64_atomic_load_op_code
@@ -14587,10 +14566,6 @@ aarch64_gen_atomic_ldop (enum rtx_code code, rtx 
out_data, rtx out_result,
  a SET then emit a swap instruction and finish.  */
   switch (code)
 {
-case SET:
-  aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
-  return;
-
 case MINUS:
   /* Negate the value and treat it as a PLUS.  */
   {
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 
b0e84b8addd809598b3e358a265b86582ce96462..6cc14fbf6c103ab19e6c201333a9eba06b90c469
 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -136,7 +136,7 @@
 (define_expand "atomic_exchange"
  [(match_operand:ALLI 0 "register_operand" "")
   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "")
-  (match_operand:ALLI 2 "register_operand" "")
+  (match_operand:ALLI 2 "aarch64_reg_or_zero" "")
   (match_operand:SI 3 "const_int_operand" "")]
   ""
   {
@@ -156,10 +156,10 @@
 
 (define_insn_and_split "aarch64_atomic_exchange"
   [(set (match_operand:ALLI 0 "register_operand" "=&r");; 
output
-(match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
+(match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
(set (match_dup 1)
 (unspec_volatile:ALLI
-  [(match_operand:ALLI 2 "register_operand" "r")   ;; input
+  [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")   ;; input
(match_operand:SI 3 "const_int_operand" "")];; model
   UNSPECV_ATOMIC_EXCHG))
(clobber (reg:CC CC_REGNUM))
@@ -175,22 +175,25 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_exchange_lse"
-  [(set (match_operand:ALLI 0 "register_operand" "=&r")
+(define_insn "aarch64_atomic_exchange_lse"
+  [(set (match_operand:ALLI 0 "register_operand" "=r")
 (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
(set (match_dup 1)
 (unspec_volatile:ALLI
-  [(match_operand:ALLI 2 "register_operand" "r")
+  [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")
(match_operand:SI 3 "const_int_operand" "")]
   UNSPECV_ATOMIC_EXCHG))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
   {
-aarch64_gen_atomic_ldop (SET, operands[0], NULL, operands[1],
-operands[2], operands[3]);
-DONE;
+enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+if (is_mm_relaxed (model))
+  return "swp\t%2, %0, %1";
+else if (is_mm_acquire (model) || is_mm_consume (model))
+  return "swpa\t%2, %0, %1";
+else if (is_mm_release (model))
+  return "swpl\t%2, %0, %1";
+else
+  return "swpal\t%2, %0, %1";
   }
 )
 
@@ -582,28 +585,6 @@
 
 ;; ARMv8.1-A LSE instructions.
 
-;; Atomic swap with memory.
-(define_insn "aarch64_atomic_swp"
- [(set (match_operand:ALLI 0 "register_operand" "+&r")
-   (match_operand:A

[PATCH 5/19][GCC-8] aarch64: Improve atomic-op lse generation

2020-04-16 Thread Andre Vieira (lists)

Fix constraints; avoid unnecessary split.  Drop the use of the atomic_op
iterator in favor of the ATOMIC_LDOP iterator; this is simpler and more
logical for ldclr (aka bic).
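The ldclr/bic remark refers to what the instruction computes: LDCLR returns
the old value and stores old & ~operand, so an __atomic_fetch_and has its
mask inverted on the way to the instruction.  A small, hedged example of
source that should now expand directly, with no post-reload split:

unsigned int
clear_bits (unsigned int *p, unsigned int mask)
{
  /* Clears the bits in 'mask'.  The builtin's argument is ~mask, and the
     compiler inverts it again, so the LDCLR operand is semantically
     'mask' itself.  */
  return __atomic_fetch_and (p, ~mask, __ATOMIC_RELAXED);
}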

2020-04-16  Andre Vieira 

    Backport from mainline.
    2018-10-31  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_emit_bic): Remove.
    (aarch64_atomic_ldop_supported_p): Remove.
    (aarch64_gen_atomic_ldop): Remove.
    * config/aarch64/atomic.md (atomic_):
    Fully expand LSE operations here.
    (atomic_fetch_): Likewise.
    (atomic__fetch): Likewise.
    (aarch64_atomic__lse): Drop atomic_op iterator
    and use ATOMIC_LDOP instead; use register_operand for the input;
    drop the split and emit insns directly.
    (aarch64_atomic_fetch__lse): Likewise.
    (aarch64_atomic__fetch_lse): Remove.
    (aarch64_atomic_load): Remove.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
a03565c3b4e13990dc1a0064f9cbbc38bb109795..da68ce0e7d096bf4a512c2b8ef52bf236f8f76f4
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -497,8 +497,6 @@ rtx aarch64_load_tp (rtx);
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
 
-bool aarch64_atomic_ldop_supported_p (enum rtx_code);
-void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
 void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
 
 bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
bac69474598ff19161b72748505151b0d6185a9b..1068cfd899a759c506e3217e1e2c19cd778b4372
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14292,32 +14292,6 @@ aarch64_expand_compare_and_swap (rtx operands[])
   emit_insn (gen_rtx_SET (bval, x));
 }
 
-/* Test whether the target supports using a atomic load-operate instruction.
-   CODE is the operation and AFTER is TRUE if the data in memory after the
-   operation should be returned and FALSE if the data before the operation
-   should be returned.  Returns FALSE if the operation isn't supported by the
-   architecture.  */
-
-bool
-aarch64_atomic_ldop_supported_p (enum rtx_code code)
-{
-  if (!TARGET_LSE)
-return false;
-
-  switch (code)
-{
-case SET:
-case AND:
-case IOR:
-case XOR:
-case MINUS:
-case PLUS:
-  return true;
-default:
-  return false;
-}
-}
-
 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
sequence implementing an atomic operation.  */
 
@@ -14435,227 +14409,6 @@ aarch64_split_compare_and_swap (rtx operands[])
 aarch64_emit_post_barrier (model);
 }
 
-/* Emit a BIC instruction.  */
-
-static void
-aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
-{
-  rtx shift_rtx = GEN_INT (shift);
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-
-  switch (mode)
-{
-case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
-case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
-default:
-  gcc_unreachable ();
-}
-
-  emit_insn (gen (dst, s2, shift_rtx, s1));
-}
-
-/* Operations supported by aarch64_emit_atomic_load_op.  */
-
-enum aarch64_atomic_load_op_code
-{
-  AARCH64_LDOP_PLUS,   /* A + B  */
-  AARCH64_LDOP_XOR,/* A ^ B  */
-  AARCH64_LDOP_OR, /* A | B  */
-  AARCH64_LDOP_BIC /* A & ~B  */
-};
-
-/* Emit an atomic load-operate.  */
-
-static void
-aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
-machine_mode mode, rtx dst, rtx src,
-rtx mem, rtx model)
-{
-  typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
-  const aarch64_atomic_load_op_fn plus[] =
-  {
-gen_aarch64_atomic_loadaddqi,
-gen_aarch64_atomic_loadaddhi,
-gen_aarch64_atomic_loadaddsi,
-gen_aarch64_atomic_loadadddi
-  };
-  const aarch64_atomic_load_op_fn eor[] =
-  {
-gen_aarch64_atomic_loadeorqi,
-gen_aarch64_atomic_loadeorhi,
-gen_aarch64_atomic_loadeorsi,
-gen_aarch64_atomic_loadeordi
-  };
-  const aarch64_atomic_load_op_fn ior[] =
-  {
-gen_aarch64_atomic_loadsetqi,
-gen_aarch64_atomic_loadsethi,
-gen_aarch64_atomic_loadsetsi,
-gen_aarch64_atomic_loadsetdi
-  };
-  const aarch64_atomic_load_op_fn bic[] =
-  {
-gen_aarch64_atomic_loadclrqi,
-gen_aarch64_atomic_loadclrhi,
-gen_aarch64_atomic_loadclrsi,
-gen_aarch64_atomic_loadclrdi
-  };
-  aarch64_atomic_load_op_fn gen;
-  int idx = 0;
-
-  switch (mode)
-{
-case E_QImode: idx = 0; break;
-case E_HImode: idx = 1; break;
-case E_SImode: idx = 2; break;
-case E_DImode: idx = 3; break;
-default:
-  gcc_unreachable ();
-}
-
-  switch (code)
-{
-case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
-case AARCH64_LDOP_XOR: gen = eor[idx]; break;
-case AARCH64_LDOP_OR: gen = ior[idx]; break;

[PATCH 3/19] aarch64: Improve cas generation

2020-04-16 Thread Andre Vieira (lists)

Do not zero-extend the input to the cas for subword operations;
instead, use the appropriate zero-extending compare insns.
Correct the predicates and constraints for immediate expected operand.
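For the subword case the point is that the hardware CAS hands back the old
value zero-extended in a W register, so the success test can use the
zero-extending compare (the CC_SWP path added below) instead of widening the
expected value up front.  Code of this shape exercises it; the example is
hedged and not taken from the testsuite:

#include <stdint.h>

/* 16-bit (HImode) compare-and-swap.  */
int
try_claim (uint16_t *slot, uint16_t expected, uint16_t owner)
{
  return __atomic_compare_exchange_n (slot, &expected, owner, /*weak=*/1,
                                      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}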

2020-04-16  Andre Vieira 

    Backport from mainline.
    2018-10-31  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_gen_compare_reg_maybe_ze): New.
    (aarch64_split_compare_and_swap): Use it.
    (aarch64_expand_compare_and_swap): Likewise.  Remove convert_modes;
    test oldval against the proper predicate.
    * config/aarch64/atomics.md (atomic_compare_and_swap):
    Use nonmemory_operand for expected.
    (cas_short_expected_pred): New.
    (aarch64_compare_and_swap): Use it; use "rn" not "rI" to match.
    (aarch64_compare_and_swap): Use "rn" not "rI" for expected.
    * config/aarch64/predicates.md (aarch64_plushi_immediate): New.
    (aarch64_plushi_operand): New.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
c83a9f7ae78d4ed3da6636fce4d1f57c27048756..b6a6e314153ecf4a7ae1b83cfb64e6192197edc5
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1524,6 +1524,33 @@ aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
   return cc_reg;
 }
 
+/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
+
+static rtx
+aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
+  machine_mode y_mode)
+{
+  if (y_mode == E_QImode || y_mode == E_HImode)
+{
+  if (CONST_INT_P (y))
+   y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+  else
+   {
+ rtx t, cc_reg;
+ machine_mode cc_mode;
+
+ t = gen_rtx_ZERO_EXTEND (SImode, y);
+ t = gen_rtx_COMPARE (CC_SWPmode, t, x);
+ cc_mode = CC_SWPmode;
+ cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+ emit_set_insn (cc_reg, t);
+ return cc_reg;
+   }
+}
+
+  return aarch64_gen_compare_reg (code, x, y);
+}
+
 /* Build the SYMBOL_REF for __tls_get_addr.  */
 
 static GTY(()) rtx tls_get_addr_libfunc;
@@ -14167,20 +14194,11 @@ aarch64_emit_unlikely_jump (rtx insn)
 void
 aarch64_expand_compare_and_swap (rtx operands[])
 {
-  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
-  machine_mode mode, cmp_mode;
-  typedef rtx (*gen_split_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
+  machine_mode mode, r_mode;
   typedef rtx (*gen_atomic_cas_fn) (rtx, rtx, rtx, rtx);
   int idx;
-  gen_split_cas_fn split_gen;
   gen_atomic_cas_fn atomic_gen;
-  const gen_split_cas_fn split_cas[] =
-  {
-gen_aarch64_compare_and_swapqi,
-gen_aarch64_compare_and_swaphi,
-gen_aarch64_compare_and_swapsi,
-gen_aarch64_compare_and_swapdi
-  };
   const gen_atomic_cas_fn atomic_cas[] =
   {
 gen_aarch64_compare_and_swapqi_lse,
@@ -14198,36 +14216,19 @@ aarch64_expand_compare_and_swap (rtx operands[])
   mod_s = operands[6];
   mod_f = operands[7];
   mode = GET_MODE (mem);
-  cmp_mode = mode;
 
   /* Normally the succ memory model must be stronger than fail, but in the
  unlikely event of fail being ACQUIRE and succ being RELEASE we need to
  promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
-
   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
   && is_mm_release (memmodel_from_int (INTVAL (mod_s
 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
 
-  switch (mode)
+  r_mode = mode;
+  if (mode == QImode || mode == HImode)
 {
-case E_QImode:
-case E_HImode:
-  /* For short modes, we're going to perform the comparison in SImode,
-so do the zero-extension now.  */
-  cmp_mode = SImode;
-  rval = gen_reg_rtx (SImode);
-  oldval = convert_modes (SImode, mode, oldval, true);
-  /* Fall through.  */
-
-case E_SImode:
-case E_DImode:
-  /* Force the value into a register if needed.  */
-  if (!aarch64_plus_operand (oldval, mode))
-   oldval = force_reg (cmp_mode, oldval);
-  break;
-
-default:
-  gcc_unreachable ();
+  r_mode = SImode;
+  rval = gen_reg_rtx (r_mode);
 }
 
   switch (mode)
@@ -14245,27 +14246,49 @@ aarch64_expand_compare_and_swap (rtx operands[])
   /* The CAS insn requires oldval and rval overlap, but we need to
 have a copy of oldval saved across the operation to tell if
 the operation is successful.  */
-  if (mode == QImode || mode == HImode)
-   rval = copy_to_mode_reg (SImode, gen_lowpart (SImode, oldval));
-  else if (reg_overlap_mentioned_p (rval, oldval))
-rval = copy_to_mode_reg (mode, oldval);
+  if (reg_overlap_mentioned_p (rval, oldval))
+rval = copy_to_mode_reg (r_mode, oldval);
   else
-   emit_move_insn (rval, oldval);
+   emit_move_insn (rval, gen_lowpart (r_mode, oldval));
+
   emit_insn (atomic_gen (rval, mem, 

[PATCH 1/19][GCC-8] aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]

2020-04-16 Thread Andre Vieira (lists)

gcc/ChangeLog:
2020-04-16  Andre Vieira 

    Backport from mainline.
    2018-07-16  Ramana Radhakrishnan 

    * config/aarch64/atomics.md (aarch64_store_exclusive<mode>): Add
    early clobber.

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 
686e39ff2ee5940e9e93d0c2b802b46ff9f2c4e4..fba5ec6db5832a184b0323e62041f9c473761bae
 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -530,7 +530,7 @@
 )
 
 (define_insn "aarch64_store_exclusive"
-  [(set (match_operand:SI 0 "register_operand" "=r")
+  [(set (match_operand:SI 0 "register_operand" "=&r")
 (unspec_volatile:SI [(const_int 0)] UNSPECV_SX))
(set (match_operand:ALLI 1 "aarch64_sync_memory_operand" "=Q")
 (unspec_volatile:ALLI


[PATCH 0/19][GCC-8] aarch64: Backport outline atomics

2020-04-16 Thread Andre Vieira (lists)

Hi,

This series backports all the patches and fixes regarding outline 
atomics to the gcc-8 branch.


Bootstrapped the series for aarch64-linux-gnu and regression tested.
Is this OK for gcc-8?

Andre Vieira (19):
aarch64: Add early clobber for aarch64_store_exclusive
aarch64: Simplify LSE cas generation
aarch64: Improve cas generation
aarch64: Improve swp generation
aarch64: Improve atomic-op lse generation
aarch64: Remove early clobber from ATOMIC_LDOP scratch
aarch64: Extend %R for integer registers
aarch64: Implement TImode compare-and-swap
aarch64: Tidy aarch64_split_compare_and_swap
aarch64: Add out-of-line functions for LSE atomics
Add visibility to libfunc constructors
aarch64: Implement -moutline-atomics
Aarch64: Fix shrinkwrapping interactions with atomics (PR92692)
aarch64: Fix store-exclusive in load-operate LSE helpers
aarch64: Configure for sys/auxv.h in libgcc for lse-init.c
aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]
aarch64: Fix bootstrap with old binutils [PR93053]
aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]
re PR target/90724 (ICE with __sync_bool_compare_and_swap with 
-march=armv8.2-a+sve)




[PATCH][GCC][Arm]: MVE: Add mve vec_duplicate pattern

2020-04-15 Thread Andre Vieira (lists)

Hi,

This patch fixes an ICE we were seeing due to a missing vec_duplicate 
pattern.


Regression tested on arm-none-eabi.

Is this OK for trunk?

gcc/ChangeLog:
2020-04-15  Andre Vieira  

    * config/arm/mve.md (mve_vec_duplicate): New pattern.
    (V_sz_elem2): Remove unused mode attribute.

gcc/testsuite/ChangeLog:
2020-04-15  Andre Vieira 
    Srinath Parvathaneni 

    * gcc.target/arm/mve/intrinsics/mve_vec_duplicate.c: New test.

diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 
c49c14c4240838ce086f424f58726e2e94cf190e..047b4769a28daebdc0175804c578a0d11830a291
 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -17,8 +17,6 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; <http://www.gnu.org/licenses/>.
 
-(define_mode_attr V_sz_elem2 [(V16QI "s8") (V8HI "u16") (V4SI "u32")
- (V2DI "u64")])
 (define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF])
 (define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF])
 (define_mode_iterator MVE_0 [V8HF V4SF])
@@ -11301,3 +11299,10 @@ (define_insn "mve_vshlcq_m_"
  "vpst\;vshlct\t%q0, %1, %4"
  [(set_attr "type" "mve_move")
   (set_attr "length" "8")])
+
+(define_insn "*mve_vec_duplicate"
+ [(set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w")
+   (vec_duplicate:MVE_VLD_ST (match_operand: 1 "general_operand" 
"r")))]
+ "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT"
+ "vdup.\t%q0, %1"
+ [(set_attr "type" "mve_move")])
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vec_duplicate.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vec_duplicate.c
new file mode 100644
index 
..eda836151b3a16eb54ddebabf185be3cd8980acc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vec_duplicate.c
@@ -0,0 +1,13 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+float32x4_t a;
+
+void foo (void)
+{
+  a = 1.41176471f - 0.47058824f * a;
+}
+

