Re: [PATCH-3v2] Value Range: Add range op for builtin isnormal

2024-06-19 Thread HAO CHEN GUI
Hi,
  Gently ping it.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653095.html

Thanks
Gui Haochen

在 2024/5/30 10:46, HAO CHEN GUI 写道:
> Hi,
>   This patch adds the range op for builtin isnormal. It also adds two
> helper functions in frange to detect the range of normal floating-point
> numbers and the range of subnormal or zero values.
> 
>   Compared to previous version, the main change is to set the range to
> 1 if it's normal number otherwise to 0.
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/652221.html
> 
>   Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions. Is it OK for the trunk?
> 
> Thanks
> Gui Haochen
> 
> ChangeLog
> Value Range: Add range op for builtin isnormal
> 
> The former patch adds optab for builtin isnormal. Thus builtin isnormal
> might not be folded at front end.  So the range op for isnormal is needed
> for value range analysis.  This patch adds range op for builtin isnormal.
> 
> gcc/
>   * gimple-range-op.cc (class cfn_isnormal): New.
>   (op_cfn_isnormal): New variable.
>   (gimple_range_op_handler::maybe_builtin_call): Handle
>   CFN_BUILT_IN_ISNORMAL.
>   * value-range.h (class frange): Declare known_isnormal and
>   known_isdenormal_or_zero.
>   (frange::known_isnormal): Define.
>   (frange::known_isdenormal_or_zero): Define.
> 
> gcc/testsuite/
>   * gcc.dg/tree-ssa/range-isnormal.c: New test.
> 
> patch.diff
> diff --git a/gcc/gimple-range-op.cc b/gcc/gimple-range-op.cc
> index 5ec5c828fa4..6787f532f11 100644
> --- a/gcc/gimple-range-op.cc
> +++ b/gcc/gimple-range-op.cc
> @@ -1289,6 +1289,61 @@ public:
>}
>  } op_cfn_isfinite;
> 
> +//Implement range operator for CFN_BUILT_IN_ISNORMAL
> +class cfn_isnormal :  public range_operator
> +{
> +public:
> +  using range_operator::fold_range;
> +  using range_operator::op1_range;
> +  virtual bool fold_range (irange &r, tree type, const frange &op1,
> +const irange &, relation_trio) const override
> +  {
> +if (op1.undefined_p ())
> +  return false;
> +
> +if (op1.known_isnormal ())
> +  {
> + wide_int one = wi::one (TYPE_PRECISION (type));
> + r.set (type, one, one);
> + return true;
> +  }
> +
> +if (op1.known_isnan ()
> + || op1.known_isinf ()
> + || op1.known_isdenormal_or_zero ())
> +  {
> + r.set_zero (type);
> + return true;
> +  }
> +
> +r.set_varying (type);
> +return true;
> +  }
> +  virtual bool op1_range (frange &r, tree type, const irange &lhs,
> +   const frange &, relation_trio) const override
> +  {
> +if (lhs.undefined_p ())
> +  return false;
> +
> +if (lhs.zero_p ())
> +  {
> + r.set_varying (type);
> + return true;
> +  }
> +
> +if (!range_includes_zero_p (lhs))
> +  {
> + nan_state nan (false);
> + r.set (type, real_min_representable (type),
> +real_max_representable (type), nan);
> + return true;
> +  }
> +
> +r.set_varying (type);
> +return true;
> +  }
> +} op_cfn_isnormal;
> +
>  // Implement range operator for CFN_BUILT_IN_
>  class cfn_parity : public range_operator
>  {
> @@ -1391,6 +1446,11 @@ gimple_range_op_handler::maybe_builtin_call ()
>m_operator = &op_cfn_isfinite;
>break;
> 
> +case CFN_BUILT_IN_ISNORMAL:
> +  m_op1 = gimple_call_arg (call, 0);
> +  m_operator = &op_cfn_isnormal;
> +  break;
> +
>  CASE_CFN_COPYSIGN_ALL:
>m_op1 = gimple_call_arg (call, 0);
>m_op2 = gimple_call_arg (call, 1);
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/range-isnormal.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/range-isnormal.c
> new file mode 100644
> index 000..c4df4d839b0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/range-isnormal.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-evrp" } */
> +
> +#include 
> +void link_error();
> +
> +void test1 (double x)
> +{
> +  if (x < __DBL_MAX__ && x > __DBL_MIN__ && !__builtin_isnormal (x))
> +link_error ();
> +
> +  if (x < -__DBL_MIN__ && x > -__DBL_MAX__ && !__builtin_isnormal (x))
> +link_error ();
> +}
> +
> +void test2 (float x)
> +{
> +  if (x < __FLT_MAX__ && x > __FLT_MIN__ && !__builtin_isnormal (x))
> +link_error ();
> +
> +  if (x < -__FLT_MIN__ && x > - __FLT_MAX__ && !__builtin_isnormal (x))
> +link_error ();
> +}
> +
> +void test3 (double x)
> +{
> +  if (__builtin_isnormal (x) && __builtin_isinf (x))
> +link_error ();
> +}
> +
> +void test4 (float x)
> +{
> +  if (__builtin_isnormal (x) && __builtin_isinf (x))
> +link_error ();
> +}
> +
> +/* { dg-final { scan-tree-dump-not "link_error" "evrp" } } */
> diff --git a/gcc/value-range.h b/gcc/value-range.h
> index 37ce91dc52d..1443d1906e5 100644
> --- a/gcc/value-range.h
> +++ b/gcc/value-range.h
> @@ -588,6 +588,8 @@ public:
>bool maybe_isinf () const;
>bool signbit_p (bool &signbit) const;
>bool nan_signbit_p (bool &si

Re: [PATCH-2v4] Value Range: Add range op for builtin isfinite

2024-06-19 Thread HAO CHEN GUI
Hi,
  Gently ping it.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653094.html

Thanks
Gui Haochen

在 2024/5/30 10:46, HAO CHEN GUI 写道:
> Hi,
>   This patch adds the range op for builtin isfinite.
> 
>   Compared to previous version, the main change is to set the range to
> 1 if it's finite number otherwise to 0.
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/652220.html
> 
>   Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions. Is it OK for the trunk?
> 
> Thanks
> Gui Haochen
> 
> ChangeLog
> Value Range: Add range op for builtin isfinite
> 
> The former patch adds optab for builtin isfinite. Thus builtin isfinite
> might not be folded at front end.  So the range op for isfinite is needed
> for value range analysis.  This patch adds range op for builtin isfinite.
> 
> gcc/
>   * gimple-range-op.cc (class cfn_isfinite): New.
>   (op_cfn_isfinite): New variable.
>   (gimple_range_op_handler::maybe_builtin_call): Handle
>   CFN_BUILT_IN_ISFINITE.
> 
> gcc/testsuite/
>   * gcc.dg/tree-ssa/range-isfinite.c: New test.
> 
> patch.diff
> diff --git a/gcc/gimple-range-op.cc b/gcc/gimple-range-op.cc
> index 4e60a42eaac..5ec5c828fa4 100644
> --- a/gcc/gimple-range-op.cc
> +++ b/gcc/gimple-range-op.cc
> @@ -1233,6 +1233,62 @@ public:
>}
>  } op_cfn_isinf;
> 
> +//Implement range operator for CFN_BUILT_IN_ISFINITE
> +class cfn_isfinite : public range_operator
> +{
> +public:
> +  using range_operator::fold_range;
> +  using range_operator::op1_range;
> +  virtual bool fold_range (irange &r, tree type, const frange &op1,
> +const irange &, relation_trio) const override
> +  {
> +if (op1.undefined_p ())
> +  return false;
> +
> +if (op1.known_isfinite ())
> +  {
> + wide_int one = wi::one (TYPE_PRECISION (type));
> + r.set (type, one, one);
> + return true;
> +  }
> +
> +if (op1.known_isnan ()
> + || op1.known_isinf ())
> +  {
> + r.set_zero (type);
> + return true;
> +  }
> +
> +r.set_varying (type);
> +return true;
> +  }
> +  virtual bool op1_range (frange &r, tree type, const irange &lhs,
> +   const frange &, relation_trio) const override
> +  {
> +if (lhs.undefined_p ())
> +  return false;
> +
> +if (lhs.zero_p ())
> +  {
> + // The range is [-INF,-INF][+INF,+INF] NAN, but it can't be represented.
> + // Set range to varying
> + r.set_varying (type);
> + return true;
> +  }
> +
> +if (!range_includes_zero_p (lhs))
> +  {
> + nan_state nan (false);
> + r.set (type, real_min_representable (type),
> +real_max_representable (type), nan);
> + return true;
> +  }
> +
> +r.set_varying (type);
> +return true;
> +  }
> +} op_cfn_isfinite;
> +
>  // Implement range operator for CFN_BUILT_IN_
>  class cfn_parity : public range_operator
>  {
> @@ -1330,6 +1386,11 @@ gimple_range_op_handler::maybe_builtin_call ()
>m_operator = &op_cfn_isinf;
>break;
> 
> +case CFN_BUILT_IN_ISFINITE:
> +  m_op1 = gimple_call_arg (call, 0);
> +  m_operator = &op_cfn_isfinite;
> +  break;
> +
>  CASE_CFN_COPYSIGN_ALL:
>m_op1 = gimple_call_arg (call, 0);
>m_op2 = gimple_call_arg (call, 1);
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/range-isfinite.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/range-isfinite.c
> new file mode 100644
> index 000..f5dce0a0486
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/range-isfinite.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-evrp" } */
> +
> +#include 
> +void link_error();
> +
> +void test1 (double x)
> +{
> +  if (x < __DBL_MAX__ && x > -__DBL_MAX__ && !__builtin_isfinite (x))
> +link_error ();
> +}
> +
> +void test2 (float x)
> +{
> +  if (x < __FLT_MAX__ && x > -__FLT_MAX__ && !__builtin_isfinite (x))
> +link_error ();
> +}
> +
> +void test3 (double x)
> +{
> +  if (__builtin_isfinite (x) && __builtin_isinf (x))
> +link_error ();
> +}
> +
> +void test4 (float x)
> +{
> +  if (__builtin_isfinite (x) && __builtin_isinf (x))
> +link_error ();
> +}
> +
> +/* { dg-final { scan-tree-dump-not "link_error" "evrp" } } */


Ping [PATCH-1v3] Value Range: Add range op for builtin isinf

2024-06-19 Thread HAO CHEN GUI
Hi,
  Gently ping it.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653096.html

Thanks
Gui Haochen

在 2024/5/30 10:46, HAO CHEN GUI 写道:
> Hi,
>   The builtin isinf is not folded at front end if the corresponding optab
> exists. It causes the range evaluation to fail on targets which have
> optab_isinf. For instance, range-sincos.c will fail on targets which
> have optab_isinf as it calls builtin_isinf.
> 
>   This patch fixed the problem by adding range op for builtin isinf.
> 
>   Compared with previous version, the main change is to set the range to
> 1 if it's infinite number otherwise to 0.
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/652219.html
> 
>   Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions. Is it OK for the trunk?
> 
> Thanks
> Gui Haochen
> 
> 
> ChangeLog
> Value Range: Add range op for builtin isinf
> 
> The builtin isinf is not folded at front end if the corresponding optab
> exists.  So the range op for isinf is needed for value range analysis.
> This patch adds range op for builtin isinf.
> 
> gcc/
>   * gimple-range-op.cc (class cfn_isinf): New.
>   (op_cfn_isinf): New variable.
>   (gimple_range_op_handler::maybe_builtin_call): Handle
>   CASE_FLT_FN (BUILT_IN_ISINF).
> 
> gcc/testsuite/
>   * gcc.dg/tree-ssa/range-isinf.c: New test.
> 
> patch.diff
> diff --git a/gcc/gimple-range-op.cc b/gcc/gimple-range-op.cc
> index 55dfbb23ce2..4e60a42eaac 100644
> --- a/gcc/gimple-range-op.cc
> +++ b/gcc/gimple-range-op.cc
> @@ -1175,6 +1175,63 @@ private:
>bool m_is_pos;
>  } op_cfn_goacc_dim_size (false), op_cfn_goacc_dim_pos (true);
> 
> +// Implement range operator for CFN_BUILT_IN_ISINF
> +class cfn_isinf : public range_operator
> +{
> +public:
> +  using range_operator::fold_range;
> +  using range_operator::op1_range;
> +  virtual bool fold_range (irange &r, tree type, const frange &op1,
> +const irange &, relation_trio) const override
> +  {
> +if (op1.undefined_p ())
> +  return false;
> +
> +if (op1.known_isinf ())
> +  {
> + wide_int one = wi::one (TYPE_PRECISION (type));
> + r.set (type, one, one);
> + return true;
> +  }
> +
> +if (op1.known_isnan ()
> + || (!real_isinf (&op1.lower_bound ())
> + && !real_isinf (&op1.upper_bound (
> +  {
> + r.set_zero (type);
> + return true;
> +  }
> +
> +r.set_varying (type);
> +return true;
> +  }
> +  virtual bool op1_range (frange &r, tree type, const irange &lhs,
> +   const frange &, relation_trio) const override
> +  {
> +if (lhs.undefined_p ())
> +  return false;
> +
> +if (lhs.zero_p ())
> +  {
> + nan_state nan (true);
> + r.set (type, real_min_representable (type),
> +real_max_representable (type), nan);
> + return true;
> +  }
> +
> +if (!range_includes_zero_p (lhs))
> +  {
> + // The range is [-INF,-INF][+INF,+INF], but it can't be represented.
> + // Set range to [-INF,+INF]
> + r.set_varying (type);
> + r.clear_nan ();
> + return true;
> +  }
> +
> +r.set_varying (type);
> +return true;
> +  }
> +} op_cfn_isinf;
> 
>  // Implement range operator for CFN_BUILT_IN_
>  class cfn_parity : public range_operator
> @@ -1268,6 +1325,11 @@ gimple_range_op_handler::maybe_builtin_call ()
>m_operator = &op_cfn_signbit;
>break;
> 
> +CASE_FLT_FN (BUILT_IN_ISINF):
> +  m_op1 = gimple_call_arg (call, 0);
> +  m_operator = &op_cfn_isinf;
> +  break;
> +
>  CASE_CFN_COPYSIGN_ALL:
>m_op1 = gimple_call_arg (call, 0);
>m_op2 = gimple_call_arg (call, 1);
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/range-isinf.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/range-isinf.c
> new file mode 100644
> index 000..468f1bcf5c7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/range-isinf.c
> @@ -0,0 +1,44 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-evrp" } */
> +
> +#include 
> +void link_error();
> +
> +void
> +test1 (double x)
> +{
> +  if (x > __DBL_MAX__ && !__builtin_isinf (x))
> +link_error ();
> +  if (x < -__DBL_MAX__ && !__builtin_isinf (x))
> +link_error ();
> +}
> +
> +void
> +test2 (float x)
> +{
> +  if (x > __FLT_MAX__ && !__builtin_isinf (x))
> +link_error ();
> +  if (x < -__FLT_MAX__ && !__builtin_isinf (x))
> +link_error ();
> +}
> +
> +void
> +test3 (double x)
> +{
> +  if (!__builtin_isinf (x) && !__builtin_isnan (x) && x > __DBL_MAX__)
> +link_error ();
> +  if (!__builtin_isinf (x) && !__builtin_isnan (x) && x < -__DBL_MAX__)
> +link_error ();
> +}
> +
> +void
> +test4 (float x)
> +{
> +  if (!__builtin_isinf (x) && !__builtin_isnan (x) && x > __FLT_MAX__)
> +link_error ();
> +  if (!__builtin_isinf (x) && !__builtin_isnan (x) && x < -__FLT_MAX__)
> +link_error ();
> +}
> +
> +/* { dg-final { scan-tree-dump-not "link_error" "evrp" } } */
> +


Re: [PATCH] middle-end/114070 - folding breaking VEC_COND expansion

2024-06-19 Thread Richard Biener
On Wed, 19 Jun 2024, Andrew Pinski wrote:

> On Wed, Jun 19, 2024 at 7:44 AM Vaseeharan Vinayagamoorthy
>  wrote:
> >
> > Hi,
> >
> > I have found that this patch has introduced a regression in the 
> > arm-none-eabi toolchain for a testcase, which was previously passing:
> >
> > PASS->FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"
> >
> > The toolchain was built with:
> > Build = x86_64-none-linux-gnu
> > Host = x86_64-none-linux-gnu
> > Target = arm-none-eabi
> >
> > This is also affecting the gcc-13 and gcc-14 branches.
> > Could you please let me know the impact of this regression, and whether you 
> > plan to fix the regression?
> 
> See the thread starting at
> https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646587.html
> for information on the testcase regression and what needs to be done.
> I suspect this only now effects targets which don't have a vector
> modes enabled.

I think it should be OK for those.  The problematical ones are those
with no or partial vcond_mask support, but since we try to get
rid of vcond{,u,eq} that case should fix itself as well.

So no, I don't plan to fix anything besides pushing for the latter
to happen for GCC 15.

Richard.

> Note it is a (minor) missed optimization regression so the impact
> looks to be small.
> I am not sure if people have written code with this pattern, it
> requires vectors and it fails only on targets where there is no vector
> support enabled.
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95906
> 
> Thanks,
> Andrew Pinski
> 
> >
> >
> > Kind regards,
> > Vasee
> >
> > 
> > From: Richard Biener 
> > Sent: 26 February 2024 07:42
> > To: gcc-patches@gcc.gnu.org
> > Subject: [PATCH] middle-end/114070 - folding breaking VEC_COND expansion
> >
> > The following properly guards the simplifications that move
> > operations into VEC_CONDs, in particular when that changes the
> > type constraints on this operation.
> >
> > This needed a genmatch fix which was recording spurious implicit fors
> > when tcc_comparison is used in a C expression.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
> >
> > PR middle-end/114070
> > * genmatch.cc (parser::parse_c_expr): Do not record operand
> > lists but only mark operators used.
> > * match.pd ((c ? a : b) op (c ? d : e)  -->  c ? (a op d) : (b op 
> > e)):
> > Properly guard the case of tcc_comparison changing the VEC_COND
> > value operand type.
> >
> > * gcc.dg/torture/pr114070.c: New testcase.
> > ---
> >  gcc/genmatch.cc |  6 ++
> >  gcc/match.pd| 15 ---
> >  gcc/testsuite/gcc.dg/torture/pr114070.c | 12 
> >  3 files changed, 26 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/torture/pr114070.c
> >
> > diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> > index 375ae90ae6c..d9ae436ce5c 100644
> > --- a/gcc/genmatch.cc
> > +++ b/gcc/genmatch.cc
> > @@ -4760,10 +4760,8 @@ parser::parse_c_expr (cpp_ttype start)
> > = (const char *)CPP_HASHNODE (token->val.node.node)->ident.str;
> >   if (strcmp (str, "return") == 0)
> > fatal_at (token, "return statement not allowed in C 
> > expression");
> > - id_base *idb = get_operator (str);
> > - user_id *p;
> > - if (idb && (p = dyn_cast (idb)) && p->is_oper_list)
> > -   record_operlist (token->src_loc, p);
> > + /* Mark user operators corresponding to 'str' as used.  */
> > + get_operator (str);
> > }
> >
> >/* Record the token.  */
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index c5b6540f939..67007fc2017 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -5149,15 +5149,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >  /* (c ? a : b) op (c ? d : e)  -->  c ? (a op d) : (b op e) */
> >   (simplify
> >(op (vec_cond:s @0 @1 @2) (vec_cond:s @0 @3 @4))
> > -  (vec_cond @0 (op! @1 @3) (op! @2 @4)))
> > +  (if (TREE_CODE_CLASS (op) != tcc_comparison
> > +   || types_match (type, TREE_TYPE (@1))
> > +   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
> > +   (vec_cond @0 (op! @1 @3) (op! @2 @4
> >
> >  /* (c ? a : b) op d  -->  c ? (a op d) : (b op d) */
> >   (simplify
> >(op (vec_cond:s @0 @1 @2) @3)
> > -  (vec_cond @0 (op! @1 @3) (op! @2 @3)))
> > +  (if (TREE_CODE_CLASS (op) != tcc_comparison
> > +   || types_match (type, TREE_TYPE (@1))
> > +   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
> > +   (vec_cond @0 (op! @1 @3) (op! @2 @3
> >   (simplify
> >(op @3 (vec_cond:s @0 @1 @2))
> > -  (vec_cond @0 (op! @3 @1) (op! @3 @2
> > +  (if (TREE_CODE_CLASS (op) != tcc_comparison
> > +   || types_match (type, TREE_TYPE (@1))
> > +   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
> > +   (vec_cond @0 (op! @3 @1) (op! @3 @2)
> 

[PATCH-1v5] fwprop: Replace rtx_cost with insn_cost in try_fwprop_subst_pattern [PR113325]

2024-06-19 Thread HAO CHEN GUI
Hi,
  This patch replaces rtx_cost with insn_cost in forward propagation.
In the PR, one constant vector should be propagated and replace a
pseudo in a store insn if we know it's a duplicated constant vector.
It reduces the insn cost but not rtx cost. In this case, the cost is
determined by destination operand (memory or pseudo). Unfortunately,
rtx cost can't help.

  The test case is added in the second rs6000 specific patch.

  Compared to previous version, the main changes are:
1. Remove !single_set at checking likely_profitable_p. Add is_debug_insn
here, so that debug insn still need to be profitable.
2. Remove single_set check for cost comparison. Add !is_debug_insn here,
so that debug insn doesn't need to check the insn cost.

Previous version
https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654964.html

  The patch causes a regression case on i386 as the pattern cost
regulation has a bug. Please refer the patch and discussion here.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651363.html

  Bootstrapped and tested on powerpc64-linux BE and LE with no
regressions. Is it OK for the trunk?

Thanks
Gui Haochen

ChangeLog
fwprop: invoke change_is_worthwhile to judge if a replacement is worthwhile

gcc/
* fwprop.cc (try_fwprop_subst_pattern): Invoke change_is_worthwhile
to judge if a replacement is worthwhile.  Remove single_set check
and add is_debug_insn check.
* recog.cc (swap_change): Invalidate recog_data when the cached INSN
is swapped out.
* rtl-ssa/changes.cc (rtl_ssa::changes_are_worthwhile): Check if the
insn cost of new rtl is unknown and fail the replacement.

patch.diff
diff --git a/gcc/fwprop.cc b/gcc/fwprop.cc
index de543923b92..60e3ea31edc 100644
--- a/gcc/fwprop.cc
+++ b/gcc/fwprop.cc
@@ -453,7 +453,7 @@ try_fwprop_subst_pattern (obstack_watermark &attempt, 
insn_change &use_change,
   && (prop.changed_mem_p ()
  || contains_mem_rtx_p (src)
  || use_insn->is_asm ()
- || !single_set (use_rtl)))
+ || use_insn->is_debug_insn ()))
 {
   if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "cannot propagate from insn %d into"
@@ -471,29 +471,18 @@ try_fwprop_subst_pattern (obstack_watermark &attempt, 
insn_change &use_change,
   redo_changes (0);
 }

-  /* ??? In theory, it should be better to use insn costs rather than
- set_src_costs here.  That would involve replacing this code with
- change_is_worthwhile.  */
   bool ok = recog (attempt, use_change);
-  if (ok && !prop.changed_mem_p () && !use_insn->is_asm ())
-if (rtx use_set = single_set (use_rtl))
-  {
-   bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_rtl));
-   temporarily_undo_changes (0);
-   auto old_cost = set_src_cost (SET_SRC (use_set),
- GET_MODE (SET_DEST (use_set)), speed);
-   redo_changes (0);
-   auto new_cost = set_src_cost (SET_SRC (use_set),
- GET_MODE (SET_DEST (use_set)), speed);
-   if (new_cost > old_cost
-   || (new_cost == old_cost && !prop.likely_profitable_p ()))
- {
-   if (dump_file)
- fprintf (dump_file, "change not profitable"
-  " (cost %d -> cost %d)\n", old_cost, new_cost);
-   ok = false;
- }
-  }
+  if (ok && !prop.changed_mem_p () && !use_insn->is_asm ()
+  && !use_insn->is_debug_insn ())
+{
+  bool strict_p = !prop.likely_profitable_p ();
+  if (!change_is_worthwhile (use_change, strict_p))
+   {
+ if (dump_file)
+   fprintf (dump_file, "change not profitable");
+ ok = false;
+   }
+}

   if (!ok)
 {
diff --git a/gcc/recog.cc b/gcc/recog.cc
index a6799e3f5e6..56370e40e01 100644
--- a/gcc/recog.cc
+++ b/gcc/recog.cc
@@ -614,7 +614,11 @@ swap_change (int num)
   else
 std::swap (*changes[num].loc, changes[num].old);
   if (changes[num].object && !MEM_P (changes[num].object))
-std::swap (INSN_CODE (changes[num].object), changes[num].old_code);
+{
+  std::swap (INSN_CODE (changes[num].object), changes[num].old_code);
+  if (recog_data.insn == changes[num].object)
+   recog_data.insn = nullptr;
+}
 }

 /* Temporarily undo all the changes numbered NUM and up, with a view
diff --git a/gcc/rtl-ssa/changes.cc b/gcc/rtl-ssa/changes.cc
index 11639e81bb7..c5ac4956a19 100644
--- a/gcc/rtl-ssa/changes.cc
+++ b/gcc/rtl-ssa/changes.cc
@@ -186,6 +186,14 @@ rtl_ssa::changes_are_worthwhile (array_slice changes,
   if (!change->is_deletion ())
{
  change->new_cost = insn_cost (change->rtl (), for_speed);
+ /* If the cost is unknown, replacement is not worthwhile.  */
+ if (!change->new_cost)
+   {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+   fprintf (dump_file,
+"Reject replacement due to unknown insn cost.\n");

Re: [PATCH 8/8] vect: Optimize order of lane-reducing statements in loop def-use cycles

2024-06-19 Thread Feng Xue OS
This patch was updated with some new change.

When transforming multiple lane-reducing operations in a loop reduction chain,
originally, corresponding vectorized statements are generated into def-use
cycles starting from 0. The def-use cycle with smaller index, would contain
more statements, which means more instruction dependency. For example:

   int sum = 0;
   for (i)
 {
   sum += d0[i] * d1[i];  // dot-prod 
   sum += w[i];   // widen-sum 
   sum += abs(s0[i] - s1[i]); // sad 
 }

Original transformation result:

   for (i / 16)
 {
   sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
   sum_v1 = sum_v1;  // copy
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy

   sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
   sum_v1 = sum_v1;  // copy
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy

   sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
   sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy
 }

For a higher instruction parallelism in final vectorized loop, an optimal
means is to make those effective vectorized lane-reducing statements be
distributed evenly among all def-use cycles. Transformed as the below,
DOT_PROD, WIDEN_SUM and SADs are generated into disparate cycles,
instruction dependency could be eliminated.

   for (i / 16)
 {
   sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
   sum_v1 = sum_v1;  // copy
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy

   sum_v0 = sum_v0;  // copy
   sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy

   sum_v0 = sum_v0;  // copy
   sum_v1 = sum_v1;  // copy
   sum_v2 = SAD (s0_v2[i: 0 ~ 7 ], s1_v2[i: 0 ~ 7 ], sum_v2);
   sum_v3 = SAD (s0_v3[i: 8 ~ 15], s1_v3[i: 8 ~ 15], sum_v3);
 }

2024-03-22 Feng Xue 

gcc/
PR tree-optimization/114440
* tree-vectorizer.h (struct _stmt_vec_info): Add a new field
reduc_result_pos.
* tree-vect-loop.cc (vect_transform_reduction): Generate lane-reducing
statements in an optimized order.
---
 gcc/tree-vect-loop.cc | 43 +++
 gcc/tree-vectorizer.h |  6 ++
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5a27a2c3d9c..adee54350d4 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8821,9 +8821,9 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy

-  sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
-  sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
-  sum_v2 = sum_v2;  // copy
+  sum_v0 = sum_v0;  // copy
+  sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
+  sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
   sum_v3 = sum_v3;  // copy

   sum_v0 += n_v0[i: 0  ~ 3 ];
@@ -8831,7 +8831,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   sum_v2 += n_v2[i: 8  ~ 11];
   sum_v3 += n_v3[i: 12 ~ 15];
 }
-   */
+
+Moreover, for a higher instruction parallelism in final vectorized
+loop, it is considered to make those effective vectorized lane-
+reducing statements be distributed evenly among all def-use cycles.
+In the above example, SADs are generated into other cycles rather
+than that of DOT_PROD.  */
   tree phi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
   unsigned all_ncopies = vect_get_num_copies (loop_vinfo, phi_vectype_in);
   unsigned use_ncopies = vec_oprnds[0].length ();
@@ -8855,6 +8860,36 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
  gcc_assert (vec_oprnds[i].length () == use_ncopies);
  vec_oprnds[i].safe_grow_cleared (all_ncopies);
}
+
+ /* Find suitable def-use cycles to generate vectorized statements
+into, and reorder operands based on the selection.  */
+ unsigned curr_pos = reduc_info->reduc_result_pos;
+ unsigned next_pos = (curr_pos + use_ncopies) % all_ncopies;
+
+ gcc_assert (curr_pos < all_ncopies);
+  reduc_info->reduc_result_pos = next_pos;
+
+ if (curr_pos)
+   {
+ unsigned count = all_ncopies - use_ncopies;
+ unsigned start = curr_pos - count;
+
+ if ((int) start < 0)
+   {
+ count = curr_pos;
+ start = 0;
+   }
+
+ for (unsigned i = 0; i < op.num_ops - 1; i++)
+   {
+ for (unsigned j = use_ncopies; j > start; j--)
+   {
+   

Re: [PATCH 7/8] vect: Support multiple lane-reducing operations for loop reduction [PR114440]

2024-06-19 Thread Feng Xue OS
Updated the patch to some new changes.


For a lane-reducing operation (dot-prod/widen-sum/sad) in loop reduction, the
current vectorizer can only handle the pattern if the reduction chain does not
contain any other operation, no matter whether that operation is normal or lane-reducing.

Actually, to allow multiple arbitrary lane-reducing operations, we need to
support vectorization of a loop reduction chain with mixed input vectypes. Since
the number of lanes of a vectype may vary with the operation, the effective
ncopies of the vectorized statements for each operation may also differ, which
causes a mismatch in the vectorized def-use cycles. A simple way is to align all
operations with the one that has the most ncopies; the gap can be filled by
generating extra trivial pass-through copies. For example:

   int sum = 0;
   for (i)
 {
   sum += d0[i] * d1[i];  // dot-prod 
   sum += w[i];   // widen-sum 
   sum += abs(s0[i] - s1[i]); // sad 
   sum += n[i];   // normal 
 }

The vector size is 128-bit and the vectorization factor is 16. Reduction statements
would be transformed as:

   vector<4> int sum_v0 = { 0, 0, 0, 0 };
   vector<4> int sum_v1 = { 0, 0, 0, 0 };
   vector<4> int sum_v2 = { 0, 0, 0, 0 };
   vector<4> int sum_v3 = { 0, 0, 0, 0 };

   for (i / 16)
 {
   sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
   sum_v1 = sum_v1;  // copy
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy

   sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
   sum_v1 = sum_v1;  // copy
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy

   sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
   sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
   sum_v2 = sum_v2;  // copy
   sum_v3 = sum_v3;  // copy

   sum_v0 += n_v0[i: 0  ~ 3 ];
   sum_v1 += n_v1[i: 4  ~ 7 ];
   sum_v2 += n_v2[i: 8  ~ 11];
   sum_v3 += n_v3[i: 12 ~ 15];
 }

2024-03-22 Feng Xue 

gcc/
PR tree-optimization/114440
* tree-vectorizer.h (vectorizable_lane_reducing): New function
declaration.
* tree-vect-stmts.cc (vect_analyze_stmt): Call new function
vectorizable_lane_reducing to analyze lane-reducing operation.
* tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
code related to emulated_mixed_dot_prod.
(vect_reduction_update_partial_vector_usage): Compute ncopies as the
original means for single-lane slp node.
(vectorizable_lane_reducing): New function.
(vectorizable_reduction): Allow multiple lane-reducing operations in
loop reduction. Move some original lane-reducing related code to
vectorizable_lane_reducing.
(vect_transform_reduction): Extend transformation to support reduction
statements with mixed input vectypes.

gcc/testsuite/
PR tree-optimization/114440
* gcc.dg/vect/vect-reduc-chain-1.c
* gcc.dg/vect/vect-reduc-chain-2.c
* gcc.dg/vect/vect-reduc-chain-3.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
* gcc.dg/vect/vect-reduc-dot-slp-1.c
---
 .../gcc.dg/vect/vect-reduc-chain-1.c  |  62 
 .../gcc.dg/vect/vect-reduc-chain-2.c  |  77 
 .../gcc.dg/vect/vect-reduc-chain-3.c  |  66 
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-1.c  |  95 +
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-2.c  |  67 
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-3.c  |  79 +
 .../gcc.dg/vect/vect-reduc-chain-dot-slp-4.c  |  63 
 .../gcc.dg/vect/vect-reduc-dot-slp-1.c|  60 
 gcc/tree-vect-loop.cc | 334 ++
 gcc/tree-vect-stmts.cc|   2 +
 gcc/tree-vectorizer.h |   2 +
 11 files changed, 834 insertions(+), 73 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
new file mode 100644
index 000..04bfc419dbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-e

Re: [PATCH] i386: Fix some ISA bit test in option_override

2024-06-19 Thread Uros Bizjak
On Thu, Jun 20, 2024 at 3:16 AM Hongyu Wang  wrote:
>
> Hi,
>
> This patch adjusts several new feature checks in ix86_option_override_internal
> that directly use TARGET_* instead of TARGET_*_P (opts->ix86_isa_flags),
> which caused cmdline option overrides target_attribute isa flag.
>
> Bootstrapped && regtested on x86_64-pc-linux-gnu.
>
> Ok for trunk?
>
> gcc/ChangeLog:
>
> * config/i386/i386-options.cc (ix86_option_override_internal):
> Use TARGET_*_P (opts->x_ix86_isa_flags*) instead of TARGET_*
> for UINTR, LAM and APX_F.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/apx-ccmp-2.c: Remove -mno-apxf in option.
> * gcc.target/i386/funcspec-56.inc: Drop uintr tests.
> * gcc.target/i386/funcspec-6.c: Add uintr tests.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-options.cc   | 14 +-
>  gcc/testsuite/gcc.target/i386/apx-ccmp-2.c|  2 +-
>  gcc/testsuite/gcc.target/i386/funcspec-56.inc |  2 --
>  gcc/testsuite/gcc.target/i386/funcspec-6.c|  2 ++
>  4 files changed, 12 insertions(+), 8 deletions(-)
>
> diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
> index f2cecc0e254..34adedb3127 100644
> --- a/gcc/config/i386/i386-options.cc
> +++ b/gcc/config/i386/i386-options.cc
> @@ -2113,15 +2113,18 @@ ix86_option_override_internal (bool main_args_p,
>opts->x_ix86_stringop_alg = no_stringop;
>  }
>
> -  if (TARGET_APX_F && !TARGET_64BIT)
> +  if (TARGET_APX_F_P (opts->x_ix86_isa_flags2)
> +  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
>  error ("%<-mapxf%> is not supported for 32-bit code");
> -  else if (opts->x_ix86_apx_features != apx_none && !TARGET_64BIT)
> +  else if (opts->x_ix86_apx_features != apx_none
> +  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
>  error ("%<-mapx-features=%> option is not supported for 32-bit code");
>
> -  if (TARGET_UINTR && !TARGET_64BIT)
> +  if (TARGET_UINTR_P (opts->x_ix86_isa_flags2)
> +  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
>  error ("%<-muintr%> not supported for 32-bit code");
>
> -  if (ix86_lam_type && !TARGET_LP64)
> +  if (ix86_lam_type && !TARGET_LP64_P (opts->x_ix86_isa_flags))
>  error ("%<-mlam=%> option: [u48|u57] not supported for 32-bit code");
>
>if (!opts->x_ix86_arch_string)
> @@ -2502,7 +2505,8 @@ ix86_option_override_internal (bool main_args_p,
>init_machine_status = ix86_init_machine_status;
>
>/* Override APX flag here if ISA bit is set.  */
> -  if (TARGET_APX_F && !OPTION_SET_P (ix86_apx_features))
> +  if (TARGET_APX_F_P (opts->x_ix86_isa_flags2)
> +  && !OPTION_SET_P (ix86_apx_features))
>  opts->x_ix86_apx_features = apx_all;
>
>/* Validate -mregparm= value.  */
> diff --git a/gcc/testsuite/gcc.target/i386/apx-ccmp-2.c 
> b/gcc/testsuite/gcc.target/i386/apx-ccmp-2.c
> index 4a0784394c3..192c0458728 100644
> --- a/gcc/testsuite/gcc.target/i386/apx-ccmp-2.c
> +++ b/gcc/testsuite/gcc.target/i386/apx-ccmp-2.c
> @@ -1,6 +1,6 @@
>  /* { dg-do run { target { ! ia32 } } } */
>  /* { dg-require-effective-target apxf } */
> -/* { dg-options "-O3 -mno-apxf" } */
> +/* { dg-options "-O3" } */
>
>  __attribute__((noinline, noclone, target("apxf")))
>  int foo_apx(int a, int b, int c, int d)
> diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc 
> b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
> index 2a50f5bf67c..8825e88768a 100644
> --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc
> +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
> @@ -69,7 +69,6 @@ extern void test_avx512vp2intersect (void)
> __attribute__((__target__("avx512vp2i
>  extern void test_amx_tile (void)   
> __attribute__((__target__("amx-tile")));
>  extern void test_amx_int8 (void)   
> __attribute__((__target__("amx-int8")));
>  extern void test_amx_bf16 (void)   
> __attribute__((__target__("amx-bf16")));
> -extern void test_uintr (void)  
> __attribute__((__target__("uintr")));
>  extern void test_hreset (void) 
> __attribute__((__target__("hreset")));
>  extern void test_keylocker (void)  
> __attribute__((__target__("kl")));
>  extern void test_widekl (void) 
> __attribute__((__target__("widekl")));
> @@ -158,7 +157,6 @@ extern void test_no_avx512vp2intersect (void)   
> __attribute__((__target__("no-avx5
>  extern void test_no_amx_tile (void)
> __attribute__((__target__("no-amx-tile")));
>  extern void test_no_amx_int8 (void)
> __attribute__((__target__("no-amx-int8")));
>  extern void test_no_amx_bf16 (void)
> __attribute__((__target__("no-amx-bf16")));
> -extern void test_no_uintr (void)   
> __attribute__((__target__("no-uintr")));
>  extern void test_no_hreset (void)  
> __attribute__((__target__("no-hreset")));
>  extern void test_no_keylocker (void)   
> __attribute__((__target__("no-kl")));
>  extern void test_no

Re: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing

2024-06-19 Thread Feng Xue OS
>> + if (lane_reducing_op_p (op.code))
>> +   {
>> + unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
>> + tree op_type = TREE_TYPE (op.ops[0]);
>> + tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
>> +op_type,
>> +group_size);
> 
> I think doing it this way does not adhere to the vector type size constraint
> with loop vectorization.  You should use vect_is_simple_use like the
> original code did as the actual vector definition determines the vector type
> used.

OK, though this might be wordy. 

Actually, STMT_VINFO_REDUC_VECTYPE_IN is logically equivalent to nunits_vectype
that is determined in vect_determine_vf_for_stmt_1(). So how about setting the 
type
in this function?

> 
> You are always using op.ops[0] here - I think that works because
> reduc_idx is the last operand of all lane-reducing ops.  But then
> we should assert reduc_idx != 0 here and add a comment.

Already added in the following assertion.

>> +
>> + /* The last operand of lane-reducing operation is for
>> +reduction.  */
>> + gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 
>> 1);

 ^^
>> +
>> + /* For lane-reducing operation vectorizable analysis needs the
>> +reduction PHI information */
>> + STMT_VINFO_REDUC_DEF (def) = phi_info;
>> +
>> + if (!new_vectype_in)
>> +   return false;
>> +
>> + /* Each lane-reducing operation has its own input vectype, 
>> while
>> +reduction PHI will record the input vectype with the least
>> +lanes.  */
>> + STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
>> +
>> + /* To accommodate lane-reducing operations of mixed input
>> +vectypes, choose input vectype with the least lanes for the
>> +reduction PHI statement, which would result in the most
>> +ncopies for vectorized reduction results.  */
>> + if (!vectype_in
>> + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE 
>> (vectype_in)))
>> +  < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type
>> +   vectype_in = new_vectype_in;
> 
> I know this is a fragile area but I always wonder since the accumulating 
> operand
> is the largest (all lane-reducing ops are widening), and that will be
> equal to the
> type of the PHI node, how this condition can be ever true.

In the original code, the accumulating operand is skipped! While that is correct —
we should not count that operand — this is why we call the operation lane-reducing.

> 
> ncopies is determined by the VF, so the comment is at least misleading.
> 
>> +   }
>> + else
>> +   vectype_in = STMT_VINFO_VECTYPE (phi_info);
> 
> Please initialize vectype_in from phi_info before the loop (that
> should never be NULL).
> 

Maybe not, as per the explanation below. 

> I'll note that with your patch it seems we'd initialize vectype_in to
> the biggest
> non-accumulation vector type involved in lane-reducing ops but the 
> accumulating
> type might still be larger.   Why, when we have multiple lane-reducing
> ops, would
> we chose the largest input here?  I see we eventually do
> 
>   if (slp_node)
> ncopies = 1;
>   else
> ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> 
> but then IIRC we always force a single cycle def for lane-reducing ops(?).


> In particular for vect_transform_reduction and SLP we rely on
> SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
> STMT_VINFO_REDUC_VECTYPE_IN.
> 
> So I wonder what breaks when we set vectype_in = vector type of PHI?
> 

Yes. It is right, nothing is broken. Suppose that a loop contains three 
dot_prods,
two are <16 * char>, one is <8 * short>, and choose <4 * int> as vectype_in:

With the patch #7, we get:

  vector<4> int sum_v0 = { 0, 0, 0, 0 };
  vector<4> int sum_v1 = { 0, 0, 0, 0 };
  vector<4> int sum_v2 = { 0, 0, 0, 0 };
  vector<4> int sum_v3 = { 0, 0, 0, 0 };

  loop () {
 sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);
 
 sum_v0 = dot_prod<16 * char>(char_b0, char_b1, sum_v0);

 sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
 sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);

 sum_v2 = sum_v2;
 sum_v3 = sum_v3;
  }

The def/use cycles (sum_v2 and sum_v3> would be optimized away finally.
Then this gets same result as setting vectype_in to <8 * short>.

With the patch #8, we get:

  vector<4> int sum_v0 = { 0, 0, 0, 0 };
  vector<4> int sum_v1 = { 0, 0, 0, 0 };
  vector<4> int sum_v2 = { 0, 0, 0, 0 };
  vector<4> int sum_v3 = { 0, 0, 0, 0 };

  lo

Re: [PATCH] MIPS: Use Reg0 instead of const0_rtx for TRAP

2024-06-19 Thread YunQiang Su
YunQiang Su  于2024年6月20日周四 11:20写道:
>
> Maciej W. Rozycki  于2024年6月20日周四 01:24写道:
> >
> > On Wed, 19 Jun 2024, YunQiang Su wrote:
> >
> > > MIPSr6 removes condition trap instructions with imm, so the instruction
> > > like `teq $2,imm` will be converted to
> > >   li $at, imm
> > >   teq $2, $at
> > >
> > > The current version of Gas cannot detect if imm is zero, and output
> > >   teq $2, $0
> > > Let's do it in GCC.
> >
> >  It seems like an output pattern issue with `*conditional_trap_reg'
> > insn to me.
> >
>
> Yes. You are right. We should update `*conditional_trap_reg'.
>
> > > diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc
> > > index 48924116937..ba1e6214656 100644
> > > --- a/gcc/config/mips/mips.cc
> > > +++ b/gcc/config/mips/mips.cc
> > > @@ -6026,7 +6026,7 @@ mips_expand_conditional_trap (rtx comparison)
> > >
> > >emit_insn (gen_rtx_TRAP_IF (VOIDmode,
> > > gen_rtx_fmt_ee (code, mode, op0, op1),
> > > -   const0_rtx));
> > > +   gen_rtx_REG (mode, GP_REG_FIRST)));
> >
> >  IOW this just papers over the actual issue.
> >
>
> I think that we still need it, as it will make the RTL easier to
> understand.
> I think that we should keep surprises in the RTL to a minimum.
>

Ohh, you are right. It seems some RTL optimization passes prefer const0_rtx
much more. It is not easy to use REG0 here.

> >  FWIW,
> >
> >   Maciej


[PATCH v2] MIPS: Output $0 for conditional trap if !ISA_HAS_COND_TRAPI

2024-06-19 Thread YunQiang Su
MIPSr6 removes condition trap instructions with imm, so the instruction
like `teq $2,imm` will be converted to
  li $at, imm
  teq $2, $at

The current version of Gas cannot detect if imm is zero, and output
  teq $2, $0
Let's do it in GCC.

gcc
* config/mips/mips.md(conditional_trap_reg): Output $0 instead
of 0 if !ISA_HAS_COND_TRAPI.
---
 gcc/config/mips/mips.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
index 9962313602a..fd64d3d001a 100644
--- a/gcc/config/mips/mips.md
+++ b/gcc/config/mips/mips.md
@@ -1245,7 +1245,7 @@ (define_insn "*conditional_trap_reg"
 (match_operand:GPR 2 "reg_or_0_operand" "dJ")])
(const_int 0))]
   "ISA_HAS_COND_TRAP && !ISA_HAS_COND_TRAPI"
-  "t%C0\t%z1,%2"
+  "t%C0\t%z1,%z2"
   [(set_attr "type" "trap")])
 
 (define_insn "*conditional_trap"
-- 
2.39.3 (Apple Git-146)



Re: [PATCH 00/11] AArch64/OpenMP: Test SVE ACLE types with various OpenMP constructs.

2024-06-19 Thread Tejas Belagod

PING for the series.

Thanks,
Tejas.

On 5/27/24 10:36 AM, Tejas Belagod wrote:

Note: This patch series is based on Richard's initial patch
   https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606741.html
and Jakub's suggestion
   https://gcc.gnu.org/pipermail/gcc-patches/2023-February/611892.html

The following patch series handles various scenarios with OpenMP and SVE types.
The starting point for the series follows a suggestion from Jakub to cover all
the possible scenarios that could arise when OMP constructs/clauses etc are
used with SVE ACLE types. Here are a few instances that this patch series tests
and in some cases fixes the expected output.  This patch series does not follow
a formal definition or a spec of how OMP interacts with SVE ACLE types, so it's
more of a proposed behaviour.  Comments and discussion welcome.

This list is not exhaustive, but covers most scenarios of how SVE ACLE types
ought to interact with OMP constructs/clauses.

1. Poly-int structures that represent variable-sized objects and OMP runtime.

Currently poly-int type structures are passed by value to OpenMP runtime
functions for shared clauses etc.  This patch improves on this by passing
around poly-int structures by address to avoid copy-overhead.

2. SVE ACLE types in OMP Shared clauses.

We test the behaviour where SVE ACLE type objects are shared in the following
methods into an OMP region:
   a. Explicit Shared clause on SVE ACLE type objects.
   b. Implicit shared clause.
   c. Implicit shared with default clause.
   d. SVE ALCE types in the presence of predetermined (static) shared objects.

The associated tests ensure that all such shared objects are passed by address
into the OMP runtime.  There are runtime tests to verify the functional
correctness of the change.

3. Offloading and SVE ACLE types.

The target clause in OpenMP is used to offload loop kernels to accelerator
peripherals.  target's 'map' clause is used to move data from and to the
accelerator.  When the data is SVE type, it may not be suitable because of
various reasons i.e. the two SVE targets may not agree on vector size or
some targets don't support variable vector size.  This makes SVE unsuitable
for use in OMP's 'map' clause.  We diagnose all such cases and issue errors
where appropriate.  The cases we cover in this patch are:

   a. Implicitly-mapped SVE ACLE types in OMP target regions are diagnosed.
   b. Explicitly-mapped SVE ACLE types in OMP target regions using map clause
  are diagnosed.
   c. Explicitly-mapped SVE ACLE types of various directions - to, from, tofrom
  in the map clause are diagnosed.
   d. target enter and exit data clauses with map on SVE ACLE types are
  diagnosed.
   e. target data map with alloc on SVE ACLE types are diagnosed.
   f. target update from clause on SVE ACLE types are diagnosed.
   g. target private firstprivate with SVE ACLE types are diagnosed.
   h. All combinations of target with work-sharing constructs like parallel,
  loop, simd, teams, distribute etc are also diagnosed when SVE ACLE types
  are involved.

3. Lastprivate and SVE ACLE types.

Various OpenMP lastprivate clause scenarios with SVE object types are
diagnosed.  Worksharing constructs like sections, for, distribute bind to an
implicit outer parallel region in whose scope SVE ACLE types are declared and
are therefore default private.  The lastprivate clause list with SVE ACLE type
object items are diagnosed in this scenario.

4. Threadprivate on SVE ACLE type objects.

We ensure threadprivate SVE ACLE type objects are supported. We also ensure
copyin clause is also supported.

5. User-Defined Reductions on SVE ACLE types.

We define a reduction using OMP declare reduction using SVE ACLE intrinsics and
ensure its functional correctness with various work-sharing constructs like
for, simd, parallel, task, taskloop.

6. Uniform and Aligned Clause with SVE ACLE

We ensure the uniform clause's functional correctness with simd construct and
associated SVE ACLE intrinsics in the simd region.  There is no direct
interaction between uniform and SVE ACLE type objects, but we ensure the uniform
clause applies correctly to a region where SVE ACLE intrinsics are present.
Similarly for the aligned clause.

7. Linear clause and SVE ACLE type.

We diagnose if a linear clause list item has SVE ACLE type objects present.
It doesn't mean much if the linear clause is applied to SVE ACLE types.

8. Depend clause and SVE ACLE objects.

We test for functional correctness many combinations of dependency of shared
SVE ACLE type objects in parallel regions.  We test if in, out dependencies and
anti-dependencies are supported for SVE ACLE type objects using the depend
clause with work-sharing constructs like task.

9. 'doacross' clause and SVE ACLE object types.

doacross is mainly supported for scalars and loop iteration variables.  We
diagnose cases where SVE ACLE objects are used in doacross list items.

Tejas Belagod (11):
   OpenM

Re: [committed] [RISC-V] Fix wrong patch application

2024-06-19 Thread Christoph Müllner
Hi Jeff,

the test should probably also be skipped on -Oz:

=== gcc: Unexpected fails for rv64imafdc lp64d medlow  ===
FAIL: gcc.target/riscv/zbs-ext-2.c  -Oz   scan-assembler-times andi\t 1
FAIL: gcc.target/riscv/zbs-ext-2.c  -Oz   scan-assembler-times andn\t 1
FAIL: gcc.target/riscv/zbs-ext-2.c  -Oz   scan-assembler-times li\t 1

BR
Christoph

On Tue, Jun 18, 2024 at 8:14 PM Jeff Law  wrote:
>
>
> Applied the wrong patch which didn't have the final testsuite adjustment
> to skip -Os on the new test.  Fixed thusly.
>
> Pushed to the trunk.
>
> Jeff
>


[PATCH v2] RISC-V: Remove integer vector eqne pattern

2024-06-19 Thread demin.han
We can unify eqne and other comparison operations.

Tested on RV32 and RV64.

gcc/ChangeLog:

* config/riscv/predicates.md (comparison_except_eqge_operator): Only
  exclude ge
(comparison_except_ge_operator): Ditto
* config/riscv/riscv-string.cc (expand_rawmemchr): Use cmp pattern
(expand_strcmp): Ditto
* config/riscv/riscv-vector-builtins-bases.cc: Remove eqne cond
* config/riscv/vector.md (@pred_eqne_scalar): Remove eqne
  patterns
(*pred_eqne_scalar_merge_tie_mask): Ditto
(*pred_eqne_scalar): Ditto
(*pred_eqne_scalar_narrow): Ditto
(*pred_eqne_extended_scalar_merge_tie_mask): Ditto
(*pred_eqne_extended_scalar): Ditto
(*pred_eqne_extended_scalar_narrow): Ditto

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/integer-cmp-eqne.c: New test.

Signed-off-by: demin.han 
---
v2 changes:
  1. add test

 gcc/config/riscv/predicates.md|   4 +-
 gcc/config/riscv/riscv-string.cc  |   4 +-
 .../riscv/riscv-vector-builtins-bases.cc  |   3 -
 gcc/config/riscv/vector.md| 279 +-
 .../riscv/rvv/base/integer-cmp-eqne.c |  66 +
 5 files changed, 81 insertions(+), 275 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/integer-cmp-eqne.c

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 0fb5729fdcf..9971fabc587 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -568,8 +568,8 @@ (define_predicate "ltge_operator"
 (define_predicate "comparison_except_ltge_operator"
   (match_code "eq,ne,le,leu,gt,gtu"))
 
-(define_predicate "comparison_except_eqge_operator"
-  (match_code "le,leu,gt,gtu,lt,ltu"))
+(define_predicate "comparison_except_ge_operator"
+  (match_code "eq,ne,le,leu,gt,gtu,lt,ltu"))
 
 (define_predicate "ge_operator"
   (match_code "ge,geu"))
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 83e7afbd693..4702001bd9b 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1342,7 +1342,7 @@ expand_rawmemchr (machine_mode mode, rtx dst, rtx 
haystack, rtx needle,
   /* Compare needle with haystack and store in a mask.  */
   rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, needle), 
vec);
   rtx vmsops[] = {mask, eq, vec, needle};
-  emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode),
+  emit_nonvlmax_insn (code_for_pred_cmp_scalar (vmode),
  riscv_vector::COMPARE_OP, vmsops, cnt);
 
   /* Find the first bit in the mask.  */
@@ -1468,7 +1468,7 @@ expand_strcmp (rtx result, rtx src1, rtx src2, rtx nbytes,
 = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, CONST0_RTX 
(mode)),
  vec1);
   rtx vmsops1[] = {mask0, eq0, vec1, CONST0_RTX (mode)};
-  emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode),
+  emit_nonvlmax_insn (code_for_pred_cmp_scalar (vmode),
  riscv_vector::COMPARE_OP, vmsops1, cnt);
 
   /* Look for vec1 != vec2 (includes vec2[i] == 0).  */
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 596b88cc8a3..6483faba39c 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -718,9 +718,6 @@ public:
  if (CODE == GE || CODE == GEU)
return e.use_compare_insn (CODE, code_for_pred_ge_scalar (
   e.vector_mode ()));
- else if (CODE == EQ || CODE == NE)
-   return e.use_compare_insn (CODE, code_for_pred_eqne_scalar (
-  e.vector_mode ()));
  else
return e.use_compare_insn (CODE, code_for_pred_cmp_scalar (
   e.vector_mode ()));
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index f8fae6557d9..fe18ee5b5f7 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -4704,7 +4704,7 @@ (define_expand "@pred_cmp_scalar"
 (match_operand 8 "const_int_operand")
 (reg:SI VL_REGNUM)
 (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (match_operator: 3 "comparison_except_eqge_operator"
+ (match_operator: 3 "comparison_except_ge_operator"
 [(match_operand:V_VLSI_QHS 4 "register_operand")
  (vec_duplicate:V_VLSI_QHS
(match_operand: 5 "register_operand"))])
@@ -4722,7 +4722,7 @@ (define_insn "*pred_cmp_scalar_merge_tie_mask"
 (match_operand 7 "const_int_operand"  "  i")
 (reg:SI VL_REGNUM)
 (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (match_operator: 2 "comparison_except_eqge_operator"
+ (match_operator: 2 "comparison_except_ge_operator"
 [(match_operand:V_VLSI_QHS 3 "register_operand"   

Re: [PATCH] MIPS: Use Reg0 instead of const0_rtx for TRAP

2024-06-19 Thread YunQiang Su
Maciej W. Rozycki  于2024年6月20日周四 01:24写道:
>
> On Wed, 19 Jun 2024, YunQiang Su wrote:
>
> > MIPSr6 removes condition trap instructions with imm, so the instruction
> > like `teq $2,imm` will be converted to
> >   li $at, imm
> >   teq $2, $at
> >
> > The current version of Gas cannot detect if imm is zero, and output
> >   teq $2, $0
> > Let's do it in GCC.
>
>  It seems like an output pattern issue with `*conditional_trap_reg'
> insn to me.
>

Yes. You are right. We should update `*conditional_trap_reg'.

> > diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc
> > index 48924116937..ba1e6214656 100644
> > --- a/gcc/config/mips/mips.cc
> > +++ b/gcc/config/mips/mips.cc
> > @@ -6026,7 +6026,7 @@ mips_expand_conditional_trap (rtx comparison)
> >
> >emit_insn (gen_rtx_TRAP_IF (VOIDmode,
> > gen_rtx_fmt_ee (code, mode, op0, op1),
> > -   const0_rtx));
> > +   gen_rtx_REG (mode, GP_REG_FIRST)));
>
>  IOW this just papers over the actual issue.
>

I think that we still need it, as it will make the RTL easier to understand.
I think that we should keep surprises in the RTL to a minimum.

>  FWIW,
>
>   Maciej


Re: [PATCH] build: Fix missing variable quotes and typo

2024-06-19 Thread YunQiang Su
Collin Funk  于2024年6月20日周四 07:40写道:
>
> I've just fixed the quotes and that typo in one patch.  I hope you don't
> mind.  When using Autoconf 2.69 and Automake 1.15.1 that copyright diff
> goes away.  I'm not familiar with the gcc-autoregen bot but I think this
> should make it happy.
>
> -- >8 --
>
> When dlopen and pthread_create are in libc the variable is
> set to "none required", therefore running configure will show
> the following errors:
>
> ./configure: line 8997: test: too many arguments
> ./configure: line 8999: test: too many arguments
> ./configure: line 9003: test: too many arguments
> ./configure: line 9005: test: =: unary operator expected
>
> ChangeLog:
>
> PR bootstrap/115453
> * configure.ac: Quote variable result of AC_SEARCH_LIBS.  Fix
> typo ac_cv_search_pthread_crate.
> * configure: Regenerate.
>
> Signed-off-by: Collin Funk 
> ---

I committed it. And if you are using git format-patch, you can add
-V2/-V3/-V4 option if you are resending an updated patch.


[PATCH] Build: Set gcc_cv_as_mips_explicit_relocs if gcc_cv_as_mips_explicit_relocs_pcrel

2024-06-19 Thread YunQiang Su
We check gcc_cv_as_mips_explicit_relocs if gcc_cv_as_mips_explicit_relocs_pcrel
only, while gcc_cv_as_mips_explicit_relocs is used by later code.

Maybe it is time for us to set gcc_cv_as_mips_explicit_relocs always now,
as it has been in Binutils for more than 20 years.

gcc
* configure.ac: Set gcc_cv_as_mips_explicit_relocs if
gcc_cv_as_mips_explicit_relocs_pcrel.
* configure: Regenerate.
---
 gcc/configure| 2 ++
 gcc/configure.ac | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/gcc/configure b/gcc/configure
index 9dc0b65dfaa..ad998105da3 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -30278,6 +30278,8 @@ $as_echo "#define MIPS_EXPLICIT_RELOCS 
MIPS_EXPLICIT_RELOCS_BASE" >>confdefs.h
 
 fi
 
+else
+  gcc_cv_as_mips_explicit_relocs=yes
 fi
 
 if test x$gcc_cv_as_mips_explicit_relocs = xno; then \
diff --git a/gcc/configure.ac b/gcc/configure.ac
index b2243e9954a..c51d3ca5f1b 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -5255,6 +5255,8 @@ LCF0:
 [  lw $4,%gp_rel(foo)($4)],,
   [AC_DEFINE(MIPS_EXPLICIT_RELOCS, MIPS_EXPLICIT_RELOCS_BASE,
 [Define if assembler supports %reloc.])])
+else
+  gcc_cv_as_mips_explicit_relocs=yes
 fi
 
 if test x$gcc_cv_as_mips_explicit_relocs = xno; then \
-- 
2.39.3 (Apple Git-146)



[PATCH] RISC-V: Add dg-remove-option

2024-06-19 Thread Patrick O'Neill
This introduces testsuite support infra for removing extensions.
Since z* extensions don't have ordering requirements the logic for
adding/removing those extensions has also been consolidated.

This fixes RVWMO compile testcases failing on Ztso targets by removing
the extension from the -march string.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/amo/amo-table-a-6-amo-add-1.c: Add dg-remove-options
for ztso.
* gcc.target/riscv/amo/amo-table-a-6-amo-add-2.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-amo-add-3.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-amo-add-4.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-amo-add-5.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-compare-exchange-1.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-compare-exchange-2.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-compare-exchange-3.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-compare-exchange-4.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-compare-exchange-5.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-compare-exchange-6.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-compare-exchange-7.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-fence-1.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-fence-2.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-fence-3.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-fence-4.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-fence-5.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-load-1.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-load-2.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-load-3.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-store-1.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-store-2.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-store-compat-3.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-subword-amo-add-1.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-subword-amo-add-2.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-subword-amo-add-3.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-subword-amo-add-4.c: Ditto.
* gcc.target/riscv/amo/amo-table-a-6-subword-amo-add-5.c: Ditto.
* gcc.target/riscv/amo/amo-zalrsc-amo-add-1.c: Replace manually
specified -march string with dg-remove-options zaamo/ztso.
* gcc.target/riscv/amo/amo-zalrsc-amo-add-2.c: Ditto.
* gcc.target/riscv/amo/amo-zalrsc-amo-add-3.c: Ditto.
* gcc.target/riscv/amo/amo-zalrsc-amo-add-4.c: Ditto.
* gcc.target/riscv/amo/amo-zalrsc-amo-add-5.c: Ditto.
* lib/target-supports-dg.exp: Add dg-remove-options.
* lib/target-supports.exp: Add dg-remove-options and consolidate z*
extension add/remove-option code.

Signed-off-by: Patrick O'Neill 
---
Tested using rv64gcv_ztso but relying on precommit to run the targets
there.

Beyond testing Ztso/Zalrsc this is also helpful for the Zabha patch I'm
working on. We can continue to test the atomic subword emulation
routines without specifing a -march string.
---
 .../riscv/amo/amo-table-a-6-amo-add-1.c   |   1 +
 .../riscv/amo/amo-table-a-6-amo-add-2.c   |   1 +
 .../riscv/amo/amo-table-a-6-amo-add-3.c   |   1 +
 .../riscv/amo/amo-table-a-6-amo-add-4.c   |   1 +
 .../riscv/amo/amo-table-a-6-amo-add-5.c   |   1 +
 .../amo/amo-table-a-6-compare-exchange-1.c|   1 +
 .../amo/amo-table-a-6-compare-exchange-2.c|   1 +
 .../amo/amo-table-a-6-compare-exchange-3.c|   1 +
 .../amo/amo-table-a-6-compare-exchange-4.c|   1 +
 .../amo/amo-table-a-6-compare-exchange-5.c|   1 +
 .../amo/amo-table-a-6-compare-exchange-6.c|   1 +
 .../amo/amo-table-a-6-compare-exchange-7.c|   1 +
 .../riscv/amo/amo-table-a-6-fence-1.c |   1 +
 .../riscv/amo/amo-table-a-6-fence-2.c |   1 +
 .../riscv/amo/amo-table-a-6-fence-3.c |   1 +
 .../riscv/amo/amo-table-a-6-fence-4.c |   1 +
 .../riscv/amo/amo-table-a-6-fence-5.c |   1 +
 .../riscv/amo/amo-table-a-6-load-1.c  |   1 +
 .../riscv/amo/amo-table-a-6-load-2.c  |   1 +
 .../riscv/amo/amo-table-a-6-load-3.c  |   1 +
 .../riscv/amo/amo-table-a-6-store-1.c |   1 +
 .../riscv/amo/amo-table-a-6-store-2.c |   1 +
 .../riscv/amo/amo-table-a-6-store-compat-3.c  |   1 +
 .../amo/amo-table-a-6-subword-amo-add-1.c |   1 +
 .../amo/amo-table-a-6-subword-amo-add-2.c |   1 +
 .../amo/amo-table-a-6-subword-amo-add-3.c |   1 +
 .../amo/amo-table-a-6-subword-amo-add-4.c |   1 +
 .../amo/amo-table-a-6-subword-amo-add-5.c |   1 +
 .../riscv/amo/amo-zalrsc-amo-add-1.c  |   4 +-
 .../riscv/amo/amo-zalrsc-amo-add-2.c  |   4 +-
 .../riscv/amo/amo-zalrsc-amo-add-3.c  |   4 +-
 .../riscv/amo/amo-zalrsc-amo-add-4.c  |   4 +-
 .../riscv/amo/amo-zalrsc-amo-add-5.c  |   4 +-
 gcc/testsuite/lib/target-supports-dg.exp

RE: [PATCH v2] RISC-V: Remove float vector eqne pattern

2024-06-19 Thread Demin Han
Hi Jeff,

Thanks for fixing that.

Regards,
Demin

> -Original Message-
> From: Jeff Law 
> Sent: 2024年6月19日 22:33
> To: Demin Han ; gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; pan2...@intel.com;
> rdapp@gmail.com
> Subject: Re: [PATCH v2] RISC-V: Remove float vector eqne pattern
> 
> 
> 
> On 6/19/24 6:30 AM, demin.han wrote:
> > We can unify eqne and other comparison operations.
> >
> > Tested on RV32 and RV64
> >
> > gcc/ChangeLog:
> >
> > * config/riscv/riscv-vector-builtins-bases.cc: Remove eqne cond
> > * config/riscv/vector.md (@pred_eqne_scalar): Remove patterns
> > (*pred_eqne_scalar_merge_tie_mask): Ditto
> > (*pred_eqne_scalar): Ditto
> > (*pred_eqne_scalar_narrow): Ditto
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/riscv/rvv/base/float-point-cmp-eqne.c: New test.
> >
> > Signed-off-by: demin.han 
> > ---
> >
> > v2 changes:
> >1. add test
> >
> >Only intrinsics utilize those removed vf patterns.
> >Auto vectorization use vv format now.
> >The NaN will optimized out before expand in autovec as I tested.
> >
> >   .../riscv/riscv-vector-builtins-bases.cc  |  4 -
> >   gcc/config/riscv/vector.md| 86 ---
> >   .../riscv/rvv/base/float-point-cmp-eqne.c | 54 
> >   3 files changed, 54 insertions(+), 90 deletions(-)
> >   create mode 100644
> > gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c
> >
> > diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc
> > b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> > index b6f6e4ff37e..d414721ede8 100644
> > --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
> > +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> > @@ -1420,10 +1420,6 @@ public:
> >   switch (e.op_info->op)
> > {
> > case OP_TYPE_vf: {
> > - if (CODE == EQ || CODE == NE)
> > -   return e.use_compare_insn (CODE, code_for_pred_eqne_scalar (
> > -  e.vector_mode ()));
> > - else
> > return e.use_compare_insn (CODE, code_for_pred_cmp_scalar
> (
> >e.vector_mode ()));
> Formatting nit.  You removed the IF-THEN-ELSE construct, leaving just the
> ELSE's body.  You need to reindent that body, both lines of which would move
> left by two spaces.
> 
> I'll fix and push it momentarily.
> 
> jeff


[PATCH] i386: Fix some ISA bit test in option_override

2024-06-19 Thread Hongyu Wang
Hi,

This patch adjusts several new feature checks in ix86_option_override_internal
that directly use TARGET_* instead of TARGET_*_P (opts->ix86_isa_flags),
which caused cmdline option overrides target_attribute isa flag.

Bootstrapped && regtested on x86_64-pc-linux-gnu.

Ok for trunk?

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
Use TARGET_*_P (opts->x_ix86_isa_flags*) instead of TARGET_*
for UINTR, LAM and APX_F.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ccmp-2.c: Remove -mno-apxf in option.
* gcc.target/i386/funcspec-56.inc: Drop uintr tests.
* gcc.target/i386/funcspec-6.c: Add uintr tests.
---
 gcc/config/i386/i386-options.cc   | 14 +-
 gcc/testsuite/gcc.target/i386/apx-ccmp-2.c|  2 +-
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |  2 --
 gcc/testsuite/gcc.target/i386/funcspec-6.c|  2 ++
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index f2cecc0e254..34adedb3127 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2113,15 +2113,18 @@ ix86_option_override_internal (bool main_args_p,
   opts->x_ix86_stringop_alg = no_stringop;
 }
 
-  if (TARGET_APX_F && !TARGET_64BIT)
+  if (TARGET_APX_F_P (opts->x_ix86_isa_flags2)
+  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
 error ("%<-mapxf%> is not supported for 32-bit code");
-  else if (opts->x_ix86_apx_features != apx_none && !TARGET_64BIT)
+  else if (opts->x_ix86_apx_features != apx_none
+  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
 error ("%<-mapx-features=%> option is not supported for 32-bit code");
 
-  if (TARGET_UINTR && !TARGET_64BIT)
+  if (TARGET_UINTR_P (opts->x_ix86_isa_flags2)
+  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
 error ("%<-muintr%> not supported for 32-bit code");
 
-  if (ix86_lam_type && !TARGET_LP64)
+  if (ix86_lam_type && !TARGET_LP64_P (opts->x_ix86_isa_flags))
 error ("%<-mlam=%> option: [u48|u57] not supported for 32-bit code");
 
   if (!opts->x_ix86_arch_string)
@@ -2502,7 +2505,8 @@ ix86_option_override_internal (bool main_args_p,
   init_machine_status = ix86_init_machine_status;
 
   /* Override APX flag here if ISA bit is set.  */
-  if (TARGET_APX_F && !OPTION_SET_P (ix86_apx_features))
+  if (TARGET_APX_F_P (opts->x_ix86_isa_flags2)
+  && !OPTION_SET_P (ix86_apx_features))
 opts->x_ix86_apx_features = apx_all;
 
   /* Validate -mregparm= value.  */
diff --git a/gcc/testsuite/gcc.target/i386/apx-ccmp-2.c 
b/gcc/testsuite/gcc.target/i386/apx-ccmp-2.c
index 4a0784394c3..192c0458728 100644
--- a/gcc/testsuite/gcc.target/i386/apx-ccmp-2.c
+++ b/gcc/testsuite/gcc.target/i386/apx-ccmp-2.c
@@ -1,6 +1,6 @@
 /* { dg-do run { target { ! ia32 } } } */
 /* { dg-require-effective-target apxf } */
-/* { dg-options "-O3 -mno-apxf" } */
+/* { dg-options "-O3" } */
 
 __attribute__((noinline, noclone, target("apxf")))
 int foo_apx(int a, int b, int c, int d)
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc 
b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
index 2a50f5bf67c..8825e88768a 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc
+++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
@@ -69,7 +69,6 @@ extern void test_avx512vp2intersect (void)
__attribute__((__target__("avx512vp2i
 extern void test_amx_tile (void)   
__attribute__((__target__("amx-tile")));
 extern void test_amx_int8 (void)   
__attribute__((__target__("amx-int8")));
 extern void test_amx_bf16 (void)   
__attribute__((__target__("amx-bf16")));
-extern void test_uintr (void)  
__attribute__((__target__("uintr")));
 extern void test_hreset (void) 
__attribute__((__target__("hreset")));
 extern void test_keylocker (void)  
__attribute__((__target__("kl")));
 extern void test_widekl (void) 
__attribute__((__target__("widekl")));
@@ -158,7 +157,6 @@ extern void test_no_avx512vp2intersect (void)   
__attribute__((__target__("no-avx5
 extern void test_no_amx_tile (void)
__attribute__((__target__("no-amx-tile")));
 extern void test_no_amx_int8 (void)
__attribute__((__target__("no-amx-int8")));
 extern void test_no_amx_bf16 (void)
__attribute__((__target__("no-amx-bf16")));
-extern void test_no_uintr (void)   
__attribute__((__target__("no-uintr")));
 extern void test_no_hreset (void)  
__attribute__((__target__("no-hreset")));
 extern void test_no_keylocker (void)   
__attribute__((__target__("no-kl")));
 extern void test_no_widekl (void)  
__attribute__((__target__("no-widekl")));
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-6.c 
b/gcc/testsuite/gcc.target/i386/funcspec-6.c
index ea896b7ebfd..033c9a50e23 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-6.c
+++ b/gcc/testsuite/g

[PATCH] build: Fix missing variable quotes and typo

2024-06-19 Thread Collin Funk
I've just fixed the quotes and that typo in one patch.  I hope you don't
mind.  When using Autoconf 2.69 and Automake 1.15.1 that copyright diff
goes away.  I'm not familiar with the gcc-autoregen bot but I think this
should make it happy.

-- >8 --

When dlopen and pthread_create are in libc the variable is
set to "none required", therefore running configure will show
the following errors:

./configure: line 8997: test: too many arguments
./configure: line 8999: test: too many arguments
./configure: line 9003: test: too many arguments
./configure: line 9005: test: =: unary operator expected

ChangeLog:

PR bootstrap/115453
* configure.ac: Quote variable result of AC_SEARCH_LIBS.  Fix
typo ac_cv_search_pthread_crate.
* configure: Regenerate.

Signed-off-by: Collin Funk 
---
 configure| 8 
 configure.ac | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/configure b/configure
index 51576a41f30..51bf1d1add1 100755
--- a/configure
+++ b/configure
@@ -8994,15 +8994,15 @@ if test "$ac_res" != no; then :
 fi
 
 
-if test $ac_cv_search_dlopen = -ldl; then
+if test "$ac_cv_search_dlopen" = -ldl; then
 CRAB1_LIBS="$CRAB1_LIBS -ldl"
-elif test $ac_cv_search_dlopen = no; then
+elif test "$ac_cv_search_dlopen" = no; then
 missing_rust_dynlibs="libdl"
 fi
 
-if test $ac_cv_search_pthread_create = -lpthread; then
+if test "$ac_cv_search_pthread_create" = -lpthread; then
 CRAB1_LIBS="$CRAB1_LIBS -lpthread"
-elif test $ac_cv_search_pthread_crate = no; then
+elif test "$ac_cv_search_pthread_create" = no; then
 missing_rust_dynlibs="$missing_rust_dynlibs, libpthread"
 fi
 
diff --git a/configure.ac b/configure.ac
index 5eda8dcdbf7..20457005e29 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2045,15 +2045,15 @@ missing_rust_dynlibs=none
 AC_SEARCH_LIBS([dlopen], [dl])
 AC_SEARCH_LIBS([pthread_create], [pthread])
 
-if test $ac_cv_search_dlopen = -ldl; then
+if test "$ac_cv_search_dlopen" = -ldl; then
 CRAB1_LIBS="$CRAB1_LIBS -ldl"
-elif test $ac_cv_search_dlopen = no; then
+elif test "$ac_cv_search_dlopen" = no; then
 missing_rust_dynlibs="libdl"
 fi
 
-if test $ac_cv_search_pthread_create = -lpthread; then
+if test "$ac_cv_search_pthread_create" = -lpthread; then
 CRAB1_LIBS="$CRAB1_LIBS -lpthread"
-elif test $ac_cv_search_pthread_crate = no; then
+elif test "$ac_cv_search_pthread_create" = no; then
 missing_rust_dynlibs="$missing_rust_dynlibs, libpthread"
 fi
 
-- 
2.45.2



Re: [gcc r15-1436] build: Fix missing variable quotes

2024-06-19 Thread YunQiang Su
Thanks.  Sorry for the noise. I have reverted
   8088374a868aacab4dff208ec3e3fde790a1d9a3
   c6a9ab8c920f297c4efd289182aef9fbc73f5906

I will submit and back port the modification of gcc_cv_as_mips_explicit_relocs
separately.

@Collin Funk Can you sent a new correct/full patch?


Re: [pushed] readings: Drop FORTRAN 77 test suite at itl.nist.gov

2024-06-19 Thread Jerry D

On 6/18/24 10:20 AM, Steve Kargl wrote:

On Tue, Jun 18, 2024 at 09:13:23AM +0200, Gerald Pfeifer wrote:

The original subsite has disappeared and we couldn't find it elsewhere.



https://github.com/gklimowicz/FCVS

gklimowicz is a flang developer and member of J3.



FWIW my copy of the tests still pass:

--- snip ---

FM921 compiles and runs OK
***FM922***
FM922 compiles and runs OK
***FM923***
FM923 compiles and runs OK

The logfile is nist.log

0 compilation errors or warnings

0 load and link errors

0 runtime errors

192 completely successful



Re: [PATCH] middle-end/114070 - folding breaking VEC_COND expansion

2024-06-19 Thread Andrew Pinski
On Wed, Jun 19, 2024 at 7:44 AM Vaseeharan Vinayagamoorthy
 wrote:
>
> Hi,
>
> I have found that this patch has introduced a regression in the arm-none-eabi 
> toolchain for a testcase, which was previously passing:
>
> PASS->FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"
>
> The toolchain was built with:
> Build = x86_64-none-linux-gnu
> Host = x86_64-none-linux-gnu
> Target = arm-none-eabi
>
> This is also affecting the gcc-13 and gcc-14 branches.
> Could you please let me know the impact of this regression, and whether you 
> plan to fix the regression?

See the thread starting at
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646587.html
for information on the testcase regression and what needs to be done.
I suspect this now only affects targets which don't have vector
modes enabled.

Note it is a (minor) missed optimization regression so the impact
looks to be small.
I am not sure if people have written code with this pattern, it
requires vectors and it fails only on targets where there is no vector
support enabled.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95906

Thanks,
Andrew Pinski

>
>
> Kind regards,
> Vasee
>
> 
> From: Richard Biener 
> Sent: 26 February 2024 07:42
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH] middle-end/114070 - folding breaking VEC_COND expansion
>
> The following properly guards the simplifications that move
> operations into VEC_CONDs, in particular when that changes the
> type constraints on this operation.
>
> This needed a genmatch fix which was recording spurious implicit fors
> when tcc_comparison is used in a C expression.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
>
> PR middle-end/114070
> * genmatch.cc (parser::parse_c_expr): Do not record operand
> lists but only mark operators used.
> * match.pd ((c ? a : b) op (c ? d : e)  -->  c ? (a op d) : (b op e)):
> Properly guard the case of tcc_comparison changing the VEC_COND
> value operand type.
>
> * gcc.dg/torture/pr114070.c: New testcase.
> ---
>  gcc/genmatch.cc |  6 ++
>  gcc/match.pd| 15 ---
>  gcc/testsuite/gcc.dg/torture/pr114070.c | 12 
>  3 files changed, 26 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/torture/pr114070.c
>
> diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> index 375ae90ae6c..d9ae436ce5c 100644
> --- a/gcc/genmatch.cc
> +++ b/gcc/genmatch.cc
> @@ -4760,10 +4760,8 @@ parser::parse_c_expr (cpp_ttype start)
> = (const char *)CPP_HASHNODE (token->val.node.node)->ident.str;
>   if (strcmp (str, "return") == 0)
> fatal_at (token, "return statement not allowed in C expression");
> - id_base *idb = get_operator (str);
> - user_id *p;
> - if (idb && (p = dyn_cast (idb)) && p->is_oper_list)
> -   record_operlist (token->src_loc, p);
> + /* Mark user operators corresponding to 'str' as used.  */
> + get_operator (str);
> }
>
>/* Record the token.  */
> diff --git a/gcc/match.pd b/gcc/match.pd
> index c5b6540f939..67007fc2017 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5149,15 +5149,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* (c ? a : b) op (c ? d : e)  -->  c ? (a op d) : (b op e) */
>   (simplify
>(op (vec_cond:s @0 @1 @2) (vec_cond:s @0 @3 @4))
> -  (vec_cond @0 (op! @1 @3) (op! @2 @4)))
> +  (if (TREE_CODE_CLASS (op) != tcc_comparison
> +   || types_match (type, TREE_TYPE (@1))
> +   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
> +   (vec_cond @0 (op! @1 @3) (op! @2 @4
>
>  /* (c ? a : b) op d  -->  c ? (a op d) : (b op d) */
>   (simplify
>(op (vec_cond:s @0 @1 @2) @3)
> -  (vec_cond @0 (op! @1 @3) (op! @2 @3)))
> +  (if (TREE_CODE_CLASS (op) != tcc_comparison
> +   || types_match (type, TREE_TYPE (@1))
> +   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
> +   (vec_cond @0 (op! @1 @3) (op! @2 @3
>   (simplify
>(op @3 (vec_cond:s @0 @1 @2))
> -  (vec_cond @0 (op! @3 @1) (op! @3 @2
> +  (if (TREE_CODE_CLASS (op) != tcc_comparison
> +   || types_match (type, TREE_TYPE (@1))
> +   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
> +   (vec_cond @0 (op! @3 @1) (op! @3 @2)
>
>  #if GIMPLE
>  (match (nop_atomic_bit_test_and_p @0 @1 @4)
> diff --git a/gcc/testsuite/gcc.dg/torture/pr114070.c 
> b/gcc/testsuite/gcc.dg/torture/pr114070.c
> new file mode 100644
> index 000..cf46ec45a04
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/torture/pr114070.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-fno-vect-cost-model" } */
> +
> +int unresolved(unsigned dirmask, unsigned mask, int *unresolved_n)
> +{
> +  for (int i = 0; i < 1024; i++) {
> +mask |= 1;
> +if (!unresolved_n[i] || unresolved_n[i] & 7000

[PATCH] rs6000: ROP - Emit hashst and hashchk insns on Power8 and later [PR114759]

2024-06-19 Thread Peter Bergner
We currently only emit the ROP-protect hash* insns for Power10, where the
insns were added to the architecture.  We want to emit them for earlier
cpus (where they operate as NOPs), so that if those older binaries are
ever executed on a Power10, then they'll be protected from ROP attacks.
Binutils accepts hashst and hashchk back to Power8, so change GCC to emit
them for Power8 and later.  This matches clang's behavior.

This patch is independent of the ROP shrink-wrap fix submitted earlier.
This passed bootstrap and regtesting on powerpc64le-linux with no regressions.
Ok for trunk?  

Peter



2024-06-19  Peter Bergner  

gcc/
PR target/114759
* config/rs6000/rs6000-logue.cc (rs6000_stack_info): Use TARGET_POWER8.
(rs6000_emit_prologue): Likewise.
* config/rs6000/rs6000.md (hashchk): Likewise.
(hashst): Likewise.
Fix whitespace.

gcc/testsuite/
PR target/114759
* gcc.target/powerpc/pr114759-2.c: New test.
* lib/target-supports.exp (rop_ok): Use
check_effective_target_has_arch_pwr8.
---
 gcc/config/rs6000/rs6000-logue.cc |  6 +++---
 gcc/config/rs6000/rs6000.md   |  6 +++---
 gcc/testsuite/gcc.target/powerpc/pr114759-2.c | 17 +
 gcc/testsuite/lib/target-supports.exp |  2 +-
 4 files changed, 24 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr114759-2.c

diff --git a/gcc/config/rs6000/rs6000-logue.cc 
b/gcc/config/rs6000/rs6000-logue.cc
index c384e48e378..bd363b625a4 100644
--- a/gcc/config/rs6000/rs6000-logue.cc
+++ b/gcc/config/rs6000/rs6000-logue.cc
@@ -716,7 +716,7 @@ rs6000_stack_info (void)
   info->calls_p = (!crtl->is_leaf || cfun->machine->ra_needs_full_frame);
   info->rop_hash_size = 0;
 
-  if (TARGET_POWER10
+  if (TARGET_POWER8
   && info->calls_p
   && DEFAULT_ABI == ABI_ELFv2
   && rs6000_rop_protect)
@@ -3277,7 +3277,7 @@ rs6000_emit_prologue (void)
   /* NOTE: The hashst isn't needed if we're going to do a sibcall,
  but there's no way to know that here.  Harmless except for
  performance, of course.  */
-  if (TARGET_POWER10 && rs6000_rop_protect && info->rop_hash_size != 0)
+  if (TARGET_POWER8 && rs6000_rop_protect && info->rop_hash_size != 0)
 {
   gcc_assert (DEFAULT_ABI == ABI_ELFv2);
   rtx stack_ptr = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
@@ -5056,7 +5056,7 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type)
 
   /* The ROP hash check must occur after the stack pointer is restored
  (since the hash involves r1), and is not performed for a sibcall.  */
-  if (TARGET_POWER10
+  if (TARGET_POWER8
   && rs6000_rop_protect
   && info->rop_hash_size != 0
   && epilogue_type != EPILOGUE_TYPE_SIBCALL)
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index a5d20594789..694076e311f 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -15808,9 +15808,9 @@ (define_insn "*cmpeqb_internal"
 
 (define_insn "hashst"
   [(set (match_operand:DI 0 "simple_offsettable_mem_operand" "=m")
-(unspec_volatile:DI [(match_operand:DI 1 "int_reg_operand" "r")]
+   (unspec_volatile:DI [(match_operand:DI 1 "int_reg_operand" "r")]
UNSPEC_HASHST))]
-  "TARGET_POWER10 && rs6000_rop_protect"
+  "TARGET_POWER8 && rs6000_rop_protect"
 {
   static char templ[32];
   const char *p = rs6000_privileged ? "p" : "";
@@ -15823,7 +15823,7 @@ (define_insn "hashchk"
   [(unspec_volatile [(match_operand:DI 0 "int_reg_operand" "r")
 (match_operand:DI 1 "simple_offsettable_mem_operand" "m")]
UNSPEC_HASHCHK)]
-  "TARGET_POWER10 && rs6000_rop_protect"
+  "TARGET_POWER8 && rs6000_rop_protect"
 {
   static char templ[32];
   const char *p = rs6000_privileged ? "p" : "";
diff --git a/gcc/testsuite/gcc.target/powerpc/pr114759-2.c 
b/gcc/testsuite/gcc.target/powerpc/pr114759-2.c
new file mode 100644
index 000..3881ebd416e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr114759-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8 -mrop-protect" } */
+/* { dg-require-effective-target rop_ok } Only enable on supported ABIs.  */
+
+/* Verify we generate ROP-protect hash insns when compiling for Power8.  */
+
+extern void foo (void);
+
+int
+bar (void)
+{
+  foo ();
+  return 5;
+}
+
+/* { dg-final { scan-assembler-times {\mhashst\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mhashchk\M} 1 } } */
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index e307f4e69ef..b1ef4e8eaef 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -7339,7 +7339,7 @@ proc check_effective_target_powerpc_elfv2 { } {
 # Return 1 if this is a PowerPC target supporting -mrop-protect
 
 proc check_effective_target_rop_ok { } {
-return [check_effective_target_power10_ok] && 
[ch

[Committed] RISC-V: Promote Zaamo/Zalrsc to a when using an old binutils

2024-06-19 Thread Patrick O'Neill

Committed.

Patrick

On 6/19/24 06:25, Kito Cheng wrote:

LGTM :)

Patrick O'Neill  於 2024年6月19日 週三 05:40 寫道:

Binutils 2.42 and before don't support Zaamo/Zalrsc. When users
specify
both Zaamo and Zalrsc, promote them to 'a' in the -march string.

This does not affect testsuite results for users with old versions
of binutils.
Testcases that failed due to 'call'/isa string continue to fail
after this PATCH
when using an old version of binutils.

gcc/ChangeLog:

        * common/config/riscv/riscv-common.cc: Add 'a' extension to
        riscv_combine_info.

Signed-off-by: Patrick O'Neill 
---
We will emit calls if the user only specifies Zaamo or Zalrsc.
To my knowledge there isn't a way to make a testcase for this in
dejagnu.
I used the most recent version of the 'a' extension arbitrarily
since AFAICT the
version of the extension doesn't affect the combine logic.
---
 gcc/common/config/riscv/riscv-common.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/common/config/riscv/riscv-common.cc
b/gcc/common/config/riscv/riscv-common.cc
index 1dc1d9904c7..410e673f5e0 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -401,6 +401,7 @@ static const struct riscv_ext_version
riscv_ext_version_table[] =
 /* Combine extensions defined in this table  */
 static const struct riscv_ext_version riscv_combine_info[] =
 {
+  {"a", ISA_SPEC_CLASS_20191213, 2, 1},
   {"zk",  ISA_SPEC_CLASS_NONE, 1, 0},
   {"zkn",  ISA_SPEC_CLASS_NONE, 1, 0},
   {"zks",  ISA_SPEC_CLASS_NONE, 1, 0},
--
2.34.1


Re: [PATCH][v2] Enhance if-conversion for automatic arrays

2024-06-19 Thread Toon Moene

On 6/19/24 21:06, Richard Biener wrote:




Am 19.06.2024 um 20:25 schrieb Toon Moene :

On 6/17/24 16:05, Richard Biener wrote:


Automatic arrays that are not address-taken should not be subject to
store data races.  This applies to OMP SIMD in-branch lowered
functions result array which for the testcase otherwise prevents
vectorization with SSE and for AVX and AVX512 ends up with spurious
.MASK_STORE to the stack surviving.


Does this also apply for "automatic arrays" as defined by the Fortran Standard 
(see https://j3-fortran.org/doc/year/23/23-007r1.pdf, page 104), i.e., outside of the 
OMP_SIMD construct ?

In gfortran, when using the option -fstack-arrays, they are assigned memory 
space on the stack.


I’d say yes though the likelihood those are address taken and thus not 
considered is high.  The main target were the arrays created as part of the 
SIMD lowering.


Isn't there a "not" missing before "high" ?

So it mostly helps after the call to subroutine 'sub' in the following:

SUBROUTINE AAP(A, B, N)
INTEGER N
REAL A(N), B(N), R(N)
CALL SUB(R, N) ! Address of R passed to SUB
R = ABS(A)
B = R
B = SQRT(A)
END

?
--
Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290
Saturnushof 14, 3738 XG  Maartensdijk, The Netherlands



Re: [Fortran, Patch, PR 96992] Fix Class arrays of different ranks are rejected as storage association argument

2024-06-19 Thread Harald Anlauf

Hi Andre,

Am 19.06.24 um 09:07 schrieb Andre Vehreschild:

Hi Harald,

thank you for the investigation and useful tips. I had to figure out what went
wrong here, but I now figured that the array needs repacking when a negative
stride is used (or at least a call to that routine, which then fixes "stuff").
I have added it, freeing the memory allocated potentially by pack, and also
updated the testcase to include the negative stride.


hmmm, the pack does not always get generated:

module foo_mod
  implicit none
  type foo
 integer :: i
  end type foo
contains
  subroutine d1(x,n)
integer, intent(in) :: n
integer :: i
class (foo), intent(out) :: x(n)
select type(x)
class is(foo)
   x(:)%i = (/ (42 + i, i = 1, n ) /)
class default
   stop 1
end select
  end subroutine d1
  subroutine d2(x,n)
integer, intent(in) :: n
integer :: i
class (foo), intent(in) :: x(n,n,n)
select type (x)
class is (foo)
   print *,"d2:  ", x%i
   if ( any( x%i /= reshape((/ (42 + i, i = 1, n ** 3 ) /), [n, n, 
n] ))) stop 2

class default
   stop 3
end select
  end subroutine d2

  subroutine d3(x,n)
integer, intent(in) :: n
integer :: i
class (foo), intent(inout) :: x(n)
select type (x)
class is (foo)
   print *,"d3_1:", x%i
   x%i = -x%i   ! Simply negate elements
   print *,"d3_2:", x%i
class default
   stop 33
end select
  end subroutine d3
end module foo_mod
program main
  use foo_mod
  implicit none
  type (foo), dimension(:), allocatable :: f
  integer :: n, k, m
  n = 2
  allocate (f(n*n*n))
  ! Original testcase:
  call d1(f,n*n*n)
  print *, "d1->:", f%i
  call d2(f,n)
  ! Ensure that array f is ok:
  print *, "d2->:", f%i

  ! The following shows that no appropriate internal pack is generated:
  call d1(f,n*n*n)
  print *, "d1->:", f%i
  m = n*n*n
  k = 3
  print *, "->d3:", f(1:m:k)%i
  call d3(f(1:m:k),1+(m-1)/k)
  print *, "d3->:", f(1:m:k)%i
  print *, "full:", f%i
  deallocate (f)
end program main


After the second version of your patch this prints:

 d1->:  43  44  45  46  47 
48  49  50
 d2:43  44  45  46  47 
48  49  50
 d2->:  43  44  45  46  47 
48  49  50
 d1->:  43  44  45  46  47 
48  49  50

 ->d3:  43  46  49
 d3_1:  43  44  45
 d3_2: -43 -44 -45
 d3->: -43  46  49
 full: -43 -44 -45  46  47 
48  49  50


While the print properly handles f(1:m:k)%i, passing it as
actual argument to subroutine d3 does not do pack/unpack.

Can you have another look?

Thanks,
Harald



Regtests fine on x86_64-pc-linux-gnu/Fedora 39. Ok for mainline?

Regards,
Andre

On Sun, 16 Jun 2024 23:27:46 +0200
Harald Anlauf  wrote:

<< snipped for brevity >>>
--
Andre Vehreschild * Email: vehre ad gmx dot de





Re: [PATCH] bitint: Fix up lowering of COMPLEX_EXPR [PR115544]

2024-06-19 Thread Richard Biener



> Am 19.06.2024 um 20:44 schrieb Jakub Jelinek :
> 
> Hi!
> 
> We don't really support _Complex _BitInt(N), the only place we use
> bitint complex types is for the .{ADD,SUB,MUL}_OVERFLOW internal function
> results and COMPLEX_EXPR in the usual case should be either not present
> yet because the ifns weren't folded and will be lowered, or optimized
> into something simpler, because normally the complex bitint should be
> used just for extracting the 2 subparts from it.
> Still, with disabled optimizations it can occasionally happen that it
> appears in the IL and that is why there is support for lowering those,
> but it doesn't handle optimizing those too much, so if it uses SSA_NAME,
> it relies on them having a backing VAR_DECL during the lowering.
> This is normally achieved through the
>  && ((is_gimple_assign (use_stmt)
>   && (gimple_assign_rhs_code (use_stmt)
>   != COMPLEX_EXPR))
>  || gimple_code (use_stmt) == GIMPLE_COND)
> hunk in gimple_lower_bitint, but as the following testcase shows, there
> is one thing I've missed, the load optimization isn't guarded by the
> above stuff.  So, either we'd need to add support for loads to
> lower_complexexpr_stmt, or because they should be really rare, this
> patch just disables the load optimization if at least one load use is
> a COMPLEX_EXPR (like we do already for PHIs, calls, asm).

Sounds reasonable.

> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Ok

Richard 

> 2024-06-19  Jakub Jelinek  
> 
>PR tree-optimization/115544
>* gimple-lower-bitint.cc (gimple_lower_bitint): Disable optimizing
>loads used by COMPLEX_EXPR operands.
> 
>* gcc.dg/bitint-107.c: New test.
> 
> --- gcc/gimple-lower-bitint.cc.jj2024-06-07 12:17:09.811966904 +0200
> +++ gcc/gimple-lower-bitint.cc2024-06-19 15:27:22.378759911 +0200
> @@ -6630,7 +6630,10 @@ gimple_lower_bitint (void)
>continue;
>  if (gimple_code (use_stmt) == GIMPLE_PHI
>  || is_gimple_call (use_stmt)
> -  || gimple_code (use_stmt) == GIMPLE_ASM)
> +  || gimple_code (use_stmt) == GIMPLE_ASM
> +  || (is_gimple_assign (use_stmt)
> +  && (gimple_assign_rhs_code (use_stmt)
> +  == COMPLEX_EXPR)))
>{
>  optimizable_load = false;
>  break;
> --- gcc/testsuite/gcc.dg/bitint-107.c.jj2024-06-19 15:36:32.817747449 
> +0200
> +++ gcc/testsuite/gcc.dg/bitint-107.c2024-06-19 14:03:31.383805280 +0200
> @@ -0,0 +1,16 @@
> +/* PR tree-optimization/115544 */
> +/* { dg-do compile { target bitint } } */
> +/* { dg-options "-O -fno-tree-fre -fno-tree-ccp -fno-tree-forwprop" } */
> +
> +#if __BITINT_MAXWIDTH__ >= 129
> +typedef _BitInt(129) B;
> +#else
> +typedef _BitInt(63) B;
> +#endif
> +B a, b;
> +
> +int
> +foo (void)
> +{
> +  return __builtin_mul_overflow (a, 1, &b);
> +}
> 
>Jakub
> 


Re: [PATCH][v2] Enhance if-conversion for automatic arrays

2024-06-19 Thread Richard Biener



> Am 19.06.2024 um 20:25 schrieb Toon Moene :
> 
> On 6/17/24 16:05, Richard Biener wrote:
> 
>> Automatic arrays that are not address-taken should not be subject to
>> store data races.  This applies to OMP SIMD in-branch lowered
>> functions result array which for the testcase otherwise prevents
>> vectorization with SSE and for AVX and AVX512 ends up with spurious
>> .MASK_STORE to the stack surviving.
> 
> Does this also apply for "automatic arrays" as defined by the Fortran 
> Standard (see https://j3-fortran.org/doc/year/23/23-007r1.pdf, page 104), 
> i.e., outside of the OMP_SIMD construct ?
> 
> In gfortran, when using the option -fstack-arrays, they are assigned memory 
> space on the stack.

I’d say yes though the likelihood those are address taken and thus not 
considered is high.  The main target were the arrays created as part of the 
SIMD lowering.

Richard 

> Kind regards,
> 
> --
> Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290
> Saturnushof 14, 3738 XG  Maartensdijk, The Netherlands
> 


[PATCH] bitint: Fix up lowering of COMPLEX_EXPR [PR115544]

2024-06-19 Thread Jakub Jelinek
Hi!

We don't really support _Complex _BitInt(N), the only place we use
bitint complex types is for the .{ADD,SUB,MUL}_OVERFLOW internal function
results and COMPLEX_EXPR in the usual case should be either not present
yet because the ifns weren't folded and will be lowered, or optimized
into something simpler, because normally the complex bitint should be
used just for extracting the 2 subparts from it.
Still, with disabled optimizations it can occasionally happen that it
appears in the IL and that is why there is support for lowering those,
but it doesn't handle optimizing those too much, so if it uses SSA_NAME,
it relies on them having a backing VAR_DECL during the lowering.
This is normally achieved through the
  && ((is_gimple_assign (use_stmt)
   && (gimple_assign_rhs_code (use_stmt)
   != COMPLEX_EXPR))
  || gimple_code (use_stmt) == GIMPLE_COND)
hunk in gimple_lower_bitint, but as the following testcase shows, there
is one thing I've missed, the load optimization isn't guarded by the
above stuff.  So, either we'd need to add support for loads to
lower_complexexpr_stmt, or because they should be really rare, this
patch just disables the load optimization if at least one load use is
a COMPLEX_EXPR (like we do already for PHIs, calls, asm).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-06-19  Jakub Jelinek  

PR tree-optimization/115544
* gimple-lower-bitint.cc (gimple_lower_bitint): Disable optimizing
loads used by COMPLEX_EXPR operands.

* gcc.dg/bitint-107.c: New test.

--- gcc/gimple-lower-bitint.cc.jj   2024-06-07 12:17:09.811966904 +0200
+++ gcc/gimple-lower-bitint.cc  2024-06-19 15:27:22.378759911 +0200
@@ -6630,7 +6630,10 @@ gimple_lower_bitint (void)
continue;
  if (gimple_code (use_stmt) == GIMPLE_PHI
  || is_gimple_call (use_stmt)
- || gimple_code (use_stmt) == GIMPLE_ASM)
+ || gimple_code (use_stmt) == GIMPLE_ASM
+ || (is_gimple_assign (use_stmt)
+ && (gimple_assign_rhs_code (use_stmt)
+ == COMPLEX_EXPR)))
{
  optimizable_load = false;
  break;
--- gcc/testsuite/gcc.dg/bitint-107.c.jj2024-06-19 15:36:32.817747449 
+0200
+++ gcc/testsuite/gcc.dg/bitint-107.c   2024-06-19 14:03:31.383805280 +0200
@@ -0,0 +1,16 @@
+/* PR tree-optimization/115544 */
+/* { dg-do compile { target bitint } } */
+/* { dg-options "-O -fno-tree-fre -fno-tree-ccp -fno-tree-forwprop" } */
+
+#if __BITINT_MAXWIDTH__ >= 129
+typedef _BitInt(129) B;
+#else
+typedef _BitInt(63) B;
+#endif
+B a, b;
+
+int
+foo (void)
+{
+  return __builtin_mul_overflow (a, 1, &b);
+}

Jakub



[PATCH] libcpp: Add support for gnu::base64 #embed parameter

2024-06-19 Thread Jakub Jelinek
Hi!

The following patch adds another extension, gnu::base64.
As mentioned in the documentation, this extension is primarily
intended for use by the preprocessor, so that for the larger (say 32+ or
64+ bytes long embeds it doesn't have to emit tens of thousands or
millions of comma separated string literals which would be very expensive
to parse again, but can emit
#embed "." 
__gnu__::__base64__("TG9yZW0gaXBzdW0gZG9sb3Igc2l0IGFtZXQsIGNvbnNlY3RldHVyIGFkaXBpc2NpbmcgZWxpdCwgc2VkIGRvIGVpdXNtb2QgdGVtcG9yIGluY2lkaWR1bnQgdXQgbGFib3JlIGV0IGRvbG9yZSBtYWduYSBhbGlxdWEuCg==")
with the meaning don't actually load some file, instead base64 decode
(RFC4648 with A-Za-z0-9+/ chars and = padding, no newlines in between)
the string and use that as data.  This is chosen because it should be
-pedantic-errors clean, fairly cheap to decode and then in optimizing
compiler could be handled as similar binary blob to normal #embed,
while the data isn't left somewhere on the disk, so distcc/ccache etc.
can move the preprocessed source without issues.
It makes no sense to support limit and gnu::offset parameters together
with it IMHO, why would somebody waste providing full data and then
throw some away?  prefix/suffix/if_empty are normally supported though,
but not intended to be used by the preprocessor.

This patch adds just the extension side, not the actual emitting of this
during -E or -E -fdirectives-only for now, because that will be far easier
to be done when we introduce some token type and tree to handle the sequence
as a binary blob, once that is done printing of that new token type can
be printed as the #embed (or a series of them, I think we can't emit longer
string than 2GB).

Right now the patch only supports a single huge string literal in there,
not concatenation of multiple strings, dunno if we shouldn't add support
for that so that we don't run into the line length limits for column
numbering.  The alternative would be emit
#embed "." __gnu__::__base64__( \
"Tm9uIGVyYW0gbsOpc2NpdXMsIEJydXRlLCBjdW0sIHF1w6Ygc3VtbWlzIGluZ8OpbmlpcyBleHF1" \
"aXNpdMOhcXVlIGRvY3Ryw61uYSBwaGlsw7Nzb3BoaSBHcsOmY28gc2VybcOzbmUgdHJhY3RhdsOt" \
"c3NlbnQsIGVhIExhdMOtbmlzIGzDrXR0ZXJpcyBtYW5kYXLDqW11cywgZm9yZSB1dCBoaWMgbm9z" \
"dGVyIGxhYm9yIGluIHbDoXJpYXMgcmVwcmVoZW5zacOzbmVzIGluY8O6cnJlcmV0LiBuYW0gcXVp" \
"YsO6c2RhbSwgZXQgaWlzIHF1aWRlbSBub24gw6FkbW9kdW0gaW5kw7NjdGlzLCB0b3R1bSBob2Mg" \
"ZMOtc3BsaWNldCBwaGlsb3NvcGjDoXJpLiBxdWlkYW0gYXV0ZW0gbm9uIHRhbSBpZCByZXByZWjD" \
"qW5kdW50LCBzaSByZW3DrXNzaXVzIGFnw6F0dXIsIHNlZCB0YW50dW0gc3TDumRpdW0gdGFtcXVl" \
"IG11bHRhbSDDs3BlcmFtIHBvbsOpbmRhbSBpbiBlbyBub24gYXJiaXRyw6FudHVyLiBlcnVudCDD" \
"qXRpYW0sIGV0IGlpIHF1aWRlbSBlcnVkw610aSBHcsOmY2lzIGzDrXR0ZXJpcywgY29udGVtbsOp" \
"bnRlcyBMYXTDrW5hcywgcXVpIHNlIGRpY2FudCBpbiBHcsOmY2lzIGxlZ8OpbmRpcyDDs3BlcmFt" \
"IG1hbGxlIGNvbnPDum1lcmUuIHBvc3Ryw6ltbyDDoWxpcXVvcyBmdXTDunJvcyBzw7pzcGljb3Is" \
"IHF1aSBtZSBhZCDDoWxpYXMgbMOtdHRlcmFzIHZvY2VudCwgZ2VudXMgaG9jIHNjcmliw6luZGks" \
"IGV0c2kgc2l0IGVsw6lnYW5zLCBwZXJzw7Nuw6YgdGFtZW4gZXQgZGlnbml0w6F0aXMgZXNzZSBu" \
"ZWdlbnQu")
so effectively split it at the 76 columns boundaries (or some other one,
ideally multiple of 4); disadvantage is then that we lex that not into
a single huge string but perhaps hundreds/thousands/millions of short
CPP_STRINGs that would be gathered together.
Thoughts on that?

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-06-19  Jakub Jelinek  

libcpp/
* internal.h (struct cpp_embed_params): Add base64 member.
* directives.cc (_cpp_parse_embed_params): Parse gnu::base64
parameter.
* files.cc (finish_embed, base64_dec_fn): New functions.
(base64_dec): New array.
(B64D0, B64D1, B64D2, B64D3): Define.
(finish_base64_embed): New function.
(_cpp_stack_embed): Use finish_embed.  Handle params->base64
using finish_base64_embed.
gcc/
* doc/cpp.texi (Binary Resource Inclusion): Document gnu::base64
parameter.
gcc/testsuite/
* c-c++-common/cpp/embed-17.c: New test.
* c-c++-common/cpp/embed-18.c: New test.

--- libcpp/internal.h.jj2024-06-18 17:42:32.228010763 +0200
+++ libcpp/internal.h   2024-06-19 09:28:25.881760114 +0200
@@ -631,6 +631,7 @@ struct cpp_embed_params
   location_t loc;
   bool has_embed;
   cpp_num_part limit, offset;
+  const cpp_token *base64;
   cpp_embed_params_tokens prefix, suffix, if_empty;
 };
 
--- libcpp/directives.cc.jj 2024-06-18 17:42:32.0 +0200
+++ libcpp/directives.cc2024-06-19 12:12:54.178141429 +0200
@@ -1032,12 +1032,22 @@ _cpp_parse_embed_params (cpp_reader *pfi
  cpp_error (pfile, CPP_DL_ERROR, "expected ')'");
  return false;
}
- return ret;
}
- else if (token->type == CPP_CLOSE_PAREN && params->has_embed)
-   return ret;
- cpp_error (pfile, CPP_DL_ERROR, "expected parameter name");
- return false;
+ else if (token->type

Re: [PATCH][v2] Enhance if-conversion for automatic arrays

2024-06-19 Thread Toon Moene

On 6/17/24 16:05, Richard Biener wrote:


Automatic arrays that are not address-taken should not be subject to
store data races.  This applies to OMP SIMD in-branch lowered
functions result array which for the testcase otherwise prevents
vectorization with SSE and for AVX and AVX512 ends up with spurious
.MASK_STORE to the stack surviving.


Does this also apply for "automatic arrays" as defined by the Fortran 
Standard (see https://j3-fortran.org/doc/year/23/23-007r1.pdf, page 
104), i.e., outside of the OMP_SIMD construct ?


In gfortran, when using the option -fstack-arrays, they are assigned 
memory space on the stack.


Kind regards,

--
Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290
Saturnushof 14, 3738 XG  Maartensdijk, The Netherlands



Re: [PATCH] [x86_64]: Zhaoxin shijidadao enablement

2024-06-19 Thread Uros Bizjak
On Tue, Jun 18, 2024 at 9:21 AM mayshao-oc  wrote:
>
>
>
> On 5/28/24 14:15, Uros Bizjak wrote:
> >
> >
> >
> > On Mon, May 27, 2024 at 10:33 AM MayShao  wrote:
> >>
> >> From: mayshao 
> >>
> >> Hi all:
> >>  This patch enables -march/-mtune=shijidadao, costs and tunings are 
> >> set according to the characteristics of the processor.
> >>
> >>  Bootstrapped /regtested X86_64.
> >>
> >>  Ok for trunk?
> >
> > OK.
> >
> > Thanks,
> > Uros.
>
> Thanks for your review, please help me commit.

Done, committed as r15-1454 [1].

[1] https://gcc.gnu.org/pipermail/gcc-cvs/2024-June/404474.html

Thanks,
Uros.


Re: [C PATCH] Fix ICE related to incomplete structures in C23 [PR114930,PR115502].

2024-06-19 Thread Jakub Jelinek
On Wed, Jun 19, 2024 at 07:32:28PM +0200, Jakub Jelinek wrote:
> Ok, I've tried that, but that doesn't work, it ICEs on the
> pr114574-2.c testcase.

The following works on quick testing of dg.exp=pr11[45]*.c
but haven't bootstrapped/regtested it yet.

2024-06-19  Jakub Jelinek  
Martin Uecker  

PR c/114930
PR c/115502
gcc/c/
* c-decl.cc (c_update_type_canonical): Assert t is main variant
with 0 TYPE_QUALS.  Simplify and don't use check_qualified_type.
Deal with the case where build_qualified_type returns
TYPE_STRUCTURAL_EQUALITY_P type.
gcc/testsuite/
* gcc.dg/pr114574-1.c: Require lto effective target.
* gcc.dg/pr114574-2.c: Likewise.
* gcc.dg/pr114930.c: New test.
* gcc.dg/pr115502.c: New test.

--- gcc/c/c-decl.cc.jj  2024-06-07 12:17:09.582969919 +0200
+++ gcc/c/c-decl.cc 2024-06-19 19:59:24.955836263 +0200
@@ -9367,18 +9367,44 @@ is_flexible_array_member_p (bool is_last
 static void
 c_update_type_canonical (tree t)
 {
-  for (tree x = TYPE_MAIN_VARIANT (t); x; x = TYPE_NEXT_VARIANT (x))
+  gcc_checking_assert (TYPE_MAIN_VARIANT (t) == t && !TYPE_QUALS (t));
+  for (tree x = t, l = NULL_TREE; x; l = x, x = TYPE_NEXT_VARIANT (x))
 {
   if (x != t && TYPE_STRUCTURAL_EQUALITY_P (x))
{
- if (TYPE_QUALS (x) == TYPE_QUALS (t))
+ if (!TYPE_QUALS (x))
TYPE_CANONICAL (x) = TYPE_CANONICAL (t);
- else if (TYPE_CANONICAL (t) != t
-  || check_qualified_type (x, t, TYPE_QUALS (x)))
-   TYPE_CANONICAL (x)
- = build_qualified_type (TYPE_CANONICAL (t), TYPE_QUALS (x));
  else
-   TYPE_CANONICAL (x) = x;
+   {
+ tree
+   c = build_qualified_type (TYPE_CANONICAL (t), TYPE_QUALS (x));
+ if (TYPE_STRUCTURAL_EQUALITY_P (c))
+   {
+ gcc_checking_assert (TYPE_CANONICAL (t) == t);
+ if (c == x)
+   TYPE_CANONICAL (x) = x;
+ else
+   {
+ /* build_qualified_type for this function unhelpfully
+moved c from some later spot in TYPE_MAIN_VARIANT (t)
+chain to right after t (or created it there).  Move
+it right before x and process c and then x.  */
+ gcc_checking_assert (TYPE_NEXT_VARIANT (t) == c);
+ if (l == t)
+   x = t;
+ else
+   {
+ TYPE_NEXT_VARIANT (t) = TYPE_NEXT_VARIANT (c);
+ TYPE_NEXT_VARIANT (l) = c;
+ TYPE_NEXT_VARIANT (c) = x;
+ x = l;
+   }
+ continue;
+   }
+   }
+ else
+   TYPE_CANONICAL (x) = TYPE_CANONICAL (c);
+   }
}
   else if (x != t)
continue;
--- gcc/testsuite/gcc.dg/pr114574-1.c.jj2024-04-20 00:05:07.273690453 
+0200
+++ gcc/testsuite/gcc.dg/pr114574-1.c   2024-06-19 20:00:33.015984692 +0200
@@ -1,6 +1,6 @@
-/* PR lto/114574
- * { dg-do compile }
- * { dg-options "-flto" } */
+/* PR lto/114574 */
+/* { dg-do compile { target lto } } */
+/* { dg-options "-flto" } */
 
 const struct S * x;
 struct S {};
--- gcc/testsuite/gcc.dg/pr114574-2.c.jj2024-04-20 00:05:07.274690440 
+0200
+++ gcc/testsuite/gcc.dg/pr114574-2.c   2024-06-19 20:00:55.084704278 +0200
@@ -1,6 +1,6 @@
-/* PR lto/114574
- * { dg-do compile }
- * { dg-options "-flto -std=c23" } */
+/* PR lto/114574 */
+/* { dg-do compile { target lto } } */
+/* { dg-options "-flto -std=c23" } */
 
 const struct S * x;
 struct S {};
--- gcc/testsuite/gcc.dg/pr114930.c.jj  2024-06-18 21:27:53.782729543 +0200
+++ gcc/testsuite/gcc.dg/pr114930.c 2024-06-18 21:27:53.782729543 +0200
@@ -0,0 +1,9 @@
+/* PR c/114930 */
+/* { dg-do compile { target lto } } */
+/* { dg-options "-std=c23 -flto" } */
+
+typedef struct WebPPicture WebPPicture;
+typedef int (*WebPProgressHook)(const WebPPicture *);
+WebPProgressHook progress_hook;
+struct WebPPicture {
+} WebPGetColorPalette(const struct WebPPicture *);
--- gcc/testsuite/gcc.dg/pr115502.c.jj  2024-06-18 21:27:53.793729408 +0200
+++ gcc/testsuite/gcc.dg/pr115502.c 2024-06-18 21:27:53.793729408 +0200
@@ -0,0 +1,9 @@
+/* PR c/115502 */
+/* { dg-do compile { target lto } } */
+/* { dg-options "-std=c23 -flto" } */
+
+typedef struct _OSet OSet;
+typedef OSet AvlTree;
+void vgPlain_OSetGen_Lookup(const OSet *);
+struct _OSet {};
+void vgPlain_OSetGen_Lookup(const AvlTree *);


Jakub



Welcome to the Chinese WOS, Scopus Journal (Multidisciplinary)

2024-06-19 Thread Zhao Huang
   Journal of Hunan University Natural Sciences p{ margin:10px 0; padding:0; } 
table{ border-collapse:collapse; } h1,h2,h3,h4,h5,h6{ display:block; margin:0; 
padding:0; } img,a img{ border:0; height:auto; outline:none; 
text-decoration:none; } body,#bodyTable,#bodyCell{ height:100%; margin:0; 
padding:0; width:100%; } .mcnPreviewText{ display:none !important; } #outlook 
a{ padding:0; } img{ -ms-interpolation-mode:bicubic; } table{ 
mso-table-lspace:0pt; mso-table-rspace:0pt; } .ReadMsgBody{ width:100%; } 
.ExternalClass{ width:100%; } p,a,li,td,blockquote{ 
mso-line-height-rule:exactly; } a[href^=tel],a[href^=sms]{ color:inherit; 
cursor:default; text-decoration:none; } p,a,li,td,body,table,blockquote{ 
-ms-text-size-adjust:100%; -webkit-text-size-adjust:100%; } 
.ExternalClass,.ExternalClass p,.ExternalClass td,.ExternalClass 
div,.ExternalClass span,.ExternalClass font{ line-height:100%; } 
a[x-apple-data-detectors]{ color:inherit !important; text-decoration:none 
!important; font-size:inherit !important; font-family:inherit !important; 
font-weight:inherit !important; line-height:inherit !important; } #bodyCell{ 
padding:10px; } .templateContainer{ max-width:600px !important; border:0; } 
a.mcnButton{ display:block; } .mcnImage,.mcnRetinaImage{ vertical-align:bottom; 
} .mcnTextContent{ word-break:break-word; } .mcnTextContent img{ height:auto 
!important; } .mcnDividerBlock{ table-layout:fixed !important; } /* @tab Page 
@section Background Style @tip Set the background color and top border for your 
email. You may want to choose colors that match your company's branding. */ 
body,#bodyTable{ /*@editable*/background-color:#FAFAFA; } /* @tab Page @section 
Background Style @tip Set the background color and top border for your email. 
You may want to choose colors that match your company's branding. */ #bodyCell{ 
/*@editable*/border-top:0; } /* @tab Page @section Email Border @tip Set the 
border for your email. */ .templateContainer{ /*@editable*/border:0; } /* @tab 
Page @section Heading 1 @tip Set the styling for all first-level headings in 
your emails. These should be the largest of your headings. @style heading 1 */ 
h1{ /*@editable*/color:#202020; /*@editable*/font-family:Helvetica; 
/*@editable*/font-size:26px; /*@editable*/font-style:normal; 
/*@editable*/font-weight:bold; /*@editable*/line-height:125%; 
/*@editable*/letter-spacing:normal; /*@editable*/text-align:left; } /* @tab 
Page @section Heading 2 @tip Set the styling for all second-level headings in 
your emails. @style heading 2 */ h2{ /*@editable*/color:#202020; 
/*@editable*/font-family:Helvetica; /*@editable*/font-size:22px; 
/*@editable*/font-style:normal; /*@editable*/font-weight:bold; 
/*@editable*/line-height:125%; /*@editable*/letter-spacing:normal; 
/*@editable*/text-align:left; } /* @tab Page @section Heading 3 @tip Set the 
styling for all third-level headings in your emails. @style heading 3 */ h3{ 
/*@editable*/color:#202020; /*@editable*/font-family:Helvetica; 
/*@editable*/font-size:20px; /*@editable*/font-style:normal; 
/*@editable*/font-weight:bold; /*@editable*/line-height:125%; 
/*@editable*/letter-spacing:normal; /*@editable*/text-align:left; } /* @tab 
Page @section Heading 4 @tip Set the styling for all fourth-level headings in 
your emails. These should be the smallest of your headings. @style heading 4 */ 
h4{ /*@editable*/color:#202020; /*@editable*/font-family:Helvetica; 
/*@editable*/font-size:18px; /*@editable*/font-style:normal; 
/*@editable*/font-weight:bold; /*@editable*/line-height:125%; 
/*@editable*/letter-spacing:normal; /*@editable*/text-align:left; } /* @tab 
Preheader @section Preheader Style @tip Set the background color and borders 
for your email's preheader area. */ #templatePreheader{ 
/*@editable*/background-color:#fafafa; /*@editable*/background-image:none; 
/*@editable*/background-repeat:no-repeat; 
/*@editable*/background-position:center; /*@editable*/background-size:cover; 
/*@editable*/border-top:0; /*@editable*/border-bottom:0; 
/*@editable*/padding-top:9px; /*@editable*/padding-bottom:9px; } /* @tab 
Preheader @section Preheader Text @tip Set the styling for your email's 
preheader text. Choose a size and color that is easy to read. */ 
#templatePreheader .mcnTextContent,#templatePreheader .mcnTextContent p{ 
/*@editable*/color:#656565; /*@editable*/font-family:Helvetica; 
/*@editable*/font-size:12px; /*@editable*/line-height:150%; 
/*@editable*/text-align:left; } /* @tab Preheader @section Preheader Link @tip 
Set the styling for your email's preheader links. Choose a color that helps 
them stand out from your text. */ #templatePreheader .mcnTextContent 
a,#templatePreheader .mcnTextContent p a{ /*@editable*/color:#656565; 
/*@editable*/font-weight:normal; /*@editable*/text-decoration:underline; } /* 
@tab Header @section Header Style @tip Set the background color and borders for 
your email's header area. */ #templateHeader{ 
/*@editable*/background-color:#ff; /*@editable*/back

Re: [C PATCH] Fix ICE related to incomplete structures in C23 [PR114930,PR115502].

2024-06-19 Thread Jakub Jelinek
On Wed, Jun 19, 2024 at 09:26:00AM +0200, Martin Uecker wrote:
> Ok. Then should it, instead of
> 
> TYPE_CANONICAL (x)
> = build_qualified_type (TYPE_CANONICAL (t), TYPE_QUALS (x));
> 
> be
> 
> tree c = build_qualified_type (TYPE_CANONICAL (t), TYPE_QUALS (x));
> TYPE_CANONICAL (x) = TREE_CANONICAL (c);
> 
> in the patch below?

Ok, I've tried that, but that doesn't work, it ICEs on the
pr114574-2.c testcase.

What happens is that TYPE_CANONICAL (t) = t; is set by the caller
and the first loop first sees x == t, so only handles pointers, and then
moves to const struct S.  As TYPE_CANONICAL (t) is t, build_qualified_type
looks for an existing qualified type (and, not just that, it also moves
the found qualified type to TYPE_NEXT_VARIANT (t) to speed it up next
time!!), in this case returns x, so it then effectively does nothing,
TYPE_CANONICAL (x) = TYPE_CANONICAL (x);
and leaves the type TYPE_STRUCTURAL_EQUALITY_P, which is not what we want.
Dunno if it in theory could also find some type later in the
TYPE_NEXT_VARIANT chain and move it earlier.

So, if the new patch is to be used, we need to add some extra handling
for these problematic cases.
One is if c == x (which can happen solely if TYPE_CANONICAL (t) == t),
that is easy to handle, in that case we should make it the canonical type
of itself, so TYPE_CANONICAL (x) = x; rather than TYPE_CANONICAL (x) =
TYPE_CANONICAL (c).
And then there is the theoretical case that c is some type from
the TYPE_MAIN_VARIANT chain which we haven't processed yet.  And that
build_qualified_type moved it to the second position in the chain
even when we haven't processed that yet.  For that, I think we need
to first process that c and only then restart handling of x.
So, either we could:
  gcc_checking_assert (TYPE_MAIN_VARIANT (t) == t && !TYPE_QUALS (t));
  for (tree x = t; x; x = TYPE_NEXT_VARIANT (x))
{
  if (x != t && TYPE_STRUCTURAL_EQUALITY_P (x))
{
  if (!TYPE_QUALS (x))
TYPE_CANONICAL (x) = TYPE_CANONICAL (t);
  else
{
  tree
c = build_qualified_type (TYPE_CANONICAL (t), TYPE_QUALS (x));
  if (TYPE_STRUCTURAL_EQUALITY_P (c))
{
  gcc_checking_assert (TYPE_CANONICAL (t) == t);
  if (c == x)
TYPE_CANONICAL (x) = x;
  else
{
  /* build_qualified_type unhelpfully moved c from some
 later spot in TYPE_MAIN_VARIANT (t) chain to right
 after t.  Restart processing the whole chain.  */
  gcc_checking_assert (TYPE_MAIN_VARIANT (t) == c);
  x = t;
  continue;
}
}
  else
TYPE_CANONICAL (x) = TYPE_CANONICAL (c);
}
}
...
but that could walk a perhaps very long chain over and over (sure, for the
already handled cases it would see TYPE_STRUCTURAL_EQUALITY_P (x) is no
longer the case), but still, I'm afraid it increases compile time complexity
for pathological cases too much.
Or perhaps undo what get_qualified_type's
/* Put the found variant at the head of the variant list so
   frequently searched variants get found faster.  The C++ FE
   benefits greatly from this.  */
tree t = *tp;
*tp = TYPE_NEXT_VARIANT (t);
TYPE_NEXT_VARIANT (t) = TYPE_NEXT_VARIANT (mv);
TYPE_NEXT_VARIANT (mv) = t;
return t;
optimization or build_variant_type_copy clearly as well (it chains new types
to TYPE_NEXT_VARIANT (mv) as well).
So perhaps instead we need to undo the move.

Here is what I've bootstrapped/regtested and what broke pr114574-2.c:

gcc/c/
* c-decl.cc (c_update_type_canonical): Assert t is main variant
with 0 TYPE_QUALS.  Simplify and don't use check_qualified_type.
gcc/testsuite/
* gcc.dg/pr114930.c: New test.
* gcc.dg/pr115502.c: New test.

--- gcc/c/c-decl.cc.jj  2024-06-07 12:17:09.582969919 +0200
+++ gcc/c/c-decl.cc 2024-06-19 13:35:46.648956792 +0200
@@ -9367,18 +9367,19 @@ is_flexible_array_member_p (bool is_last
 static void
 c_update_type_canonical (tree t)
 {
-  for (tree x = TYPE_MAIN_VARIANT (t); x; x = TYPE_NEXT_VARIANT (x))
+  gcc_checking_assert (TYPE_MAIN_VARIANT (t) == t && !TYPE_QUALS (t));
+  for (tree x = t; x; x = TYPE_NEXT_VARIANT (x))
 {
   if (x != t && TYPE_STRUCTURAL_EQUALITY_P (x))
{
- if (TYPE_QUALS (x) == TYPE_QUALS (t))
+ if (!TYPE_QUALS (x))
TYPE_CANONICAL (x) = TYPE_CANONICAL (t);
- else if (TYPE_CANONICAL (t) != t
-  || check_qualified_type (x, t, TYPE_QUALS (x)))
-   TYPE_CANONICAL (x)
- = build_qualified_type (TYPE_CANONICAL (t), TYPE_QUALS (x));
  else
-   TYPE_CANONICAL (x) = x;
+   {
+ tree
+   c = build_qualified

Re: [PATCH] MIPS: Use Reg0 instead of const0_rtx for TRAP

2024-06-19 Thread Maciej W. Rozycki
On Wed, 19 Jun 2024, YunQiang Su wrote:

> MIPSr6 removes condition trap instructions with imm, so the instruction
> like `teq $2,imm` will be converted to
>   li $at, imm
>   teq $2, $at
> 
> The current version of Gas cannot detect if imm is zero, and output
>   teq $2, $0
> Let's do it in GCC.

 It seems like an output pattern issue with `*conditional_trap_reg' 
insn to me.

> diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc
> index 48924116937..ba1e6214656 100644
> --- a/gcc/config/mips/mips.cc
> +++ b/gcc/config/mips/mips.cc
> @@ -6026,7 +6026,7 @@ mips_expand_conditional_trap (rtx comparison)
>  
>emit_insn (gen_rtx_TRAP_IF (VOIDmode,
> gen_rtx_fmt_ee (code, mode, op0, op1),
> -   const0_rtx));
> +   gen_rtx_REG (mode, GP_REG_FIRST)));

 IOW this just papers over the actual issue.

 FWIW,

  Maciej


[PATCH] MIPS: Implement vcond_mask optabs for MSA

2024-06-19 Thread YunQiang Su
Currently, we have `mips_expand_vec_cond_expr`, which calculates
cmp_res first.  We can just add a new extra argument to ask it
to use operands[3] as cmp_res instead of calculating from operands[4]
and operands[5].

gcc
* config/mips/mips.cc (mips_expand_vec_cond_expr): Add extra
argument to indicate that operands[3] is cmp_res already.
* config/mips/mips-msa.md(vcond_mask): Define new expand.
(vcondu): Use mips_expand_vec_cond_expr with 4th argument.
(vcond): Ditto.
---
 gcc/config/mips/mips-msa.md   | 17 +++--
 gcc/config/mips/mips-protos.h |  2 +-
 gcc/config/mips/mips.cc   | 18 --
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/gcc/config/mips/mips-msa.md b/gcc/config/mips/mips-msa.md
index 779157f2a0c..0081b688ce9 100644
--- a/gcc/config/mips/mips-msa.md
+++ b/gcc/config/mips/mips-msa.md
@@ -411,6 +411,19 @@ (define_expand "vec_set"
   DONE;
 })
 
+(define_expand "vcond_mask_"
+  [(match_operand:MSA 0 "register_operand")
+   (match_operand:MSA 1 "reg_or_m1_operand")
+   (match_operand:MSA 2 "reg_or_0_operand")
+   (match_operand:IMSA 3 "register_operand")]
+  "ISA_HAS_MSA
+   && (GET_MODE_NUNITS (mode) == GET_MODE_NUNITS (mode))"
+{
+  mips_expand_vec_cond_expr (mode, mode, operands, true);
+  DONE;
+})
+
+
 (define_expand "vcondu"
   [(match_operand:MSA 0 "register_operand")
(match_operand:MSA 1 "reg_or_m1_operand")
@@ -421,7 +434,7 @@ (define_expand "vcondu"
   "ISA_HAS_MSA
&& (GET_MODE_NUNITS (mode) == GET_MODE_NUNITS (mode))"
 {
-  mips_expand_vec_cond_expr (mode, mode, operands);
+  mips_expand_vec_cond_expr (mode, mode, operands, 
false);
   DONE;
 })
 
@@ -435,7 +448,7 @@ (define_expand "vcond"
   "ISA_HAS_MSA
&& (GET_MODE_NUNITS (mode) == GET_MODE_NUNITS (mode))"
 {
-  mips_expand_vec_cond_expr (mode, mode, operands);
+  mips_expand_vec_cond_expr (mode, mode, operands, 
false);
   DONE;
 })
 
diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index fcc0a0ae663..75f80984c03 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -385,7 +385,7 @@ extern mulsidi3_gen_fn mips_mulsidi3_gen_fn (enum rtx_code);
 #endif
 
 extern void mips_register_frame_header_opt (void);
-extern void mips_expand_vec_cond_expr (machine_mode, machine_mode, rtx *);
+extern void mips_expand_vec_cond_expr (machine_mode, machine_mode, rtx *, 
bool);
 extern void mips_expand_vec_cmp_expr (rtx *);
 
 extern void mips_emit_speculation_barrier_function (void);
diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc
index b7acf041903..b1219385096 100644
--- a/gcc/config/mips/mips.cc
+++ b/gcc/config/mips/mips.cc
@@ -22777,14 +22777,20 @@ mips_expand_vec_cmp_expr (rtx *operands)
 
 void
 mips_expand_vec_cond_expr (machine_mode mode, machine_mode vimode,
-  rtx *operands)
+  rtx *operands, bool mask)
 {
-  rtx cond = operands[3];
-  rtx cmp_op0 = operands[4];
-  rtx cmp_op1 = operands[5];
-  rtx cmp_res = gen_reg_rtx (vimode);
+  rtx cmp_res;
+  if (mask)
+cmp_res = operands[3];
+  else
+{
+  rtx cond = operands[3];
+  rtx cmp_op0 = operands[4];
+  rtx cmp_op1 = operands[5];
+  cmp_res = gen_reg_rtx (vimode);
 
-  mips_expand_msa_cmp (cmp_res, GET_CODE (cond), cmp_op0, cmp_op1);
+  mips_expand_msa_cmp (cmp_res, GET_CODE (cond), cmp_op0, cmp_op1);
+}
 
   /* We handle the following cases:
  1) r = a CMP b ? -1 : 0
-- 
2.39.3 (Apple Git-146)



Re: [PATCH] xtensa: Eliminate double MEMW insertions for volatile memory

2024-06-19 Thread Max Filippov
On Tue, Jun 18, 2024 at 10:00 PM Takayuki 'January June' Suwa
 wrote:
>
> This patch avoids inserting a MEMW instruction before a load/store
> instruction with a volatile memory reference if there is already a MEMW
> immediately before it.
>
> gcc/ChangeLog:
>
> * config/xtensa/xtensa.cc (print_operand):
> When outputting MEMW before the instruction, check if the previous
> instruction is already that.
> ---
>   gcc/config/xtensa/xtensa.cc | 12 +++-
>   1 file changed, 11 insertions(+), 1 deletion(-)

Regtested for target=xtensa-linux-uclibc, no new regressions.
Committed to master.

-- 
Thanks.
-- Max


Re: [gcc r15-1436] build: Fix missing variable quotes

2024-06-19 Thread Maciej W. Rozycki
On Wed, 19 Jun 2024, YunQiang Su via Gcc-cvs wrote:

> diff --git a/configure b/configure
> index 51576a41f303..6e95b27d9df4 100755
> --- a/configure
> +++ b/configure
> @@ -8994,15 +8994,15 @@ if test "$ac_res" != no; then :
>  fi
>  
>  
> -if test $ac_cv_search_dlopen = -ldl; then
> +if test "$ac_cv_search_dlopen" = -ldl; then
>  CRAB1_LIBS="$CRAB1_LIBS -ldl"
> -elif test $ac_cv_search_dlopen = no; then
> +elif test "$ac_cv_search_dlopen" = no; then
>  missing_rust_dynlibs="libdl"
>  fi
>  
> -if test $ac_cv_search_pthread_create = -lpthread; then
> +if test "$ac_cv_search_pthread_create" = -lpthread; then
>  CRAB1_LIBS="$CRAB1_LIBS -lpthread"
> -elif test $ac_cv_search_pthread_crate = no; then
> +elif test "$ac_cv_search_pthread_crate" = no; then
>  missing_rust_dynlibs="$missing_rust_dynlibs, libpthread"
>  fi
>  

 There's still an unfixed typo here, it should have been corrected in the 
same commit since the lines have been changed anyway.

> @@ -19746,7 +19746,7 @@ config.status
>  configured by $0, generated by GNU Autoconf 2.69,
>with options \\"\$ac_cs_config\\"
>  
> -Copyright (C) 2012 Free Software Foundation, Inc.
> +Copyright (C)  Free Software Foundation, Inc.
>  This config.status script is free software; the Free Software Foundation
>  gives unlimited permission to copy, distribute and modify it."
>  

 There seems to be a tooling problem here.

> diff --git a/gcc/configure.ac b/gcc/configure.ac
> index b2243e9954aa..1501bf89c89d 100644
> --- a/gcc/configure.ac
> +++ b/gcc/configure.ac
> @@ -5317,7 +5317,7 @@ x:
>  
>  AC_MSG_CHECKING(assembler and linker for explicit JALR relocation)
>  gcc_cv_as_ld_jalr_reloc=no
> -if test $gcc_cv_as_mips_explicit_relocs = yes; then
> +if test "x$gcc_cv_as_mips_explicit_relocs" = "xyes"; then
>if test $in_tree_ld = yes ; then
>  if test "$gcc_cv_gld_major_version" -eq 2 -a 
> "$gcc_cv_gld_minor_version" -ge 20 -o "$gcc_cv_gld_major_version" -gt 2 \
> && test $in_tree_ld_is_elf = yes; then

 This should not have been bundled with the other change, because it is 
unrelated and more importantly clearly it has to be backported to the 
relevant release branches.

  Maciej


[PATCH] libstdc++: Fix std::to_array for trivial-ish types [PR115522]

2024-06-19 Thread Jonathan Wakely
Tested x86_64-linux. Not pushed yet. backports will be needed too.

-- >8 --

Due to PR c++/85723 the std::is_trivial trait is true for types with a
deleted default constructor, so the use of std::is_trivial in
std::to_array is not sufficient to ensure the type can be trivially
default constructed then filled using memcpy.

I also forgot that a type with a deleted assignment operator can still
be trivial, so we also need to check that it's assignable because the
is_constant_evaluated() path can't use memcpy.

Replace the uses of std::is_trivial with std::is_trivially_copyable
(needed for memcpy), std::is_trivially_default_constructible (needed so
that the default construction is valid and does no work) and
std::is_copy_assignable (needed for the constant evaluation case).

libstdc++-v3/ChangeLog:

PR libstdc++/115522
* include/std/array (to_array): Workaround the fact that
std::is_trivial is not sufficient to check that a type is
trivially default constructible and assignable.
* testsuite/23_containers/array/creation/115522.cc: New test.
---
 libstdc++-v3/include/std/array|  8 +++--
 .../23_containers/array/creation/115522.cc| 33 +++
 2 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/23_containers/array/creation/115522.cc

diff --git a/libstdc++-v3/include/std/array b/libstdc++-v3/include/std/array
index 39695471e24..8710bf75924 100644
--- a/libstdc++-v3/include/std/array
+++ b/libstdc++-v3/include/std/array
@@ -431,7 +431,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static_assert(is_constructible_v<_Tp, _Tp&>);
   if constexpr (is_constructible_v<_Tp, _Tp&>)
{
- if constexpr (is_trivial_v<_Tp>)
+ if constexpr (is_trivially_copyable_v<_Tp>
+ && is_trivially_default_constructible_v<_Tp>
+ && is_copy_assignable_v<_Tp>)
{
  array, _Nm> __arr;
  if (!__is_constant_evaluated() && _Nm != 0)
@@ -460,7 +462,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static_assert(is_move_constructible_v<_Tp>);
   if constexpr (is_move_constructible_v<_Tp>)
{
- if constexpr (is_trivial_v<_Tp>)
+ if constexpr (is_trivially_copyable_v<_Tp>
+ && is_trivially_default_constructible_v<_Tp>
+ && is_copy_assignable_v<_Tp>)
{
  array, _Nm> __arr;
  if (!__is_constant_evaluated() && _Nm != 0)
diff --git a/libstdc++-v3/testsuite/23_containers/array/creation/115522.cc 
b/libstdc++-v3/testsuite/23_containers/array/creation/115522.cc
new file mode 100644
index 000..37073e002bd
--- /dev/null
+++ b/libstdc++-v3/testsuite/23_containers/array/creation/115522.cc
@@ -0,0 +1,33 @@
+// { dg-do compile { target c++20 } }
+
+// PR libstdc++/115522 std::to_array no longer works for struct which is
+// trivial but not default constructible
+
+#include 
+
+void
+test_deleted_ctor()
+{
+  struct S
+  {
+S() = delete;
+S(int) { }
+  };
+
+  S arr[1] = {{1}};
+  auto arr1 = std::to_array(arr);
+  auto arr2 = std::to_array(std::move(arr));
+}
+
+void
+test_deleted_assignment()
+{
+  struct S
+  {
+void operator=(const S&) = delete;
+  };
+
+  S arr[1] = {};
+  auto a1 = std::to_array(arr);
+  auto a2 = std::to_array(std::move(arr));
+}
-- 
2.45.1



[committed] libstdc++: Consistently indent with tabs

2024-06-19 Thread Jonathan Wakely
Whitespace only. Tested x86_64-linux. Pushed to trunk.

-- >8 --

libstdc++-v3/ChangeLog:

* include/std/future: Adjust whitespace to use tabs for
indentation.
---
 libstdc++-v3/include/std/future | 328 
 1 file changed, 164 insertions(+), 164 deletions(-)

diff --git a/libstdc++-v3/include/std/future b/libstdc++-v3/include/std/future
index d7be205af50..6ce7d89ca3f 100644
--- a/libstdc++-v3/include/std/future
+++ b/libstdc++-v3/include/std/future
@@ -292,7 +292,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   {
using __allocator_type = __alloc_rebind<_Alloc, _Result_alloc>;
 
-explicit
+   explicit
_Result_alloc(const _Alloc& __a) : _Result<_Res>(), _Alloc(__a)
{ }
 
@@ -362,9 +362,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   }
 
   template
-future_status
-wait_for(const chrono::duration<_Rep, _Period>& __rel)
-{
+   future_status
+   wait_for(const chrono::duration<_Rep, _Period>& __rel)
+   {
  // First, check if the future has been made ready.  Use acquire MO
  // to synchronize with the thread that made it ready.
  if (_M_status._M_load(memory_order_acquire) == _Status::__ready)
@@ -396,9 +396,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
 
   template
-future_status
-wait_until(const chrono::time_point<_Clock, _Duration>& __abs)
-{
+   future_status
+   wait_until(const chrono::time_point<_Clock, _Duration>& __abs)
+   {
 #if __cplusplus > 201703L
  static_assert(chrono::is_clock_v<_Clock>);
 #endif
@@ -430,8 +430,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   _M_set_result(function<_Ptr_type()> __res, bool __ignore_failure = false)
   {
bool __did_set = false;
-// all calls to this function are serialized,
-// side-effects of invoking __res only happen once
+   // all calls to this function are serialized,
+   // side-effects of invoking __res only happen once
call_once(_M_once, &_State_baseV2::_M_do_set, this,
  std::__addressof(__res), std::__addressof(__did_set));
if (__did_set)
@@ -439,7 +439,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  _M_status._M_store_notify_all(_Status::__ready,
memory_order_release);
else if (!__ignore_failure)
-  __throw_future_error(int(future_errc::promise_already_satisfied));
+ __throw_future_error(int(future_errc::promise_already_satisfied));
   }
 
   // Provide a result to the shared state but delay making it ready
@@ -451,12 +451,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   {
bool __did_set = false;
unique_ptr<_Make_ready> __mr{new _Make_ready};
-// all calls to this function are serialized,
-// side-effects of invoking __res only happen once
+   // all calls to this function are serialized,
+   // side-effects of invoking __res only happen once
call_once(_M_once, &_State_baseV2::_M_do_set, this,
  std::__addressof(__res), std::__addressof(__did_set));
if (!__did_set)
-  __throw_future_error(int(future_errc::promise_already_satisfied));
+ __throw_future_error(int(future_errc::promise_already_satisfied));
__mr->_M_shared_state = std::move(__self);
__mr->_M_set();
__mr.release();
@@ -490,41 +490,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   }
 
   template
-struct _Setter;
+   struct _Setter;
 
   // set lvalues
   template
-struct _Setter<_Res, _Arg&>
-{
-  // check this is only used by promise::set_value(const R&)
-  // or promise::set_value(R&)
-  static_assert(is_same<_Res, _Arg&>::value  // promise
-  || is_same::value,   // promise
-  "Invalid specialisation");
+   struct _Setter<_Res, _Arg&>
+   {
+ // check this is only used by promise::set_value(const R&)
+ // or promise::set_value(R&)
+ static_assert(is_same<_Res, _Arg&>::value  // promise
+ || is_same::value,   // promise
+   "Invalid specialisation");
 
  // Used by std::promise to copy construct the result.
-  typename promise<_Res>::_Ptr_type operator()() const
-  {
-_M_promise->_M_storage->_M_set(*_M_arg);
-return std::move(_M_promise->_M_storage);
-  }
-  promise<_Res>*_M_promise;
-  _Arg* _M_arg;
-};
+ typename promise<_Res>::_Ptr_type operator()() const
+ {
+   _M_promise->_M_storage->_M_set(*_M_arg);
+   return std::move(_M_promise->_M_storage);
+ }
+ promise<_Res>*_M_promise;
+ _Arg* _M_arg;
+   };
 
   // set rvalues
   template
-struct _Setter<_Res, _Res&&>
-{
+   struct _Setter<_Res, _R

[committed] libstdc++: Add noexcept to some std::promise shared state internals

2024-06-19 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk.

-- >8 --

Making the state ready for a std::promise only needs to move a
unique_ptr, which cannot throw. Make its call operator noexcept.
Similarly, making the state ready by storing an exception_ptr also can't
throw, so make that call operator noexcept too.

libstdc++-v3/ChangeLog:

* include/std/future (_State_baseV2::_Setter): Add
noexcept to call operator.
(_State_baseV2::_Setter): Likewise.
---
 libstdc++-v3/include/std/future | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/std/future b/libstdc++-v3/include/std/future
index 9e75ae98b13..d7be205af50 100644
--- a/libstdc++-v3/include/std/future
+++ b/libstdc++-v3/include/std/future
@@ -532,7 +532,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
  static_assert(is_void<_Res>::value, "Only used for promise");
 
- typename promise<_Res>::_Ptr_type operator()() const
+ typename promise<_Res>::_Ptr_type operator()() const noexcept
  { return std::move(_M_promise->_M_storage); }
 
  promise<_Res>*_M_promise;
@@ -545,7 +545,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 struct _Setter<_Res, __exception_ptr_tag>
 {
  // Used by std::promise to store an exception as the result.
-  typename promise<_Res>::_Ptr_type operator()() const
+  typename promise<_Res>::_Ptr_type operator()() const noexcept
   {
 _M_promise->_M_storage->_M_error = *_M_ex;
 return std::move(_M_promise->_M_storage);
-- 
2.45.1



[committed] libstdc++: Add conditional noexcept to std::pair default ctor

2024-06-19 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk.

-- >8 --

Most of std::pair constructors implemented using C++20 concepts have a
conditional noexcept-specifier, but the default constructor doesn't.
This fixes that.

libstdc++-v3/ChangeLog:

* include/bits/stl_pair.h [__cpp_lib_concepts] (pair()): Add
conditional noexcept.
---
 libstdc++-v3/include/bits/stl_pair.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libstdc++-v3/include/bits/stl_pair.h 
b/libstdc++-v3/include/bits/stl_pair.h
index 0c1e5719a1a..0d60eaba194 100644
--- a/libstdc++-v3/include/bits/stl_pair.h
+++ b/libstdc++-v3/include/bits/stl_pair.h
@@ -344,6 +344,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   explicit(__not_<__and_<__is_implicitly_default_constructible<_T1>,
 __is_implicitly_default_constructible<_T2>>>())
   pair()
+  noexcept(is_nothrow_default_constructible_v<_T1>
+   && is_nothrow_default_constructible_v<_T2>)
   requires is_default_constructible_v<_T1>
   && is_default_constructible_v<_T2>
   : first(), second()
-- 
2.45.1



[PATCH] rs6000, altivec-1-runnable.c update the require-effective-target

2024-06-19 Thread Carl Love
GCC maintainers:

The dg options for this test should be the same as for altivec-2-runnable.c.  
This patch updates the dg options to match 
the settings in altivec-2-runnable.c.

The patch has been tested on Power 10 with no regression failures.

Please let me know if this patch is acceptable for mainline.  Thanks.

Carl 

From 289e15d215161ad45ae1aae7a5dedd2374737ec4
rs6000, altivec-1-runnable.c update the require-effective-target

The test requires a minimum of Power8 vector HW and a compile level
of -O2.

gcc/testsuite/ChangeLog:
* gcc.target/powerpc/altivec-1-runnable.c: Change the
require-effective-target for the test.
---
 gcc/testsuite/gcc.target/powerpc/altivec-1-runnable.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/powerpc/altivec-1-runnable.c 
b/gcc/testsuite/gcc.target/powerpc/altivec-1-runnable.c
index da8ebbc30ba..c113089c13a 100644
--- a/gcc/testsuite/gcc.target/powerpc/altivec-1-runnable.c
+++ b/gcc/testsuite/gcc.target/powerpc/altivec-1-runnable.c
@@ -1,6 +1,7 @@
-/* { dg-do compile { target powerpc*-*-* } } */
-/* { dg-require-effective-target powerpc_altivec_ok } */
-/* { dg-options "-maltivec" } */
+/* { dg-do run { target vsx_hw } } */
+/* { dg-do compile { target { ! vmx_hw } } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+/* { dg-require-effective-target powerpc_altivec } */
 
 #include 
 
-- 
2.45.0



Re: [PATCH ver3] rs6000, altivec-2-runnable.c update the require-effective-target

2024-06-19 Thread Carl Love
Everyone, Oops, this should be version 3 not 2.  Sorry.

  Carl 

On 6/19/24 09:13, Carl Love wrote:
> GCC maintainers:
> 
> version 2:  Updated per the feedback from Peter, Kewen and Segher.  Note, 
> Peter suggested the -mdejagnu-cpu= value must be power7.  
> The test fails if -mdejagnu-cpu= is set to power7, needs to be power8.  Patch 
> has been retested on a Power 10 box, it succeeds
> with 2 passes and no fails.
> 
> Per the additional feedback after patch: 
> 
>   commit c892525813c94b018464d5a4edc17f79186606b7
>   Author: Carl Love 
>   Date:   Tue Jun 11 14:01:16 2024 -0400
> 
>   rs6000, altivec-2-runnable.c should be a runnable test
> 
>   The test case has "dg-do compile" set not "dg-do run" for a runnable
>   test.  This patch changes the dg-do command argument to run.
> 
>   gcc/testsuite/ChangeLog:
>   * gcc.target/powerpc/altivec-2-runnable.c: Change dg-do
>   argument to run.
> 
> was approved and committed, I have updated the dg-require-effective-target
> and dg-options as requested so the test will compile with -O2 on a 
> machine that has a minimum support of Power 8 vector hardware.
> 
> The patch has been tested on Power 10 with no regression failures.
> 
> Please let me know if this patch is acceptable for mainline.  Thanks.
> 
> Carl 
> 
> 
> rs6000, altivec-2-runnable.c update the require-effective-target
> 
> The test requires a minimum of Power8 vector HW and a compile level
> of -O2.
> 
> gcc/testsuite/ChangeLog:
>   * gcc.target/powerpc/altivec-2-runnable.c: Change the
>   require-effective-target for the test.
> ---
>  gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c 
> b/gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c
> index 17b23eb9d50..9e7ef89327b 100644
> --- a/gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c
> +++ b/gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c
> @@ -1,7 +1,7 @@
> -/* { dg-do run } */
> -/* { dg-options "-mvsx" } */
> -/* { dg-additional-options "-mdejagnu-cpu=power8" { target { ! has_arch_pwr8 
> } } } */
> -/* { dg-require-effective-target powerpc_vsx } */
> +/* { dg-do run { target vsx_hw } } */
> +/* { dg-do compile { target { ! vmx_hw } } } */
> +/* { dg-options "-O2  -mdejagnu-cpu=power8" } */
> +/* { dg-require-effective-target powerpc_altivec } */
>  
>  #include 
>  


[PATCH ver2] rs6000, altivec-2-runnable.c update the require-effective-target

2024-06-19 Thread Carl Love
GCC maintainers:

version 2:  Updated per the feedback from Peter, Kewen and Segher.  Note, Peter 
suggested the -mdejagnu-cpu= value must be power7.  
The test fails if -mdejagnu-cpu= is set to power7, needs to be power8.  Patch 
has been retested on a Power 10 box, it succeeds
with 2 passes and no fails.

Per the additional feedback after patch: 

  commit c892525813c94b018464d5a4edc17f79186606b7
  Author: Carl Love 
  Date:   Tue Jun 11 14:01:16 2024 -0400

  rs6000, altivec-2-runnable.c should be a runnable test

  The test case has "dg-do compile" set not "dg-do run" for a runnable
  test.  This patch changes the dg-do command argument to run.

  gcc/testsuite/ChangeLog:
  * gcc.target/powerpc/altivec-2-runnable.c: Change dg-do
  argument to run.

was approved and committed, I have updated the dg-require-effective-target
and dg-options as requested so the test will compile with -O2 on a 
machine that has a minimum support of Power 8 vector hardware.

The patch has been tested on Power 10 with no regression failures.

Please let me know if this patch is acceptable for mainline.  Thanks.

Carl 


rs6000, altivec-2-runnable.c update the require-effective-target

The test requires a minimum of Power8 vector HW and a compile level
of -O2.

gcc/testsuite/ChangeLog:
* gcc.target/powerpc/altivec-2-runnable.c: Change the
require-effective-target for the test.
---
 gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c 
b/gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c
index 17b23eb9d50..9e7ef89327b 100644
--- a/gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c
+++ b/gcc/testsuite/gcc.target/powerpc/altivec-2-runnable.c
@@ -1,7 +1,7 @@
-/* { dg-do run } */
-/* { dg-options "-mvsx" } */
-/* { dg-additional-options "-mdejagnu-cpu=power8" { target { ! has_arch_pwr8 } 
} } */
-/* { dg-require-effective-target powerpc_vsx } */
+/* { dg-do run { target vsx_hw } } */
+/* { dg-do compile { target { ! vmx_hw } } } */
+/* { dg-options "-O2  -mdejagnu-cpu=power8" } */
+/* { dg-require-effective-target powerpc_altivec } */
 
 #include 
 
-- 
2.45.0



[PATCH] MIPS: Use Reg0 instead of const0_rtx for TRAP

2024-06-19 Thread YunQiang Su
MIPSr6 removes condition trap instructions with imm, so the instruction
like `teq $2,imm` will be converted to
  li $at, imm
  teq $2, $at

The current version of Gas cannot detect if imm is zero, and output
  teq $2, $0
Let's do it in GCC.

gcc
* config/mips/mips.cc (mips_expand_conditional_trap): Use Reg0
instead of const0_rtx.
---
 gcc/config/mips/mips.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc
index 48924116937..ba1e6214656 100644
--- a/gcc/config/mips/mips.cc
+++ b/gcc/config/mips/mips.cc
@@ -6026,7 +6026,7 @@ mips_expand_conditional_trap (rtx comparison)
 
   emit_insn (gen_rtx_TRAP_IF (VOIDmode,
  gen_rtx_fmt_ee (code, mode, op0, op1),
- const0_rtx));
+ gen_rtx_REG (mode, GP_REG_FIRST)));
 }
 
 /* Initialize *CUM for a call to a function of type FNTYPE.  */
-- 
2.39.3 (Apple Git-146)



Re: [PATCH v2] ARM: thumb1: Use LDMIA/STMIA for DI/DF loads/stores

2024-06-19 Thread Richard Earnshaw (lists)
On 19/06/2024 16:11, Siarhei Volkau wrote:
> ср, 19 июн. 2024 г. в 15:19, Richard Earnshaw (lists)
> :
>>
>> On 18/06/2024 19:14, Siarhei Volkau wrote:
>>> If the address register is dead after load/store operation it looks
>>> beneficial to use LDMIA/STMIA instead of pair of LDR/STR instructions,
>>> at least if optimizing for size.
>>>
>>> Changes v1 -> v2:
>>>  - switching to peephole2 approach
>>>  - added test case
>>>
>>> gcc/ChangeLog:
>>>
>>> * config/arm/thumb1.md (peephole2 to rewrite DI/DF load): New.
>>> (peephole2 to rewrite DI/DF store): New.
>>> (thumb1_movdi_insn): Handle overlapped regs ldmia case.
>>> (thumb_movdf_insn): Likewise.
>>>
>>>   * config/arm/iterators.md (DIDF): New.
>>>
>>> gcc/testsuite:
>>>
>>> * gcc.target/arm/thumb1-load-store-64bit.c: Add new test.
>>>
>>> Signed-off-by: Siarhei Volkau 
>>> ---
>>>  gcc/config/arm/iterators.md   |  3 +++
>>>  gcc/config/arm/thumb1.md  | 27 ++-
>>>  .../gcc.target/arm/thumb1-load-store-64bit.c  | 16 +++
>>>  3 files changed, 45 insertions(+), 1 deletion(-)
>>>  create mode 100644 gcc/testsuite/gcc.target/arm/thumb1-load-store-64bit.c
>>>
>>> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
>>> index 8d066fcf05d..09046bff83b 100644
>>> --- a/gcc/config/arm/iterators.md
>>> +++ b/gcc/config/arm/iterators.md
>>> @@ -50,6 +50,9 @@ (define_mode_iterator QHSD [QI HI SI DI])
>>>  ;; A list of the 32bit and 64bit integer modes
>>>  (define_mode_iterator SIDI [SI DI])
>>>
>>> +;; A list of the 64bit modes for thumb1.
>>> +(define_mode_iterator DIDF [DI DF])
>>> +
>>>  ;; A list of atomic compare and swap success return modes
>>>  (define_mode_iterator CCSI [(CC_Z "TARGET_32BIT") (SI "TARGET_THUMB1")])
>>>
>>> diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
>>> index d7074b43f60..ed4b706773a 100644
>>> --- a/gcc/config/arm/thumb1.md
>>> +++ b/gcc/config/arm/thumb1.md
>>> @@ -633,6 +633,8 @@ (define_insn "*thumb1_movdi_insn"
>>>gcc_assert (TARGET_HAVE_MOVT);
>>>return \"movw\\t%Q0, %L1\;movs\\tR0, #0\";
>>>  case 4:
>>> +  if (reg_overlap_mentioned_p (operands[0], operands[1]))
>>> + return \"ldmia\\t%m1, {%0, %H0}\";
>>
>> See below for why I don't think this is a case we need to consider here.
>>
>>>return \"ldmia\\t%1, {%0, %H0}\";
>>>  case 5:
>>>return \"stmia\\t%0, {%1, %H1}\";
>>> @@ -966,6 +968,8 @@ (define_insn "*thumb_movdf_insn"
>>>   return \"adds\\t%0, %1, #0\;adds\\t%H0, %H1, #0\";
>>>return \"adds\\t%H0, %H1, #0\;adds\\t%0, %1, #0\";
>>>  case 1:
>>> +  if (reg_overlap_mentioned_p (operands[0], operands[1]))
>>> + return \"ldmia\\t%m1, {%0, %H0}\";
>>>return \"ldmia\\t%1, {%0, %H0}\";
>>>  case 2:
>>>return \"stmia\\t%0, {%1, %H1}\";
>>> @@ -2055,4 +2059,25 @@ (define_insn "thumb1_stack_protect_test_insn"
>>> (set_attr "conds" "clob")
>>> (set_attr "type" "multiple")]
>>>  )
>>> -
>>> +
>>> +;; match patterns usable by ldmia/stmia
>>> +(define_peephole2
>>> +  [(set (match_operand:DIDF 0 "low_register_operand" "")
>>> + (mem:DIDF (match_operand:SI 1 "low_register_operand")))]
>>> +  "TARGET_THUMB1
>>> +   && (peep2_reg_dead_p (1, operands[1])
>>> +   || REGNO (operands[0]) + 1 == REGNO (operands[1]))"
>>
>> I don't understand this second condition (partial overlap of the base 
>> address with the value loaded), what are you guarding against here?  The 
>> instruction specification says that there is no writeback if the base 
>> register has any overlap with any of the loaded registers, so really we 
>> should reject the peephole if that's true (we'd have invalid RTL as well as 
>> there would end up being two writes to the same register).
>>
> 
> The first condition here is for ldmia cases with writeback, the second
> condition is for cases without writeback. I falsely thought that base
> register has to be the last register in the list similarly like for
> stmia it has to be first.
> So, I have to reject the no-writeback case there because it emits
> incorrect RTL, this is clear now.
> 
> However, to not miss possible optimization, no-writeback cases check
> might be moved to instruction itself as it was done in v1 patch
> (incorrect instruction length will appear again then).
> Any objections on that?

Yes, I think the no-writeback case will need to be handled inside the code that 
prints the instruction (like you were doing originally).  We'll just have to 
forgo the knowledge that this case only needs 2 bytes for the instruction here 
(it's not a major loss).

> 
>>> +  [(set (match_dup 0)
>>> + (mem:DIDF (post_inc:SI (match_dup 1]
>>> +  ""
>>
>> This is not enough, unfortunately.  MEM() objects carry attributes about the 
>> memory accessed (alias sets, known alignment, etc) and these will not be 
>> propagated correctly if you rewrite the pattern this way.  The correct 
>> 

Re: [RFC/RFA] [PATCH 08/12] Add a new pass for naive CRC loops detection

2024-06-19 Thread Mariam Arutunian
On Sun, Jun 9, 2024 at 2:00 AM Jeff Law  wrote:

>
>
> On 5/29/24 5:12 AM, Mariam Arutunian wrote:
>
> >
> > IIRC we looked at the problem of canonicalizing the loop into a form
> > where we didn't necessarily have conditional blocks, instead we had
> > branchless sequences for the conditional xor and dealing with the
> high
> > bit in the crc.  My recollection was that the coremark CRC loop would
> > always canonicalize, but that in general we still saw multiple CRC
> > implementations that did not canonicalize and thus we still needed
> the
> > more complex matching.  Correct?
> >
> >
> > The loop in CoreMark is not fully canonicalized in that form,
> > as there are still branches present for the conditional XOR operation.
> > I checked that using the -O2 and -O3 flags.
> A bit of a surprise.  Though it may be the case that some of the
> canonicalization steps are happening later in the pipeline.  No worries
> as I think we'd already concluded that we'd see at least some CRC
> implementations that wouldn't canonicalize down to branchless sequences
> for the conditional xor.
>

Sorry, I had checked incorrectly. I had checked my added GCC test
(crc-5.c), where I call both the optimized and non-optimized versions, so I
mistakenly checked the non-optimized function.
But, yes, in my pass, the function isn't canonicalized; it is canonicalized
later.


>
>
> >
> >
> >  > +
> >  > +gimple *
> >  > +crc_optimization::find_shift_after_xor (tree xored_crc)
> >  > +{
> >  > +  imm_use_iterator imm_iter;
> >  > +  use_operand_p use_p;
> >  > +
> >  > +  if (TREE_CODE (xored_crc) != SSA_NAME)
> >  > +return nullptr;
> > If we always expect XORED_CRC to be an SSA_NAME, we might be able to
> use
> > gcc_assert TREE_CODE (XORED_CRC) == SSA_NAME);
> >
> > I'm not sure that it always has to be an SSA_NAME.
> For a logical operation like XOR it should always have the form
>
> SSA_NAME = SSA_NAME ^ (SSA_NAME | CONSTANT)
>
> The constant might be a vector  constant, but the basic form won't
> change.  It's one of the nicer properties of gimple.  In contrast RTL
> would allow a variety of lvalues and rvalues, including MEMs, REGs,
> SUBREGs, extensions, other binary ops, etc etc.
>

Ok. Thanks for the explanation.


>
>
> >  > +
> >  > +/* Set M_PHI_FOR_CRC and M_PHI_FOR_DATA fields.
> >  > +   Returns false if there are more than two (as in CRC
> > calculation only CRC's
> >  > +   and data's phi may exist) or no phi statements in STMTS (at
> > least there must
> >  > +   be CRC's phi).
> >  > +   Otherwise, returns true.  */
> >  > +
> >  > +bool
> >  > +crc_optimization::set_crc_and_data_phi (auto_vec
> &stmts)
> >  > +{
> >  > +  for (auto stmt_it = stmts.begin (); stmt_it != stmts.end ();
> > stmt_it++)
> >  > +{
> >  > +  if (is_a (*stmt_it) && bb_loop_header_p (gimple_bb
> > (*stmt_it)))
> >  > + {
> >  > +   if (!m_phi_for_crc)
> >  > + m_phi_for_crc = as_a (*stmt_it);
> >  > +   else if (!m_phi_for_data)
> >  > + m_phi_for_data = as_a (*stmt_it);
> >  > +   else
> >  > + {
> >  > +   if (dump_file && (dump_flags & TDF_DETAILS))
> >  > + fprintf (dump_file, "Xor-ed variable depends on
> > more than 2 "
> >  > + "phis.\n");
> >  > +   return false;
> >  > + }
> >  > + }
> >  > +}
> >  > +  return m_phi_for_crc;
> > Hmm.  For a given PHI, how do we know if it's for the data item or
> the
> > crc item, or something else (like a loop counter) entirely?
> >
> >
> >
> > I trace the def-use chain upwards from the XOR statement to determine
> > which PHI node corresponds to CRC and data.
> > Since we assume the loop calculates CRC, I expect only variables
> > representing data and CRC to participate in these operations.
> > In the implementations I support, the loop counter is used only for the
> > iteration.
> > Any misidentification of CRC and data would occur only if the loop
> > doesn't calculate CRC, in which case next checks would fail, leading the
> > algorithm to identify it as not CRC.
> >
> > Here, the PHI nodes for CRC and data might be mixed in places.
> > I just assume that the first found PHI is CRC, second data.
> > I correctly determine them later with the |
> > *swap_crc_and_data_if_needed*| function.
> Ah, OK.  That probably deserves a comment in this code.
>
>
Ok. I'll add a comment.


Thanks,
Mariam


>
> jeff
>


Re: [RFC/RFA] [PATCH 06/12] aarch64: Implement new expander for efficient CRC computation

2024-06-19 Thread Mariam Arutunian
On Sat, Jun 8, 2024 at 3:41 PM Richard Sandiford 
wrote:

> Mariam Arutunian  writes:
> > This patch introduces two new expanders for the aarch64 backend,
> > dedicated to generate optimized code for CRC computations.
> > The new expanders are designed to leverage specific hardware capabilities
> > to achieve faster CRC calculations,
> > particularly using the pmul or crc32 instructions when supported by the
> > target architecture.
>
> Thanks for porting this to aarch64!
>
> > Expander 1: Bit-Forward CRC (crc4)
> > For targets that support pmul instruction (TARGET_AES),
> > the expander will generate code that uses the pmul (crypto_pmulldi)
> > instruction for CRC computation.
> >
> > Expander 2: Bit-Reversed CRC (crc_rev4)
> > The expander first checks if the target supports the CRC32 instruction
> set
> > (TARGET_CRC32)
> > and the polynomial in use is 0x1EDC6F41 (iSCSI). If the conditions are
> met,
> > it emits calls to the corresponding crc32 instruction (crc32b, crc32h,
> > crc32w, or crc32x depending on the data size).
> > If the target does not support crc32 but supports pmul, it then uses the
> > pmul (crypto_pmulldi) instruction for bit-reversed CRC computation.
> >
> > Otherwise table-based CRC is generated.
> >
> >   gcc/config/aarch64/
> >
> > * aarch64-protos.h (aarch64_expand_crc_using_clmul): New extern
> > function declaration.
> > (aarch64_expand_reversed_crc_using_clmul):  Likewise.
> > * aarch64.cc (aarch64_expand_crc_using_clmul): New function.
> > (aarch64_expand_reversed_crc_using_clmul):  Likewise.
> > * aarch64.md (UNSPEC_CRC, UNSPEC_CRC_REV):  New unspecs.
> > (crc_rev4): New expander for reversed CRC.
> > (crc4): New expander for reversed CRC.
> > * iterators.md (crc_data_type): New mode attribute.
> >
> >   gcc/testsuite/gcc.target/aarch64/
> >
> > * crc-1-pmul.c: Likewise.
> > * crc-10-pmul.c: Likewise.
> > * crc-12-pmul.c: Likewise.
> > * crc-13-pmul.c: Likewise.
> > * crc-14-pmul.c: Likewise.
> > * crc-17-pmul.c: Likewise.
> > * crc-18-pmul.c: Likewise.
> > * crc-21-pmul.c: Likewise.
> > * crc-22-pmul.c: Likewise.
> > * crc-23-pmul.c: Likewise.
> > * crc-4-pmul.c: Likewise.
> > * crc-5-pmul.c: Likewise.
> > * crc-6-pmul.c: Likewise.
> > * crc-7-pmul.c: Likewise.
> > * crc-8-pmul.c: Likewise.
> > * crc-9-pmul.c: Likewise.
> > * crc-CCIT-data16-pmul.c: Likewise.
> > * crc-CCIT-data8-pmul.c: Likewise.
> > * crc-coremark-16bitdata-pmul.c: Likewise.
> > * crc-crc32-data16.c: New test.
> > * crc-crc32-data32.c: Likewise.
> > * crc-crc32-data8.c: Likewise.
> >
> > Signed-off-by: Mariam Arutunian  > diff --git a/gcc/config/aarch64/aarch64-protos.h
> b/gcc/config/aarch64/aarch64-protos.h
> > index 1d3f94c813e..167e1140f0d 100644
> > --- a/gcc/config/aarch64/aarch64-protos.h
> > +++ b/gcc/config/aarch64/aarch64-protos.h
> > @@ -1117,5 +1117,8 @@ extern void mingw_pe_encode_section_info (tree,
> rtx, int);
> >
> >  bool aarch64_optimize_mode_switching (aarch64_mode_entity);
> >  void aarch64_restore_za (rtx);
> > +void aarch64_expand_crc_using_clmul (rtx *);
> > +void aarch64_expand_reversed_crc_using_clmul (rtx *);
> > +
> >
> >  #endif /* GCC_AARCH64_PROTOS_H */
> > diff --git a/gcc/config/aarch64/aarch64.cc
> b/gcc/config/aarch64/aarch64.cc
> > index ee12d8897a8..05cd0296d38 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -30265,6 +30265,135 @@ aarch64_retrieve_sysreg (const char *regname,
> bool write_p, bool is128op)
> >return sysreg->encoding;
> >  }
> >
> > +/* Generate assembly to calculate CRC
> > +   using carry-less multiplication instruction.
> > +   OPERANDS[1] is input CRC,
> > +   OPERANDS[2] is data (message),
> > +   OPERANDS[3] is the polynomial without the leading 1.  */
> > +
> > +void
> > +aarch64_expand_crc_using_clmul (rtx *operands)
>
> This should probably be pmul rather than clmul.
>
> > +{
> > +  /* Check and keep arguments.  */
> > +  gcc_assert (!CONST_INT_P (operands[0]));
> > +  gcc_assert (CONST_INT_P (operands[3]));
> > +  rtx crc = operands[1];
> > +  rtx data = operands[2];
> > +  rtx polynomial = operands[3];
> > +
> > +  unsigned HOST_WIDE_INT
> > +  crc_size = GET_MODE_BITSIZE (GET_MODE (operands[0])).to_constant
> ();
> > +  gcc_assert (crc_size <= 32);
> > +  unsigned HOST_WIDE_INT
> > +  data_size = GET_MODE_BITSIZE (GET_MODE (data)).to_constant ();
>
> We could instead make the interface:
>
> void
> aarch64_expand_crc_using_pmul (scalar_mode crc_mode, scalar_mode data_mode,
>rtx *operands)
>
> so that the lines above don't need the to_constant.  This should "just
> work" on the .md file side, since the modes being passed are naturally
> scalar_mode.
>
> I think it'd be worth asserting also that data_size <= crc_size.
> (Although we could handle any MAX (data_size, crc_size) <= 32
> with some adjustment.)
>
> > +
> > +  /* Calculate the qu

Re: [PATCH v2] ARM: thumb1: Use LDMIA/STMIA for DI/DF loads/stores

2024-06-19 Thread Siarhei Volkau
ср, 19 июн. 2024 г. в 15:19, Richard Earnshaw (lists)
:
>
> On 18/06/2024 19:14, Siarhei Volkau wrote:
> > If the address register is dead after load/store operation it looks
> > beneficial to use LDMIA/STMIA instead of pair of LDR/STR instructions,
> > at least if optimizing for size.
> >
> > Changes v1 -> v2:
> >  - switching to peephole2 approach
> >  - added test case
> >
> > gcc/ChangeLog:
> >
> > * config/arm/thumb1.md (peephole2 to rewrite DI/DF load): New.
> > (peephole2 to rewrite DI/DF store): New.
> > (thumb1_movdi_insn): Handle overlapped regs ldmia case.
> > (thumb_movdf_insn): Likewise.
> >
> >   * config/arm/iterators.md (DIDF): New.
> >
> > gcc/testsuite:
> >
> > * gcc.target/arm/thumb1-load-store-64bit.c: Add new test.
> >
> > Signed-off-by: Siarhei Volkau 
> > ---
> >  gcc/config/arm/iterators.md   |  3 +++
> >  gcc/config/arm/thumb1.md  | 27 ++-
> >  .../gcc.target/arm/thumb1-load-store-64bit.c  | 16 +++
> >  3 files changed, 45 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/arm/thumb1-load-store-64bit.c
> >
> > diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> > index 8d066fcf05d..09046bff83b 100644
> > --- a/gcc/config/arm/iterators.md
> > +++ b/gcc/config/arm/iterators.md
> > @@ -50,6 +50,9 @@ (define_mode_iterator QHSD [QI HI SI DI])
> >  ;; A list of the 32bit and 64bit integer modes
> >  (define_mode_iterator SIDI [SI DI])
> >
> > +;; A list of the 64bit modes for thumb1.
> > +(define_mode_iterator DIDF [DI DF])
> > +
> >  ;; A list of atomic compare and swap success return modes
> >  (define_mode_iterator CCSI [(CC_Z "TARGET_32BIT") (SI "TARGET_THUMB1")])
> >
> > diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
> > index d7074b43f60..ed4b706773a 100644
> > --- a/gcc/config/arm/thumb1.md
> > +++ b/gcc/config/arm/thumb1.md
> > @@ -633,6 +633,8 @@ (define_insn "*thumb1_movdi_insn"
> >gcc_assert (TARGET_HAVE_MOVT);
> >return \"movw\\t%Q0, %L1\;movs\\tR0, #0\";
> >  case 4:
> > +  if (reg_overlap_mentioned_p (operands[0], operands[1]))
> > + return \"ldmia\\t%m1, {%0, %H0}\";
>
> See below for why I don't think this is a case we need to consider here.
>
> >return \"ldmia\\t%1, {%0, %H0}\";
> >  case 5:
> >return \"stmia\\t%0, {%1, %H1}\";
> > @@ -966,6 +968,8 @@ (define_insn "*thumb_movdf_insn"
> >   return \"adds\\t%0, %1, #0\;adds\\t%H0, %H1, #0\";
> >return \"adds\\t%H0, %H1, #0\;adds\\t%0, %1, #0\";
> >  case 1:
> > +  if (reg_overlap_mentioned_p (operands[0], operands[1]))
> > + return \"ldmia\\t%m1, {%0, %H0}\";
> >return \"ldmia\\t%1, {%0, %H0}\";
> >  case 2:
> >return \"stmia\\t%0, {%1, %H1}\";
> > @@ -2055,4 +2059,25 @@ (define_insn "thumb1_stack_protect_test_insn"
> > (set_attr "conds" "clob")
> > (set_attr "type" "multiple")]
> >  )
> > -
> > +
> > +;; match patterns usable by ldmia/stmia
> > +(define_peephole2
> > +  [(set (match_operand:DIDF 0 "low_register_operand" "")
> > + (mem:DIDF (match_operand:SI 1 "low_register_operand")))]
> > +  "TARGET_THUMB1
> > +   && (peep2_reg_dead_p (1, operands[1])
> > +   || REGNO (operands[0]) + 1 == REGNO (operands[1]))"
>
> I don't understand this second condition (partial overlap of the base address 
> with the value loaded), what are you guarding against here?  The instruction 
> specification says that there is no writeback if the base register has any 
> overlap with any of the loaded registers, so really we should reject the 
> peephole if that's true (we'd have invalid RTL as well as there would end up 
> being two writes to the same register).
>

The first condition here is for ldmia cases with writeback, the second
condition is for cases without writeback. I falsely thought that base
register has to be the last register in the list similarly like for
stmia it has to be first.
So, I have to reject the no-writeback case there because it emits
incorrect RTL, this is clear now.

However, to not miss possible optimization, no-writeback cases check
might be moved to instruction itself as it was done in v1 patch
(incorrect instruction length will appear again then).
Any objections on that?

> > +  [(set (match_dup 0)
> > + (mem:DIDF (post_inc:SI (match_dup 1]
> > +  ""
>
> This is not enough, unfortunately.  MEM() objects carry attributes about the 
> memory accessed (alias sets, known alignment, etc) and these will not be 
> propagated correctly if you rewrite the pattern this way.  The correct 
> solution is to match the entire mem as operand1, then use change_address to 
> rewrite that.  Something like:
>
>  operands[1] = change_address (operands[1], VOIDmode,
>gen_rtx_POST_INC (SImode,
>  XEXP (operands[1], 0)));
>

Got it, will rewrite, thanks.

> > +)
>

RE: [PATCH][ivopts]: use affine_tree when comparing IVs during candidate selection [PR114932]

2024-06-19 Thread Tamar Christina
> -Original Message-
> From: Michael Matz 
> Sent: Wednesday, June 19, 2024 3:46 PM
> To: Tamar Christina 
> Cc: Richard Biener ; gcc-patches@gcc.gnu.org; nd
> ; bin.ch...@linux.alibaba.com
> Subject: RE: [PATCH][ivopts]: use affine_tree when comparing IVs during 
> candidate
> selection [PR114932]
> 
> Hello,
> 
> On Wed, 19 Jun 2024, Tamar Christina wrote:
> 
> > So this is where we compare different IV expressions to determine which
> > IVs compute the same thing and thus can be in the same group.
> >
> > The STRIP_NOPS don't work because while the incoming types are the same
> > the casts are different.  So:
> >
> > >>> p debug (ustep)
> > (unsigned long) stride.3_27 * 4
> > $3 = void
> > >>> p debug (cstep)
> > (unsigned long) (stride.3_27 * 4)
> > $4 = void
> >
> > Which is of course stripped to:
> >
> > >>> p debug (top)
> > (unsigned long) stride.3_27 * 4
> > $1 = void
> > >>> p debug (bot)
> > stride.3_27 * 4
> >
> > Both of these compute the same thing
> 
> In isolation these are _not_ computing the same when strides type is
> smaller than ulong, namely when stride is either negative or larger than
> its max-value/4.  I.e. when comparing IVs not only the overflow behaviour
> for the whole {base,+,step} revolution matters, but also the behaviour on
> the constituent expressions.  (It's possible that stride is known to be
> non-problematic here, I haven't checked.  I was just triggered by the
> claim of same-ness :) )

The only use of this method is to determine whether the two expressions
can possibly be the same.  After this IVopts forcibly converts them to an
unsigned type through an affine fold in get_computation_aff_1.

So in the end it doesn't care about the sign and uses them all as unsigned.

Tamar
> 
> 
> Ciao,
> Michael.


Re: [PATCH] [testsuite] [arm] [vect] adjust mve-vshr test [PR113281]

2024-06-19 Thread Richard Earnshaw (lists)
On 13/06/2024 10:23, Alexandre Oliva wrote:
> 
> The test was too optimistic, alas.  We used to vectorize shifts
> involving 8-bit and 16-bit integral types by clamping the shift count
> at the highest in-range shift count, but that was not correct: such
> narrow shifts expect integral promotion, so larger shift counts should
> be accepted.  (int16_t)32768 >> (int16_t)16 must yield 0, not 1 (as
> before the fix).
> 
> Unfortunately, in the gimple model of vector units, such large shift
> counts wouldn't be well-defined, so we won't vectorize such shifts any
> more, unless we can tell they're in range or undefined.
> 
> So the test that expected the incorrect clamping we no longer perform
> needs to be adjusted.
> 
> Tested on x86_64-linux-gnu-x-arm-eabi.  Also tested with gcc-13
> x-arm-vx7r2.  Ok to install?
> 
> 
> for  gcc/testsuite/ChangeLog
> 
>   PR tree-optimization/113281
>   * gcc.target/arm/simd/mve-vshr.c: Adjust expectations.
> ---
>  gcc/testsuite/gcc.target/arm/simd/mve-vshr.c |6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c 
> b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> index 8c7adef9ed8f1..8253427db6ef6 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vshr.c
> @@ -56,9 +56,9 @@ FUNC_IMM(u, uint, 8, 16, >>, vshrimm)
>  /* MVE has only 128-bit vectors, so we can vectorize only half of the
> functions above.  */
>  /* Vector right shifts use vneg and left shifts.  */
> -/* { dg-final { scan-assembler-times {vshl.s[0-9]+\tq[0-9]+, q[0-9]+} 3 } } 
> */
> -/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 3 } } 
> */
> -/* { dg-final { scan-assembler-times {vneg.s[0-9]+\tq[0-9]+, q[0-9]+} 6 } } 
> */
> +/* { dg-final { scan-assembler-times {vshl.s[0-9]+\tq[0-9]+, q[0-9]+} 1 } } 
> */
> +/* { dg-final { scan-assembler-times {vshl.u[0-9]+\tq[0-9]+, q[0-9]+} 1 } } 
> */
> +/* { dg-final { scan-assembler-times {vneg.s[0-9]+\tq[0-9]+, q[0-9]+} 2 } } 
> */
>  
>  
>  /* Shift by immediate.  */
> 
> 

We know the range of the LHS of the shift operand (it comes from a T* array), so 
that isn't the issue.  The problem comes from the RHS of the shift where the 
range can legitimately come from 0..31 (since the shift is conceptually 
performed at int precision).  That can't be handled correctly as gimple only 
supports shifts on the number of bits in the type, itself.  But we can inform 
the compiler that it needn't care about the larger range with a 
__builtin_unreachable().

It looks like adding

  if ((unsigned)b[i] >= 8*sizeof (TYPE##BITS##_t)) \
__builtin_unreachable();   \
 
to the shift-by-variable case is enough to tell the vectorizer that it's safe 
to vectorize this code without needing to handle any additional clamping.

Since this test is primarily about testing the MVE vector operations, I think 
I'd rather go with a solution along those lines rather than nobbling the test.

Thoughts?

R.


RE: [PATCH][ivopts]: use affine_tree when comparing IVs during candidate selection [PR114932]

2024-06-19 Thread Michael Matz
Hello,

On Wed, 19 Jun 2024, Tamar Christina wrote:

> So this is where we compare different IV expressions to determine which
> IVs compute the same thing and thus can be in the same group.
> 
> The STRIP_NOPS don't work because while the incoming types are the same
> the casts are different.  So:
> 
> >>> p debug (ustep)
> (unsigned long) stride.3_27 * 4
> $3 = void
> >>> p debug (cstep)
> (unsigned long) (stride.3_27 * 4)
> $4 = void
> 
> Which is of course stripped to:
> 
> >>> p debug (top)
> (unsigned long) stride.3_27 * 4
> $1 = void
> >>> p debug (bot)
> stride.3_27 * 4
> 
> Both of these compute the same thing

In isolation these are _not_ computing the same when strides type is 
smaller than ulong, namely when stride is either negative or larger than 
its max-value/4.  I.e. when comparing IVs not only the overflow behaviour 
for the whole {base,+,step} revolution matters, but also the behaviour on 
the constituent expressions.  (It's possible that stride is known to be 
non-problematic here, I haven't checked.  I was just triggered by the 
claim of same-ness :) )


Ciao,
Michael.


Re: [PATCH] middle-end/114070 - folding breaking VEC_COND expansion

2024-06-19 Thread Vaseeharan Vinayagamoorthy
Hi,

I have found that this patch has introduced a regression in the arm-none-eabi 
toolchain for a testcase, which was previously passing:

PASS->FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"

The toolchain was built with:
Build = x86_64-none-linux-gnu
Host = x86_64-none-linux-gnu
Target = arm-none-eabi

This is also affecting the gcc-13 and gcc-14 branches.
Could you please let me know the impact of this regression, and whether you 
plan to fix the regression?


Kind regards,
Vasee


From: Richard Biener 
Sent: 26 February 2024 07:42
To: gcc-patches@gcc.gnu.org
Subject: [PATCH] middle-end/114070 - folding breaking VEC_COND expansion

The following properly guards the simplifications that move
operations into VEC_CONDs, in particular when that changes the
type constraints on this operation.

This needed a genmatch fix which was recording spurious implicit fors
when tcc_comparison is used in a C expression.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR middle-end/114070
* genmatch.cc (parser::parse_c_expr): Do not record operand
lists but only mark operators used.
* match.pd ((c ? a : b) op (c ? d : e)  -->  c ? (a op d) : (b op e)):
Properly guard the case of tcc_comparison changing the VEC_COND
value operand type.

* gcc.dg/torture/pr114070.c: New testcase.
---
 gcc/genmatch.cc |  6 ++
 gcc/match.pd| 15 ---
 gcc/testsuite/gcc.dg/torture/pr114070.c | 12 
 3 files changed, 26 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/torture/pr114070.c

diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
index 375ae90ae6c..d9ae436ce5c 100644
--- a/gcc/genmatch.cc
+++ b/gcc/genmatch.cc
@@ -4760,10 +4760,8 @@ parser::parse_c_expr (cpp_ttype start)
= (const char *)CPP_HASHNODE (token->val.node.node)->ident.str;
  if (strcmp (str, "return") == 0)
fatal_at (token, "return statement not allowed in C expression");
- id_base *idb = get_operator (str);
- user_id *p;
- if (idb && (p = dyn_cast (idb)) && p->is_oper_list)
-   record_operlist (token->src_loc, p);
+ /* Mark user operators corresponding to 'str' as used.  */
+ get_operator (str);
}

   /* Record the token.  */
diff --git a/gcc/match.pd b/gcc/match.pd
index c5b6540f939..67007fc2017 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5149,15 +5149,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 /* (c ? a : b) op (c ? d : e)  -->  c ? (a op d) : (b op e) */
  (simplify
   (op (vec_cond:s @0 @1 @2) (vec_cond:s @0 @3 @4))
-  (vec_cond @0 (op! @1 @3) (op! @2 @4)))
+  (if (TREE_CODE_CLASS (op) != tcc_comparison
+   || types_match (type, TREE_TYPE (@1))
+   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
+   (vec_cond @0 (op! @1 @3) (op! @2 @4

 /* (c ? a : b) op d  -->  c ? (a op d) : (b op d) */
  (simplify
   (op (vec_cond:s @0 @1 @2) @3)
-  (vec_cond @0 (op! @1 @3) (op! @2 @3)))
+  (if (TREE_CODE_CLASS (op) != tcc_comparison
+   || types_match (type, TREE_TYPE (@1))
+   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
+   (vec_cond @0 (op! @1 @3) (op! @2 @3
  (simplify
   (op @3 (vec_cond:s @0 @1 @2))
-  (vec_cond @0 (op! @3 @1) (op! @3 @2
+  (if (TREE_CODE_CLASS (op) != tcc_comparison
+   || types_match (type, TREE_TYPE (@1))
+   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
+   (vec_cond @0 (op! @3 @1) (op! @3 @2)

 #if GIMPLE
 (match (nop_atomic_bit_test_and_p @0 @1 @4)
diff --git a/gcc/testsuite/gcc.dg/torture/pr114070.c 
b/gcc/testsuite/gcc.dg/torture/pr114070.c
new file mode 100644
index 000..cf46ec45a04
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr114070.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-vect-cost-model" } */
+
+int unresolved(unsigned dirmask, unsigned mask, int *unresolved_n)
+{
+  for (int i = 0; i < 1024; i++) {
+mask |= 1;
+if (!unresolved_n[i] || unresolved_n[i] & 7)
+  dirmask |= 1;
+  }
+  return (dirmask == mask);
+}
--
2.35.3


Re: [PATCH v2] RISC-V: Remove float vector eqne pattern

2024-06-19 Thread Jeff Law




On 6/19/24 6:30 AM, demin.han wrote:

We can unify eqne and other comparison operations.

Tested on RV32 and RV64

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc: Remove eqne cond
* config/riscv/vector.md (@pred_eqne_scalar): Remove patterns
(*pred_eqne_scalar_merge_tie_mask): Ditto
(*pred_eqne_scalar): Ditto
(*pred_eqne_scalar_narrow): Ditto

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-cmp-eqne.c: New test.

Signed-off-by: demin.han 
---

v2 changes:
   1. add test

   Only intrinsics utilize those removed vf patterns.
   Auto vectorization use vv format now.
   The NaN will be optimized out before expand in autovec as I tested.

  .../riscv/riscv-vector-builtins-bases.cc  |  4 -
  gcc/config/riscv/vector.md| 86 ---
  .../riscv/rvv/base/float-point-cmp-eqne.c | 54 
  3 files changed, 54 insertions(+), 90 deletions(-)
  create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index b6f6e4ff37e..d414721ede8 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -1420,10 +1420,6 @@ public:
  switch (e.op_info->op)
{
case OP_TYPE_vf: {
- if (CODE == EQ || CODE == NE)
-   return e.use_compare_insn (CODE, code_for_pred_eqne_scalar (
-  e.vector_mode ()));
- else
return e.use_compare_insn (CODE, code_for_pred_cmp_scalar (
   e.vector_mode ()));
Formatting nit.  You removed the IF-THEN-ELSE construct, leaving just 
the ELSE's body.  You need to reindent that body, both lines of which 
would move left by two spaces.


I'll fix and push it momentarily.

jeff


Re: [PATCH v2] RISC-V: Remove float vector eqne pattern

2024-06-19 Thread Robin Dapp
OK.  Thanks for adding the test.

Regards
 Robin


Re: [RFC/RFA] [PATCH 08/12] Add a new pass for naive CRC loops detection

2024-06-19 Thread Mariam Arutunian
On Sun, Jun 9, 2024 at 1:48 AM Jeff Law  wrote:

>
>
> On 6/4/24 7:41 AM, Mariam Arutunian wrote:
> >/Mariam, your thoughts on whether or not those two phases could handle a
> > loop with two CRC calculations inside, essentially creating two calls to
> > our new builtins? /
> >
> > /
> > /
> >
> > It is feasible, but it would likely demand considerable effort and
> > additional work to implement effectively.
> Thanks for the confirmation.  I suspect it likely doesn't come up often
> in practice either.
>
>
> >
> >>The key would be to only simulate the use-def cycle from the loop-closed
> PHI (plus the loop control of course, but miter/SCEV should be enough
> there) and just replace that LC PHI, leaving loop DCE to DCE.
> >
> > Thank you, this is a good idea to just replace the PHI and leave the
> loop to DCE to remove only single CRC parts.
> It does seem like replacing the PHI when we have an optimizable case
> might simplify that aspect of the implementation.
>

Yes.


>
>
> >
> > The current pass only verifies cases where a single CRC calculation is
> performed within the loop. During the verification phase,
> > I ensure that there are no other calculations aside from those necessary
> for the considered CRC computation.
> >
> > Also, when I was investigating the bitwise CRC implementations used in
> different software, in all cases the loop was calculating just one CRC and
> no other calculations were done.
> > Thus, in almost all cases, the first phase will filter out non-CRCs, and
> during the second phase, only real CRCs with no other calculations will be
> executed.
> > This ensures that unnecessary statements won't be executed in most cases.
> But we may have had a degree of sampling bias here.  If I remember
> correctly I used the initial filtering pass as the "trigger" to report a
> potential CRC case.  If that initial filtering pass rejected cases with
> other calculations in the loop, then we never would have seen those.


Yes, you used initial filtering, but in that process, I do checks with
use-def chains. So, if there were separate computations not connected with
each other, the initial filtering most probably wouldn't reject those.


>
>
> > Leaving the loop to DCE will simplify the process of removing parts
> connected to a single CRC calculation.
> > However, since now we detect a loop that only calculates a single CRC,
> we can entirely remove it at this stage without additional checks.
> Let's evaluate this option as we get to the later patches in the series.
>   What I like about Richard's suggestion is that it "just works" and it
> will continue to work, even as the overall infrastructure changes.  In
> contrast a bespoke loop removal implementation in a specific pass may
> need adjustment if other aspects of our infrastructure change.
>

Ok.


>
> >If we really want a separate pass (or utility to work on a single
> > loop) then we might consider moving some of the final value replacement
> > code that doesn’t work with only SCEV there as well. There’s also
> > special code in loop distribution for strlen recognition now, not
> > exactly fitting in. >
> >
> >>Note I had patches to do final value replacement on demand from CD-DCE
> when it figures a loop has no side effects besides of its reduction outputs
> (still want to pick this up at some point again).
> >
> > Oh, this could provide useful insights for our implementation.
> Are you thinking of reusing that on-demand analysis to reduce the set of
> loops we analyze?
>

I thought its parts would be helpful in future changes of the verification
algorithm.

Thanks,
Mariam


> Jeff
>
>


RE: [PATCH][ivopts]: use affine_tree when comparing IVs during candidate selection [PR114932]

2024-06-19 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Wednesday, June 19, 2024 12:55 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; bin.ch...@linux.alibaba.com
> Subject: Re: [PATCH][ivopts]: use affine_tree when comparing IVs during 
> candidate
> selection [PR114932]
> 
> On Fri, 14 Jun 2024, Tamar Christina wrote:
> 
> > Hi All,
> >
> > IVOPTS normally uses affine trees to perform comparisons between different 
> > IVs,
> > but these seem to have been missing in two key spots and instead normal tree
> > equivalencies used.
> >
> > In some cases where we have a structural equivalence but not a signedness
> > equivalencies we end up generating both a signed and unsigned IV for the 
> > same
> > candidate.
> >
> > This happens quite a lot with fortran but can also happen in C because this 
> > came
> > code is unable to figure out when one expression is a multiple of another.
> >
> > As an example in the attached testcase we get:
> >
> > Initial set of candidates:
> >   cost: 24 (complexity 3)
> >   reg_cost: 9
> >   cand_cost: 15
> >   cand_group_cost: 0 (complexity 3)
> >   candidates: 1, 6, 8
> >group:0 --> iv_cand:6, cost=(0,1)
> >group:1 --> iv_cand:1, cost=(0,0)
> >group:2 --> iv_cand:8, cost=(0,1)
> >group:3 --> iv_cand:8, cost=(0,1)
> >   invariant variables: 6
> >   invariant expressions: 1, 2
> >
> > :
> > inv_expr 1: stride.3_27 * 4
> > inv_expr 2: (unsigned long) stride.3_27 * 4
> >
> > These end up being used in the same group:
> >
> > Group 1:
> > cand  costcompl.  inv.expr.   inv.vars
> > 1 0   0   NIL;6
> > 2 0   0   NIL;6
> > 3 0   0   NIL;6
> >
> > which ends up with IV opts picking the signed and unsigned IVs:
> >
> > Improved to:
> >   cost: 24 (complexity 3)
> >   reg_cost: 9
> >   cand_cost: 15
> >   cand_group_cost: 0 (complexity 3)
> >   candidates: 1, 6, 8
> >group:0 --> iv_cand:6, cost=(0,1)
> >group:1 --> iv_cand:1, cost=(0,0)
> >group:2 --> iv_cand:8, cost=(0,1)
> >group:3 --> iv_cand:8, cost=(0,1)
> >   invariant variables: 6
> >   invariant expressions: 1, 2
> >
> > and so generates the same IV as both signed and unsigned:
> >
> > ;;   basic block 21, loop depth 3, count 214748368 (estimated locally, freq
> 58.2545), maybe hot
> > ;;prev block 28, next block 31, flags: (NEW, REACHABLE, VISITED)
> > ;;pred:   28 [always]  count:23622320 (estimated locally, freq 
> > 6.4080)
> (FALLTHRU,EXECUTABLE)
> > ;;25 [always]  count:191126046 (estimated locally, freq 
> > 51.8465)
> (FALLTHRU,DFS_BACK,EXECUTABLE)
> >   # .MEM_66 = PHI <.MEM_34(28), .MEM_22(25)>
> >   # ivtmp.22_41 = PHI <0(28), ivtmp.22_82(25)>
> >   # ivtmp.26_51 = PHI 
> >   # ivtmp.28_90 = PHI 
> >
> > ...
> >
> > ;;   basic block 24, loop depth 3, count 214748366 (estimated locally, freq
> 58.2545), maybe hot
> > ;;prev block 22, next block 25, flags: (NEW, REACHABLE, VISITED)'
> > ;;pred:   22 [always]  count:95443719 (estimated locally, freq 
> > 25.8909)
> (FALLTHRU)
> ;;21 [33.3% (guessed)]  count:71582790 (estimated locally, 
> freq 19.4182)
> (TRUE_VALUE,EXECUTABLE)
> ;;31 [33.3% (guessed)]  count:47721860 (estimated locally, 
> freq 12.9455)
> (TRUE_VALUE,EXECUTABLE)
> # .MEM_22 = PHI <.MEM_44(22), .MEM_31(21), .MEM_79(31)>
> 
>   
>  ivtmp.22_82 = ivtmp.22_41 + 1;
> ivtmp.26_72 = ivtmp.26_51 + _80;
> ivtmp.28_98 = ivtmp.28_90 + _39;
> >
> > These two IVs are always used as unsigned, so IV ops generates:
> >
> >   _73 = stride.3_27 * 4;
> >   _80 = (unsigned long) _73;
> >   _54 = (unsigned long) stride.3_27;
> >   _39 = _54 * 4;
> >
> > Which means that in e.g. exchange2 we generate a lot of duplicate code.
> >
> > This is because candidate 6 and 8 are structurally equivalent but have 
> > different
> > signs.
> >
> > This patch changes it so that if you have two IVs that are affine 
> > equivalent to
> > just pick one over the other.  IV already has code for this, so the patch 
> > just
> > uses affine trees instead of tree for the check.
> >
> > With it we get:
> >
> > :
> > inv_expr 1: stride.3_27 * 4
> >
> > :
> > Group 0:
> >   cand  costcompl.  inv.expr.   inv.vars
> >   5 0   2   NIL;NIL;
> >   6 0   3   NIL;NIL;
> >
> > Group 1:
> >   cand  costcompl.  inv.expr.   inv.vars
> >   1 0   0   NIL;6
> >   2 0   0   NIL;6
> >   3 0   0   NIL;6
> >   4 0   0   NIL;6
> >
> > Initial set of candidates:
> >   cost: 16 (complexity 3)
> >   reg_cost: 6
> >   cand_cost: 10
> >   cand_group_cost: 0 (complexity 3)
> >   candidates: 1, 6
> >group:0 --> iv_cand:6, cost=(0,3)
> >group:1 --> iv_cand:1, cost=(0,0)
> >   invariant variables: 6
> >   invariant expressions: 1
> >
> > The two patches together results in a 10% performance i

[to-be-committed] [RISC-V] [PATCH V2] Minor cleanup/improvement to bset/binv patterns

2024-06-19 Thread Jeff Law


Changes since V1:
  Whitespace fixes noted by the linter
  Missed using the iterator for the output template in 
_mask pattern!


--

This patch introduces a bit_optab iterator that maps IOR/XOR to bset and 
binv (and one day bclr if we need it).  That allows us to combine some 
patterns that only differed in the RTL opcode (IOR vs XOR) and in the 
name/assembly (bset vs binv).


Additionally this also allow us to use the iterator in the 
bsetmask and bsetidisi patterns thus potentially fixing a missed 
optimization.


This has gone through my tester.  I'll wait for a verdict from 
pre-commit CI before moving forward.


Jeff

This patch introduces a bit_optab iterator that maps IOR/XOR to bset and
binv (and one day bclr if we need it).  That allows us to combine some
patterns that only differed in the RTL opcode (IOR vs XOR) and in the
name/assembly (bset vs binv).

Additionally this also allow us to use the iterator in the
bsetmask and bsetidisi patterns thus potentially fixing a missed
optimization.

This has gone through my tester.  I'll wait for a verdict from
pre-commit CI before moving forward.

diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index ae5e7e510c0..3eedabffca0 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -569,24 +569,26 @@ (define_insn_and_split "*minmax"
 
 ;; ZBS extension.
 
-(define_insn "*bset"
+(define_insn "*"
   [(set (match_operand:X 0 "register_operand" "=r")
-   (ior:X (ashift:X (const_int 1)
-(match_operand:QI 2 "register_operand" "r"))
-  (match_operand:X 1 "register_operand" "r")))]
+   (any_or:X (ashift:X (const_int 1)
+   (match_operand:QI 2 "register_operand" "r"))
+ (match_operand:X 1 "register_operand" "r")))]
   "TARGET_ZBS"
-  "bset\t%0,%1,%2"
+  "\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
-(define_insn "*bset_mask"
+(define_insn "*_mask"
   [(set (match_operand:X 0 "register_operand" "=r")
-   (ior:X (ashift:X (const_int 1)
-(subreg:QI
- (and:X (match_operand:X 2 "register_operand" "r")
-(match_operand 3 "" 
"")) 0))
-  (match_operand:X 1 "register_operand" "r")))]
+   (any_or:X
+ (ashift:X
+   (const_int 1)
+   (subreg:QI
+ (and:X (match_operand:X 2 "register_operand" "r")
+(match_operand 3 "" "")) 0))
+ (match_operand:X 1 "register_operand" "r")))]
   "TARGET_ZBS"
-  "bset\t%0,%1,%2"
+  "\t%0,%1,%2"
   [(set_attr "type" "bitmanip")])
 
 (define_insn "*bset_1"
@@ -655,24 +657,24 @@ (define_insn "*bset_1_mask"
   "bset\t%0,x0,%1"
   [(set_attr "type" "bitmanip")])
 
-(define_insn "*bseti"
+(define_insn "*i"
   [(set (match_operand:X 0 "register_operand" "=r")
-   (ior:X (match_operand:X 1 "register_operand" "r")
-  (match_operand:X 2 "single_bit_mask_operand" "DbS")))]
+   (any_or:X (match_operand:X 1 "register_operand" "r")
+ (match_operand:X 2 "single_bit_mask_operand" "DbS")))]
   "TARGET_ZBS"
-  "bseti\t%0,%1,%S2"
+  "i\t%0,%1,%S2"
   [(set_attr "type" "bitmanip")])
 
 ;; As long as the SImode operand is not a partial subreg, we can use a
 ;; bseti without postprocessing, as the middle end is smart enough to
 ;; stay away from the signbit.
-(define_insn "*bsetidisi"
+(define_insn "*idisi"
   [(set (match_operand:DI 0 "register_operand" "=r")
-   (ior:DI (sign_extend:DI (match_operand:SI 1 "register_operand" "r"))
-   (match_operand 2 "single_bit_mask_operand" "i")))]
+   (any_or:DI (sign_extend:DI (match_operand:SI 1 "register_operand" "r"))
+  (match_operand 2 "single_bit_mask_operand" "i")))]
   "TARGET_ZBS && TARGET_64BIT
&& !partial_subreg_p (operands[1])"
-  "bseti\t%0,%1,%S2"
+  "i\t%0,%1,%S2"
   [(set_attr "type" "bitmanip")])
 
 ;; We can easily handle zero extensions
@@ -781,23 +783,6 @@ (define_split
  (and:DI (rotate:DI (const_int -2) (match_dup 1))
  (match_dup 3)))])
 
-(define_insn "*binv"
-  [(set (match_operand:X 0 "register_operand" "=r")
-   (xor:X (ashift:X (const_int 1)
-(match_operand:QI 2 "register_operand" "r"))
-  (match_operand:X 1 "register_operand" "r")))]
-  "TARGET_ZBS"
-  "binv\t%0,%1,%2"
-  [(set_attr "type" "bitmanip")])
-
-(define_insn "*binvi"
-  [(set (match_operand:X 0 "register_operand" "=r")
-   (xor:X (match_operand:X 1 "register_operand" "r")
-  (match_operand:X 2 "single_bit_mask_operand" "DbS")))]
-  "TARGET_ZBS"
-  "binvi\t%0,%1,%S2"
-  [(set_attr "type" "bitmanip")])
-
 (define_insn "*bext"
   [(set (match_operand:X 0 "register_operand" "=r")
(zero_extract:X (match_operand:X 1 "register_operand" "r")
diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
index 1e37e843023..20745faa55

[PATCH] gcc/doc: adjust __builtin_choose_expr() description

2024-06-19 Thread Jan Beulich
Present wording has misled people to believe the ?: operator would be
evaluating all three of the involved expressions.

gcc/

* doc/extend.texi: Clarify __builtin_choose_expr() similarity to
the ?: operator.

--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -14962,9 +14962,9 @@ struct {
 
 This built-in function is analogous to the @samp{? :} operator in C,
 except that the expression returned has its type unaltered by promotion
-rules.  Also, the built-in function does not evaluate the expression
-that is not chosen.  For example, if @var{const_exp} evaluates to @code{true},
-@var{exp2} is not evaluated even if it has side effects.
+rules.  Like the @samp{? :} operator, the built-in function does not evaluate
+the expression that is not chosen.  For example, if @var{const_exp} evaluates
+to @code{true}, @var{exp2} is not evaluated even if it has side effects.
 
 This built-in function can return an lvalue if the chosen argument is an
 lvalue.


[PATCH] c++: implement DR1363 and DR1496 for __is_trivial [PR85723]

2024-06-19 Thread Marek Polacek
Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?

-- >8 --
is_trivial was introduced in

which split POD into is_trivial and is_standard_layout.

Later came CWG 1363.  Since

  struct A {
A() = default;
A(int = 42) {}
  };

cannot be default-initialized, it should not be trivial, so the definition
of what is a trivial class changed.

Similarly, CWG 1496 concluded that

  struct B {
B() = delete;
  };

should not be trivial either.

P0848 adjusted the definition further to say "eligible".  That means
that

  template
  struct C {
C() requires false = default;
  };

should not be trivial, either, since C::C() is not eligible.

Bug 85723 reports that we implement none of the CWGs.

I chose to fix this by using type_has_non_deleted_trivial_default_ctor
which uses locate_ctor which uses build_new_method_call, which would
be used by default-initialization as well.  With that, all __is_trivial
problems I could find in the Bugzilla are fixed, except for PR96288,
which may need changes to trivially-copyable, so I'm not messing with
that now.

I hope this has no ABI implications.  There's effort underway to
remove "trivial class" from the core language as it's not really
meaningful.  So the impact of this change should be pretty low except
to fix a few libstdc++ problems.

PR c++/108769
PR c++/58074
PR c++/115522
PR c++/85723

gcc/cp/ChangeLog:

* class.cc (type_has_non_deleted_trivial_default_ctor): Fix formatting.
* tree.cc (trivial_type_p): Instead of TYPE_HAS_TRIVIAL_DFLT, use
type_has_non_deleted_trivial_default_ctor.

gcc/testsuite/ChangeLog:

* g++.dg/warn/Wclass-memaccess.C: Add dg-warning.
* g++.dg/ext/is_trivial1.C: New test.
* g++.dg/ext/is_trivial2.C: New test.
* g++.dg/ext/is_trivial3.C: New test.
* g++.dg/ext/is_trivial4.C: New test.
* g++.dg/ext/is_trivial5.C: New test.
* g++.dg/ext/is_trivial6.C: New test.
---
 gcc/cp/class.cc  |  3 +-
 gcc/cp/tree.cc   |  4 +-
 gcc/testsuite/g++.dg/ext/is_trivial1.C   | 14 ++
 gcc/testsuite/g++.dg/ext/is_trivial2.C   | 17 +++
 gcc/testsuite/g++.dg/ext/is_trivial3.C   | 15 ++
 gcc/testsuite/g++.dg/ext/is_trivial4.C   | 10 
 gcc/testsuite/g++.dg/ext/is_trivial5.C   |  8 
 gcc/testsuite/g++.dg/ext/is_trivial6.C   | 49 
 gcc/testsuite/g++.dg/warn/Wclass-memaccess.C |  2 +
 9 files changed, 120 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/ext/is_trivial1.C
 create mode 100644 gcc/testsuite/g++.dg/ext/is_trivial2.C
 create mode 100644 gcc/testsuite/g++.dg/ext/is_trivial3.C
 create mode 100644 gcc/testsuite/g++.dg/ext/is_trivial4.C
 create mode 100644 gcc/testsuite/g++.dg/ext/is_trivial5.C
 create mode 100644 gcc/testsuite/g++.dg/ext/is_trivial6.C

diff --git a/gcc/cp/class.cc b/gcc/cp/class.cc
index 0ce361eb88e..718601756dd 100644
--- a/gcc/cp/class.cc
+++ b/gcc/cp/class.cc
@@ -5918,7 +5918,8 @@ type_has_virtual_destructor (tree type)
 /* True iff class TYPE has a non-deleted trivial default
constructor.  */
 
-bool type_has_non_deleted_trivial_default_ctor (tree type)
+bool
+type_has_non_deleted_trivial_default_ctor (tree type)
 {
   return TYPE_HAS_TRIVIAL_DFLT (type) && locate_ctor (type);
 }
diff --git a/gcc/cp/tree.cc b/gcc/cp/tree.cc
index 28648c14c6d..5b837f89e03 100644
--- a/gcc/cp/tree.cc
+++ b/gcc/cp/tree.cc
@@ -4637,7 +4637,9 @@ trivial_type_p (const_tree t)
   t = strip_array_types (CONST_CAST_TREE (t));
 
   if (CLASS_TYPE_P (t))
-return (TYPE_HAS_TRIVIAL_DFLT (t)
+/* A trivial class is a class that is trivially copyable and has one or
+   more eligible default constructors, all of which are trivial.  */
+return (type_has_non_deleted_trivial_default_ctor (CONST_CAST_TREE (t))
&& trivially_copyable_p (t));
   else
 return scalarish_type_p (t);
diff --git a/gcc/testsuite/g++.dg/ext/is_trivial1.C 
b/gcc/testsuite/g++.dg/ext/is_trivial1.C
new file mode 100644
index 000..60ce48edfe9
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/is_trivial1.C
@@ -0,0 +1,14 @@
+// PR c++/108769
+// { dg-do compile { target c++20 } }
+
+template 
+struct S {
+S() requires false = default;
+};
+static_assert(!__is_trivial(S));
+
+template 
+struct R {
+R() requires true = default;
+};
+static_assert(__is_trivial(R));
diff --git a/gcc/testsuite/g++.dg/ext/is_trivial2.C 
b/gcc/testsuite/g++.dg/ext/is_trivial2.C
new file mode 100644
index 000..8a8e554580c
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/is_trivial2.C
@@ -0,0 +1,17 @@
+// PR c++/58074
+// { dg-do compile { target c++11 } }
+
+struct Trivial
+{
+  Trivial() = delete;
+};
+
+struct NonTrivial
+{
+  NonTrivial() = default;
+  NonTrivial(NonTrivial&) = default;
+  NonTrivial& operator=(NonTrivial&) = default;
+};
+
+stati

RE: [PATCH v1] Match: Support more forms for the scalar unsigned .SAT_SUB

2024-06-19 Thread Li, Pan2
Got it. Thanks Richard for suggestion.

Pan

-Original Message-
From: Richard Biener  
Sent: Wednesday, June 19, 2024 4:00 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com; 
jeffreya...@gmail.com; rdapp@gmail.com
Subject: Re: [PATCH v1] Match: Support more forms for the scalar unsigned 
.SAT_SUB

On Wed, Jun 19, 2024 at 9:37 AM Li, Pan2  wrote:
>
> Hi Richard,
>
> Given almost all unsigned SAT_ADD/SAT_SUB patches are merged, I revisit the 
> original code pattern aka zip benchmark.
> It may look like below:
>
> void test (uint16_t *x, uint16_t *y, unsigned wsize, unsigned count)
> {
>   unsigned m = 0, n = count;
>   register uint16_t *p;
>
>   p = x;
>
>   do {
> m = *--p;
>
> *p = (uint16_t)(m >= wsize ? m-wsize : 0); // There will be a conversion 
> here.
>   } while (--n);
> }
>
> And we can have 179 tree pass as below:
>
>[local count: 1073741824]:
>   # n_3 = PHI 
>   # p_4 = PHI 
>   p_10 = p_4 + 18446744073709551614;
>   _1 = *p_10;
>   m_11 = (unsigned int) _1;
>   _2 = m_11 - wsize_12(D);
>   iftmp.0_13 = (short unsigned int) _2;
>   _18 = m_11 >= wsize_12(D);
>   iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>   *p_10 = iftmp.0_5;
>
> The above form doesn't hit any form we have supported in match.pd. Then I 
> have one idea that to convert
>
> uint16 d, tmp;
> uint32 a, b, m;
>
> m = a - b;
> tmp = (uint16)m;
> d = a >= b ? tmp : 0;
>
> to
>
> d = (uint16)(.SAT_SUB (a, b));

The key here is to turn this into

 m = a - b;
 tmp = a >= b ? m : 0;
 d = (uint16) tmp;

I guess?  We probably have the reverse transform, turn
(uint16) a ? b : c; into a ? (uint16)b : (uint16)c if any of the arm simplifies.

OTOH if you figure the correct rules for the allowed conversions adjusting the
pattern matching to allow a conversion on the subtract would work.

> I am not very sure it is reasonable to make it work, it may have gimple 
> assignment with convert similar as below (may require the help of 
> vectorize_conversion?).
> Would like to get some hint from you before the next step, thanks a lot.
>
> patt_34 = .SAT_SUB (m_11, wsize_12(D));
> patt_35 = (vector([8,8]) short unsigned int) patt_34;
>
> Pan
>
> -Original Message-
> From: Richard Biener 
> Sent: Friday, June 14, 2024 4:05 PM
> To: Li, Pan2 
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com; 
> jeffreya...@gmail.com; rdapp@gmail.com
> Subject: Re: [PATCH v1] Match: Support more forms for the scalar unsigned 
> .SAT_SUB
>
> On Wed, Jun 12, 2024 at 2:38 PM  wrote:
> >
> > From: Pan Li 
> >
> > After we support the scalar unsigned form 1 and 2,  we would like
> > to introduce more forms include the branch and branchless.  There
> > are forms 3-10 list as below:
> >
> > Form 3:
> >   #define SAT_SUB_U_3(T) \
> >   T sat_sub_u_3_##T (T x, T y) \
> >   { \
> > return x > y ? x - y : 0; \
> >   }
> >
> > Form 4:
> >   #define SAT_SUB_U_4(T) \
> >   T sat_sub_u_4_##T (T x, T y) \
> >   { \
> > return x >= y ? x - y : 0; \
> >   }
> >
> > Form 5:
> >   #define SAT_SUB_U_5(T) \
> >   T sat_sub_u_5_##T (T x, T y) \
> >   { \
> > return x < y ? 0 : x - y; \
> >   }
> >
> > Form 6:
> >   #define SAT_SUB_U_6(T) \
> >   T sat_sub_u_6_##T (T x, T y) \
> >   { \
> > return x <= y ? 0 : x - y; \
> >   }
> >
> > Form 7:
> >   #define SAT_SUB_U_7(T) \
> >   T sat_sub_u_7_##T (T x, T y) \
> >   { \
> > T ret; \
> > T overflow = __builtin_sub_overflow (x, y, &ret); \
> > return ret & (T)(overflow - 1); \
> >   }
> >
> > Form 8:
> >   #define SAT_SUB_U_8(T) \
> >   T sat_sub_u_8_##T (T x, T y) \
> >   { \
> > T ret; \
> > T overflow = __builtin_sub_overflow (x, y, &ret); \
> > return ret & (T)-(!overflow); \
> >   }
> >
> > Form 9:
> >   #define SAT_SUB_U_9(T) \
> >   T sat_sub_u_9_##T (T x, T y) \
> >   { \
> > T ret; \
> > T overflow = __builtin_sub_overflow (x, y, &ret); \
> > return overflow ? 0 : ret; \
> >   }
> >
> > Form 10:
> >   #define SAT_SUB_U_10(T) \
> >   T sat_sub_u_10_##T (T x, T y) \
> >   { \
> > T ret; \
> > T overflow = __builtin_sub_overflow (x, y, &ret); \
> > return !overflow ? ret : 0; \
> >   }
> >
> > Take form 10 as example:
> >
> > SAT_SUB_U_10(uint64_t);
> >
> > Before this patch:
> > uint8_t sat_sub_u_10_uint8_t (uint8_t x, uint8_t y)
> > {
> >   unsigned char _1;
> >   unsigned char _2;
> >   uint8_t _3;
> >   __complex__ unsigned char _6;
> >
> > ;;   basic block 2, loop depth 0
> > ;;pred:   ENTRY
> >   _6 = .SUB_OVERFLOW (x_4(D), y_5(D));
> >   _2 = IMAGPART_EXPR <_6>;
> >   if (_2 == 0)
> > goto ; [50.00%]
> >   else
> > goto ; [50.00%]
> > ;;succ:   3
> > ;;4
> >
> > ;;   basic block 3, loop depth 0
> > ;;pred:   2
> >   _1 = REALPART_EXPR <_6>;
> > ;;succ:   4
> >
> > ;;   basic block 4, loop depth 0
> > ;;pred:   2
> > ;;3
> >   # _3 = PHI <0(2), _1(3)>
> >   return _3;
> > ;;succ:   EXIT
> >
> > }

RE: [PATCH v1 1/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 3

2024-06-19 Thread Li, Pan2
Committed the series, thanks Juzhe.

Pan

From: 钟居哲 
Sent: Wednesday, June 19, 2024 9:20 PM
To: Li, Pan2 ; gcc-patches 
Cc: kito.cheng ; jeffreyalaw ; 
rdapp.gcc ; Li, Pan2 
Subject: Re: [PATCH v1 1/8] RISC-V: Add testcases for unsigned .SAT_SUB vector 
form 3

lgtm



--Reply to Message--
On Wed, Jun 19, 2024 21:17 PM 
pan2.li <pan2...@intel.com> wrote:
From: Pan Li <pan2...@intel.com>

After the middle-end support the form 3 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 3:
  #define DEF_VEC_SAT_U_SUB_FMT_3(T)   \
  void __attribute__((noinline))   \
  vec_sat_u_sub_##T##_fmt_3 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
out[i] = x > y ? x - y : 0;\
  }\
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-11.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-12.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-9.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-10.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-11.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-12.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-9.c: New test.

Signed-off-by: Pan Li <pan2...@intel.com>
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 17 +
 .../rvv/autovec/binop/vec_sat_u_sub-10.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-11.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-12.c  | 20 +
 .../riscv/rvv/autovec/binop/vec_sat_u_sub-9.c | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-10.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-11.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-12.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-9.c   | 75 +++
 9 files changed, 396 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-12.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-9.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-10.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-12.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-9.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index 443f88261ba..182cf2cf064 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -167,9 +167,26 @@ vec_sat_u_sub_##T##_fmt_2 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }

+#define DEF_VEC_SAT_U_SUB_FMT_3(T)   \
+void __attribute__((noinline))   \
+vec_sat_u_sub_##T##_fmt_3 (T *out, T *op_1, T *op_2, unsigned limit) \
+{\
+  unsigned i;\
+  for (i = 0; i < limit; i++)\
+{\
+  T x = op_1[i]; \
+  T y = op_2[i]; \
+  out[i] = x > y ? x - y : 0;\
+}\
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
+
 #define RUN_VEC_SAT_U_SUB_FMT_2(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_2(out, op_1, op_2, N)

+#define RUN

Re: [PATCH v2] RISC-V: Remove float vector eqne pattern

2024-06-19 Thread 钟居哲
lgtm from my side but plz wait for robin the last review. thanks.








 --Reply to Message--
 On Wed, Jun 19, 2024 20:30 PM demin.han

Re: [PATCH v4] RISC-V: Promote Zaamo/Zalrsc to a when using an old binutils

2024-06-19 Thread Kito Cheng
LGTM :)

Patrick O'Neill  於 2024年6月19日 週三 05:40 寫道:

> Binutils 2.42 and before don't support Zaamo/Zalrsc. When users specify
> both Zaamo and Zalrsc, promote them to 'a' in the -march string.
>
> This does not affect testsuite results for users with old versions of
> binutils.
> Testcases that failed due to 'call'/isa string continue to fail after this
> PATCH
> when using an old version of binutils.
>
> gcc/ChangeLog:
>
> * common/config/riscv/riscv-common.cc: Add 'a' extension to
> riscv_combine_info.
>
> Signed-off-by: Patrick O'Neill 
> ---
> We will emit calls if the user only specifies Zaamo or Zalrsc.
> To my knowledge there isn't a way to make a testcase for this in dejagnu.
> I used the most recent version of the 'a' extension arbitrarily since
> AFAICT the
> version of the extension doesn't affect the combine logic.
> ---
>  gcc/common/config/riscv/riscv-common.cc | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/gcc/common/config/riscv/riscv-common.cc
> b/gcc/common/config/riscv/riscv-common.cc
> index 1dc1d9904c7..410e673f5e0 100644
> --- a/gcc/common/config/riscv/riscv-common.cc
> +++ b/gcc/common/config/riscv/riscv-common.cc
> @@ -401,6 +401,7 @@ static const struct riscv_ext_version
> riscv_ext_version_table[] =
>  /* Combine extensions defined in this table  */
>  static const struct riscv_ext_version riscv_combine_info[] =
>  {
> +  {"a", ISA_SPEC_CLASS_20191213, 2, 1},
>{"zk",  ISA_SPEC_CLASS_NONE, 1, 0},
>{"zkn",  ISA_SPEC_CLASS_NONE, 1, 0},
>{"zks",  ISA_SPEC_CLASS_NONE, 1, 0},
> --
> 2.34.1
>
>


RE: [PATCH][ivopts]: perform affine fold on unsigned addressing modes known not to overflow. [PR114932]

2024-06-19 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Wednesday, June 19, 2024 1:14 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; bin.ch...@linux.alibaba.com
> Subject: Re: [PATCH][ivopts]: perform affine fold on unsigned addressing modes
> known not to overflow. [PR114932]
> 
> On Fri, 14 Jun 2024, Tamar Christina wrote:
> 
> > Hi All,
> >
> > When the patch for PR114074 was applied we saw a good boost in exchange2.
> >
> > This boost was partially caused by a simplification of the addressing modes.
> > With the patch applied IV opts saw the following form for the base 
> > addressing;
> >
> >   Base: (integer(kind=4) *) &block + ((sizetype) ((unsigned long) l0_19(D) *
> > 324) + 36)
> >
> > vs what we normally get:
> >
> >   Base: (integer(kind=4) *) &block + ((sizetype) ((integer(kind=8)) l0_19(D)
> > * 81) + 9) * 4
> >
> > This is because the patch promoted multiplies where one operand is a 
> > constant
> > from a signed multiply to an unsigned one, to attempt to fold away the 
> > constant.
> >
> > This patch attempts the same but due to the various problems with SCEV and
> > niters not being able to analyze the resulting forms (i.e. PR114322) we 
> > can't
> > do it during SCEV or in the general form like in fold-const like 
> > extract_muldiv
> > attempts.
> >
> > Instead this applies the simplification during IVopts initialization when we
> > create the IV.  Essentially when we know the IV won't overflow with regards 
> > to
> > niters then we perform an affine fold which gets it to simplify the internal
> > computation, even if this is signed because we know that for IVOPTs uses the
> > IV won't ever overflow.  This allows IV opts to see the simplified form
> > without influencing the rest of the compiler.
> >
> > as mentioned in PR114074 it would be good to fix the missed optimization in 
> > the
> > other passes so we can perform this in general.
> >
> > The reason this has a big impact on fortran code is that fortran doesn't 
> > seem to
> > have unsigned integer types.  As such all it's addressing are created with
> > signed types and folding does not happen on them due to the possible 
> > overflow.
> >
> > concretely on AArch64 this changes the results from generation:
> >
> > mov x27, -108
> > mov x24, -72
> > mov x23, -36
> > add x21, x1, x0, lsl 2
> > add x19, x20, x22
> > .L5:
> > add x0, x22, x19
> > add x19, x19, 324
> > ldr d1, [x0, x27]
> > add v1.2s, v1.2s, v15.2s
> > str d1, [x20, 216]
> > ldr d0, [x0, x24]
> > add v0.2s, v0.2s, v15.2s
> > str d0, [x20, 252]
> > ldr d31, [x0, x23]
> > add v31.2s, v31.2s, v15.2s
> > str d31, [x20, 288]
> > bl  digits_20_
> > cmp x21, x19
> > bne .L5
> >
> > into:
> >
> > .L5:
> > ldr d1, [x19, -108]
> > add v1.2s, v1.2s, v15.2s
> > str d1, [x20, 216]
> > ldr d0, [x19, -72]
> > add v0.2s, v0.2s, v15.2s
> > str d0, [x20, 252]
> > ldr d31, [x19, -36]
> > add x19, x19, 324
> > add v31.2s, v31.2s, v15.2s
> > str d31, [x20, 288]
> > bl  digits_20_
> > cmp x21, x19
> > bne .L5
> >
> > The two patches together results in a 10% performance increase in exchange2 
> > in
> > SPECCPU 2017 and a 4% reduction in binary size and a 5% improvement in
> compile
> > time. There's also a 5% performance improvement in fotonik3d and similar
> > reduction in binary size.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > PR tree-optimization/114932
> > * tree-scalar-evolution.cc (alloc_iv): Perform affine unsigned fold.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR tree-optimization/114932
> > * gfortran.dg/addressing-modes_1.f90: New test.
> >
> > ---
> > diff --git a/gcc/testsuite/gfortran.dg/addressing-modes_1.f90
> b/gcc/testsuite/gfortran.dg/addressing-modes_1.f90
> > new file mode 100644
> > index
> ..334d5bc47a16e53e9168b
> b1f90dfeff584b4e494
> > --- /dev/null
> > +++ b/gcc/testsuite/gfortran.dg/addressing-modes_1.f90
> > @@ -0,0 +1,37 @@
> > +! { dg-do compile { target aarch64-*-* } }
> > +! { dg-additional-options "-w -Ofast" }
> > +
> > +  module brute_force
> > +integer, parameter :: r=9
> > + integer  block(r, r, 0)
> > +contains
> > +  subroutine brute
> > + do
> > +  do
> > +  do
> > +   do
> > +do
> > + do
> > + do i7 = l0, 1
> > +   select case(1 )
> > +   case(1)
> > +   block(:2, 7:, 1) = block(:2, 7:, i7) - 1
> > +   end se

Re: [PATCH v1 5/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 7

2024-06-19 Thread 钟居哲
lgtm








 --Reply to Message--
 On Wed, Jun 19, 2024 21:17 PM pan2.li

Re: [PATCH v1 8/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 10

2024-06-19 Thread 钟居哲
lgtm








 --Reply to Message--
 On Wed, Jun 19, 2024 21:17 PM pan2.li

Re: [PATCH v1 7/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 9

2024-06-19 Thread 钟居哲
lgtm








 --Reply to Message--
 On Wed, Jun 19, 2024 21:17 PM pan2.li

Re: [PATCH v1 4/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 6

2024-06-19 Thread 钟居哲
lgtm








 --Reply to Message--
 On Wed, Jun 19, 2024 21:17 PM pan2.li

Re: [PATCH v1 3/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 5

2024-06-19 Thread 钟居哲
lgtm








 --Reply to Message--
 On Wed, Jun 19, 2024 21:17 PM pan2.li

Re: [PATCH v1 6/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 8

2024-06-19 Thread 钟居哲
lgtm








 --Reply to Message--
 On Wed, Jun 19, 2024 21:17 PM pan2.li

Re: [PATCH v1 2/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 4

2024-06-19 Thread 钟居哲
lgtm








 --Reply to Message--
 On Wed, Jun 19, 2024 21:17 PM pan2.li

[PATCH v1 5/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 7

2024-06-19 Thread pan2 . li
From: Pan Li 

After the middle-end support the form 7 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 7:
  #define DEF_VEC_SAT_U_SUB_FMT_7(T)   \
  void __attribute__((noinline))   \
  vec_sat_u_sub_##T##_fmt_7 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
T ret; \
T overflow = __builtin_sub_overflow (x, y, &ret);  \
out[i] = ret & (T)(overflow - 1);  \
  }\
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-25.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-26.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-27.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-28.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-25.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-26.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-27.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-28.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 18 +
 .../rvv/autovec/binop/vec_sat_u_sub-25.c  | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-26.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-27.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-28.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-25.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-26.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-27.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-28.c  | 75 +++
 9 files changed, 397 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-25.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-26.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-27.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-28.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-25.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-26.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-27.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-28.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index fd4d88e6f30..69fbc6b5258 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -219,6 +219,21 @@ vec_sat_u_sub_##T##_fmt_6 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }
 
+#define DEF_VEC_SAT_U_SUB_FMT_7(T)   \
+void __attribute__((noinline))   \
+vec_sat_u_sub_##T##_fmt_7 (T *out, T *op_1, T *op_2, unsigned limit) \
+{\
+  unsigned i;\
+  for (i = 0; i < limit; i++)\
+{\
+  T x = op_1[i]; \
+  T y = op_2[i]; \
+  T ret; \
+  T overflow = __builtin_sub_overflow (x, y, &ret);  \
+  out[i] = ret & (T)(overflow - 1);  \
+}\
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
 
@@ -237,4 +252,7 @@ vec_sat_u_sub_##T##_fmt_6 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 #define RUN_VEC_SAT_U_SUB_FMT_6(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_6(out, 

Re: [PATCH v1 1/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 3

2024-06-19 Thread 钟居哲
lgtm








 --Reply to Message--
 On Wed, Jun 19, 2024 21:17 PM pan2.li

[PATCH v1 8/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 10

2024-06-19 Thread pan2 . li
From: Pan Li 

After the middle-end support the form 10 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 10:
  #define DEF_VEC_SAT_U_SUB_FMT_10(T)   \
  void __attribute__((noinline))\
  vec_sat_u_sub_##T##_fmt_10 (T *out, T *op_1, T *op_2, unsigned limit) \
  { \
unsigned i; \
for (i = 0; i < limit; i++) \
  { \
T x = op_1[i];  \
T y = op_2[i];  \
T ret;  \
bool overflow = __builtin_sub_overflow (x, y, &ret);\
out[i] = !overflow ? ret : 0;   \
  } \
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-37.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-38.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-39.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-40.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-37.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-38.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-39.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-40.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 18 +
 .../rvv/autovec/binop/vec_sat_u_sub-37.c  | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-38.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-39.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-40.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-37.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-38.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-39.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-40.c  | 75 +++
 9 files changed, 397 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-37.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-38.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-39.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-40.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-37.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-38.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-39.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-40.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index e231d1e66aa..d5c81fbe5a9 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -265,6 +265,21 @@ vec_sat_u_sub_##T##_fmt_9 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }
 
+#define DEF_VEC_SAT_U_SUB_FMT_10(T)   \
+void __attribute__((noinline))\
+vec_sat_u_sub_##T##_fmt_10 (T *out, T *op_1, T *op_2, unsigned limit) \
+{ \
+  unsigned i; \
+  for (i = 0; i < limit; i++) \
+{ \
+  T x = op_1[i];  \
+  T y = op_2[i];  \
+  T ret;  \
+  bool overflow = __builtin_sub_overflow (x, y, &ret);\
+  out[i] = !overflow ? ret : 0;   \
+} \
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
 
@@ -292,4 +307,7 @@ vec_sat_u_sub_##T##_fmt_9 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 #define RUN_VEC_SAT_U_SUB_FMT_9(T, out, op_1, op_2, N) \
   vec

[PATCH v1 4/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 6

2024-06-19 Thread pan2 . li
From: Pan Li 

After the middle-end support the form 6 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 6:
  #define DEF_VEC_SAT_U_SUB_FMT_6(T)   \
  void __attribute__((noinline))   \
  vec_sat_u_sub_##T##_fmt_6 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
out[i] = x <= y ? 0 : x - y;   \
  }\
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-21.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-22.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-23.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-24.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-21.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-22.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-23.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-24.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 16 
 .../rvv/autovec/binop/vec_sat_u_sub-21.c  | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-22.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-23.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-24.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-21.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-22.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-23.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-24.c  | 75 +++
 9 files changed, 395 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-21.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-22.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-23.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-24.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-21.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-22.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-23.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-24.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index b25215c10cb..fd4d88e6f30 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -206,6 +206,19 @@ vec_sat_u_sub_##T##_fmt_5 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }
 
+#define DEF_VEC_SAT_U_SUB_FMT_6(T)   \
+void __attribute__((noinline))   \
+vec_sat_u_sub_##T##_fmt_6 (T *out, T *op_1, T *op_2, unsigned limit) \
+{\
+  unsigned i;\
+  for (i = 0; i < limit; i++)\
+{\
+  T x = op_1[i]; \
+  T y = op_2[i]; \
+  out[i] = x <= y ? 0 : x - y;   \
+}\
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
 
@@ -221,4 +234,7 @@ vec_sat_u_sub_##T##_fmt_5 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 #define RUN_VEC_SAT_U_SUB_FMT_5(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_5(out, op_1, op_2, N)
 
+#define RUN_VEC_SAT_U_SUB_FMT_6(T, out, op_1, op_2, N) \
+  vec_sat_u_sub_##T##_fmt_6(out, op_1, op_2, N)
+
 #endif
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-21.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-21.c
new 

[PATCH v1 7/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 9

2024-06-19 Thread pan2 . li
From: Pan Li 

After the middle-end support the form 9 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 9:
  #define DEF_VEC_SAT_U_SUB_FMT_9(T)   \
  void __attribute__((noinline))   \
  vec_sat_u_sub_##T##_fmt_9 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
T ret; \
bool overflow = __builtin_sub_overflow (x, y, &ret);   \
out[i] = overflow ? 0 : ret;   \
  }\
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-33.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-34.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-35.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-36.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-33.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-34.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-35.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-36.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-33.c  | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-34.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-35.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-36.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-33.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-34.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-35.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-36.c  | 75 +++
 9 files changed, 398 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-33.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-34.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-35.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-36.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-33.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-34.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-35.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-36.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index 302fc458708..e231d1e66aa 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -2,6 +2,7 @@
 #define HAVE_VEC_SAT_ARITH
 
 #include 
+#include 
 
 
/**/
 /* Saturation Add (unsigned and signed)   
*/
@@ -249,6 +250,21 @@ vec_sat_u_sub_##T##_fmt_8 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }
 
+#define DEF_VEC_SAT_U_SUB_FMT_9(T)   \
+void __attribute__((noinline))   \
+vec_sat_u_sub_##T##_fmt_9 (T *out, T *op_1, T *op_2, unsigned limit) \
+{\
+  unsigned i;\
+  for (i = 0; i < limit; i++)\
+{\
+  T x = op_1[i]; \
+  T y = op_2[i]; \
+  T ret; \
+  bool overflow = __builtin_sub_overflow (x, y, &ret);   \
+  out[i] = overflow ? 0 : ret;   \
+}\
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \

[PATCH v1 3/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 5

2024-06-19 Thread pan2 . li
From: Pan Li 

After the middle-end support the form 5 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 5:
  #define DEF_VEC_SAT_U_SUB_FMT_5(T)   \
  void __attribute__((noinline))   \
  vec_sat_u_sub_##T##_fmt_5 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
out[i] = x < y ? 0 : x - y;\
  }\
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-17.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-18.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-19.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-20.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-17.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-18.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-19.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-20.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 16 
 .../rvv/autovec/binop/vec_sat_u_sub-17.c  | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-18.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-19.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-20.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-17.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-18.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-19.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-20.c  | 75 +++
 9 files changed, 395 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-17.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-18.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-19.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-20.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-17.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-18.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-19.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-20.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index a83f964df0c..b25215c10cb 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -193,6 +193,19 @@ vec_sat_u_sub_##T##_fmt_4 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }
 
+#define DEF_VEC_SAT_U_SUB_FMT_5(T)   \
+void __attribute__((noinline))   \
+vec_sat_u_sub_##T##_fmt_5 (T *out, T *op_1, T *op_2, unsigned limit) \
+{\
+  unsigned i;\
+  for (i = 0; i < limit; i++)\
+{\
+  T x = op_1[i]; \
+  T y = op_2[i]; \
+  out[i] = x < y ? 0 : x - y;\
+}\
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
 
@@ -205,4 +218,7 @@ vec_sat_u_sub_##T##_fmt_4 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 #define RUN_VEC_SAT_U_SUB_FMT_4(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_4(out, op_1, op_2, N)
 
+#define RUN_VEC_SAT_U_SUB_FMT_5(T, out, op_1, op_2, N) \
+  vec_sat_u_sub_##T##_fmt_5(out, op_1, op_2, N)
+
 #endif
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-17.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-17.c
new 

[PATCH v1 6/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 8

2024-06-19 Thread pan2 . li
From: Pan Li 

After the middle-end support the form 8 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 8:
  #define DEF_VEC_SAT_U_SUB_FMT_8(T)   \
  void __attribute__((noinline))   \
  vec_sat_u_sub_##T##_fmt_8 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
T ret; \
T overflow = __builtin_sub_overflow (x, y, &ret);  \
out[i] = ret & (T)-(!overflow);\
  }\
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-29.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-30.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-31.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-32.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-29.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-30.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-31.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-32.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 18 +
 .../rvv/autovec/binop/vec_sat_u_sub-29.c  | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-30.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-31.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-32.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-29.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-30.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-31.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-32.c  | 75 +++
 9 files changed, 397 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-29.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-30.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-31.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-29.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-30.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-31.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-32.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index 69fbc6b5258..302fc458708 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -234,6 +234,21 @@ vec_sat_u_sub_##T##_fmt_7 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }
 
+#define DEF_VEC_SAT_U_SUB_FMT_8(T)   \
+void __attribute__((noinline))   \
+vec_sat_u_sub_##T##_fmt_8 (T *out, T *op_1, T *op_2, unsigned limit) \
+{\
+  unsigned i;\
+  for (i = 0; i < limit; i++)\
+{\
+  T x = op_1[i]; \
+  T y = op_2[i]; \
+  T ret; \
+  T overflow = __builtin_sub_overflow (x, y, &ret);  \
+  out[i] = ret & (T)-(!overflow);\
+}\
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
 
@@ -255,4 +270,7 @@ vec_sat_u_sub_##T##_fmt_7 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 #define RUN_VEC_SAT_U_SUB_FMT_7(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_7(out, 

[PATCH v1 1/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 3

2024-06-19 Thread pan2 . li
From: Pan Li 

After the middle-end support the form 3 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 3:
  #define DEF_VEC_SAT_U_SUB_FMT_3(T)   \
  void __attribute__((noinline))   \
  vec_sat_u_sub_##T##_fmt_3 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
out[i] = x > y ? x - y : 0;\
  }\
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-11.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-12.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-9.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-10.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-11.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-12.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-9.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 17 +
 .../rvv/autovec/binop/vec_sat_u_sub-10.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-11.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-12.c  | 20 +
 .../riscv/rvv/autovec/binop/vec_sat_u_sub-9.c | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-10.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-11.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-12.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-9.c   | 75 +++
 9 files changed, 396 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-12.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-9.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-10.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-11.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-12.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-9.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index 443f88261ba..182cf2cf064 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -167,9 +167,26 @@ vec_sat_u_sub_##T##_fmt_2 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }
 
+#define DEF_VEC_SAT_U_SUB_FMT_3(T)   \
+void __attribute__((noinline))   \
+vec_sat_u_sub_##T##_fmt_3 (T *out, T *op_1, T *op_2, unsigned limit) \
+{\
+  unsigned i;\
+  for (i = 0; i < limit; i++)\
+{\
+  T x = op_1[i]; \
+  T y = op_2[i]; \
+  out[i] = x > y ? x - y : 0;\
+}\
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
+
 #define RUN_VEC_SAT_U_SUB_FMT_2(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_2(out, op_1, op_2, N)
 
+#define RUN_VEC_SAT_U_SUB_FMT_3(T, out, op_1, op_2, N) \
+  vec_sat_u_sub_##T##_fmt_3(out, op_1, op_2, N)
+
 #endif
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-10.c
new file mode 100644
index 000..e1c4020b36d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/ri

[PATCH v1 2/8] RISC-V: Add testcases for unsigned .SAT_SUB vector form 4

2024-06-19 Thread pan2 . li
From: Pan Li 

After the middle-end support the form 4 of unsigned SAT_SUB and
the RISC-V backend implement the .SAT_SUB for vector mode,  thus
add more test case to cover that.

Form 4:
  #define DEF_VEC_SAT_U_SUB_FMT_4(T)   \
  void __attribute__((noinline))   \
  vec_sat_u_sub_##T##_fmt_4 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
out[i] = x >= y ? x - y : 0;   \
  }\
  }

Passed the rv64gcv regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h: Add test macro.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-13.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-14.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-15.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-16.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-13.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-14.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-15.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-16.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/binop/vec_sat_arith.h   | 16 
 .../rvv/autovec/binop/vec_sat_u_sub-13.c  | 19 +
 .../rvv/autovec/binop/vec_sat_u_sub-14.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-15.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-16.c  | 20 +
 .../rvv/autovec/binop/vec_sat_u_sub-run-13.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-14.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-15.c  | 75 +++
 .../rvv/autovec/binop/vec_sat_u_sub-run-16.c  | 75 +++
 9 files changed, 395 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-13.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-14.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-15.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-13.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-14.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-15.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-run-16.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
index 182cf2cf064..a83f964df0c 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_arith.h
@@ -180,6 +180,19 @@ vec_sat_u_sub_##T##_fmt_3 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 }\
 }
 
+#define DEF_VEC_SAT_U_SUB_FMT_4(T)   \
+void __attribute__((noinline))   \
+vec_sat_u_sub_##T##_fmt_4 (T *out, T *op_1, T *op_2, unsigned limit) \
+{\
+  unsigned i;\
+  for (i = 0; i < limit; i++)\
+{\
+  T x = op_1[i]; \
+  T y = op_2[i]; \
+  out[i] = x >= y ? x - y : 0;   \
+}\
+}
+
 #define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
 
@@ -189,4 +202,7 @@ vec_sat_u_sub_##T##_fmt_3 (T *out, T *op_1, T *op_2, 
unsigned limit) \
 #define RUN_VEC_SAT_U_SUB_FMT_3(T, out, op_1, op_2, N) \
   vec_sat_u_sub_##T##_fmt_3(out, op_1, op_2, N)
 
+#define RUN_VEC_SAT_U_SUB_FMT_4(T, out, op_1, op_2, N) \
+  vec_sat_u_sub_##T##_fmt_4(out, op_1, op_2, N)
+
 #endif
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-13.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_u_sub-13.c
new 

Re: [PATCH] tree-optimization/114413 - SLP CSE after permute optimization

2024-06-19 Thread Richard Sandiford
Richard Biener  writes:
> On Wed, 19 Jun 2024, Richard Sandiford wrote:
>
>> Richard Biener  writes:
>> > We currently fail to re-CSE SLP nodes after optimizing permutes
>> > which results in off cost estimates.  For gcc.dg/vect/bb-slp-32.c
>> > this shows in not re-using the SLP node with the load and arithmetic
>> > for both the store and the reduction.  The following implements
>> > CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.
>> >
>> > I've tried to make the CSE part of permute materialization but it
>> > isn't a very good fit there.  I've not bothered to implement something
>> > more complete, also handling external defs or defs without
>> > SLP_TREE_SCALAR_STMTS.
>> >
>> > I realize this might result in more BB SLP which in turn might slow
>> > down code given costing for BB SLP is difficult (even that we now
>> > vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
>> > This is nevertheless feeding more accurate info to costing which is
>> > good.
>> >
>> > Bootstrapped and tested on x86_64-unknown-linux-gnu.
>> >
>> > Does this look OK?
>> >
>> > Thanks,
>> > Richard.
>> >
>> >PR tree-optimization/114413
>> >* tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
>> >New function, split out from ...
>> >(vect_analyze_slp): ... here.  Call it.
>> >(vect_cse_slp_nodes): New function.
>> >(vect_optimize_slp): Call it.
>> >
>> >* gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
>> > ---
>> >  gcc/testsuite/gcc.dg/vect/bb-slp-32.c |  6 +++
>> >  gcc/tree-vect-slp.cc  | 77 ++-
>> >  2 files changed, 71 insertions(+), 12 deletions(-)
>> >
>> > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c 
>> > b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
>> > index 4f72727b694..475b241c36e 100644
>> > --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
>> > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
>> > @@ -38,3 +38,9 @@ int main()
>> >  abort ();
>> >return 0;
>> >  }
>> > +
>> > +/* This is a weak test but we want to re-use the arithmetic for both the
>> > +   store and the reduction.  */
>> > +/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { 
>> > x86_64-*-* i?86-*-* } } } } */
>> > +/* On i386 we vectorize both the store and the reduction.  */
>> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 
>> > "slp2" { target { x86_64-*-* i?86-*-* } } } } */
>> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
>> > index 2552dacbd69..980d1e7267d 100644
>> > --- a/gcc/tree-vect-slp.cc
>> > +++ b/gcc/tree-vect-slp.cc
>> > @@ -1586,6 +1586,23 @@ bst_traits::equal (value_type existing, value_type 
>> > candidate)
>> >return true;
>> >  }
>> >  
>> > +typedef hash_map <vec<stmt_vec_info>, slp_tree,
>> > +                  simple_hashmap_traits <bst_traits, slp_tree> >
>> > +  scalar_stmts_to_slp_tree_map_t;
>> > +
>> > +/* Release BST_MAP.  */
>> > +
>> > +static void
>> > +release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t 
>> > *bst_map)
>> > +{
>> > +  /* The map keeps a reference on SLP nodes built, release that.  */
>> > +  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
>> > +   it != bst_map->end (); ++it)
>> > +if ((*it).second)
>> > +  vect_free_slp_tree ((*it).second);
>> > +  delete bst_map;
>> > +}
>> > +
>> >  /* ???  This was std::pair<std::pair<tree_code, char>, tree>
>> > but then vec::insert does memmove and that's not compatible with
>> > std::pair.  */
>> > @@ -1684,10 +1701,6 @@ vect_slp_linearize_chain (vec_info *vinfo,
>> >  }
>> >  }
>> >  
>> > -typedef hash_map <vec<stmt_vec_info>, slp_tree,
>> > -                  simple_hashmap_traits <bst_traits, slp_tree> >
>> > -  scalar_stmts_to_slp_tree_map_t;
>> > -
>> >  static slp_tree
>> >  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>> >   vec stmts, unsigned int group_size,
>> > @@ -4308,14 +4321,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
>> > max_tree_size)
>> >}
>> >  }
>> >  
>> > -
>> > -
>> > -  /* The map keeps a reference on SLP nodes built, release that.  */
>> > -  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
>> > -   it != bst_map->end (); ++it)
>> > -if ((*it).second)
>> > -  vect_free_slp_tree ((*it).second);
>> > -  delete bst_map;
>> > +  release_scalar_stmts_to_slp_tree_map (bst_map);
>> >  
>> >if (pattern_found && dump_enabled_p ())
>> >  {
>> > @@ -6373,6 +6379,43 @@ vect_optimize_slp_pass::run ()
>> >free_graph (m_slpg);
>> >  }
>> >  
>> > +/* Apply CSE to NODE and its children using BST_MAP.  */
>> > +
>> > +static void
>> > +vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& 
>> > node)
>> > +{
>> > +  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
>> > +  && SLP_TREE_CODE (node) != VEC_PERM_EXPR
>> > +  /* Besides some VEC_PERM_EXPR, two-operator nodes also
>> > +   lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
>> > +   we'd have sth that works for all internal and externa

[committed] libstdc++: Fix warning regressions in

2024-06-19 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk.

-- >8 --

I caused some new warnings with -Wsystem-headers with my recent changes
to std::get_temporary_buffer and std::_Temporary_buffer. There's a
-Wsign-compare warning which can be avoided by casting the ptrdiff_t
argument to size_t (which also conveniently rejects negative values).

There's also a -Wdeprecated-declarations warning because I moved where
std::get_temporary_buffer is called, but didn't move the diagnostic
pragmas that suppress the warning for calling it.

libstdc++-v3/ChangeLog:

* include/bits/stl_tempbuf.h (__get_temporary_buffer): Cast
argument to size_t to handle negative values and suppress
-Wsign-compare warning.
(_Temporary_buffer): Move diagnostic pragmas to new location of
call to std::get_temporary_buffer.
---
 libstdc++-v3/include/bits/stl_tempbuf.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libstdc++-v3/include/bits/stl_tempbuf.h 
b/libstdc++-v3/include/bits/stl_tempbuf.h
index fa03fd27704..759c4937744 100644
--- a/libstdc++-v3/include/bits/stl_tempbuf.h
+++ b/libstdc++-v3/include/bits/stl_tempbuf.h
@@ -82,7 +82,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   inline _Tp*
   __get_temporary_buffer(ptrdiff_t __len) _GLIBCXX_NOTHROW
   {
-   if (__builtin_expect(__len > (size_t(-1) / sizeof(_Tp)), 0))
+   if (__builtin_expect(size_t(__len) > (size_t(-1) / sizeof(_Tp)), 0))
  return 0;
 
 #if __cpp_aligned_new
@@ -200,6 +200,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   size_type  _M_original_len;
   struct _Impl
   {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
explicit
_Impl(ptrdiff_t __original_len)
{
@@ -208,6 +210,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  _M_len = __p.second;
  _M_buffer = __p.first;
}
+#pragma GCC diagnostic pop
 
~_Impl()
{ std::__detail::__return_temporary_buffer(_M_buffer, _M_len); }
@@ -315,8 +318,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  __ucr(__first, __last, __seed);
 }
 
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
   template
 _Temporary_buffer<_ForwardIterator, _Tp>::
 _Temporary_buffer(_ForwardIterator __seed, size_type __original_len)
@@ -324,7 +325,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 {
   std::__uninitialized_construct_buf(begin(), end(), __seed);
 }
-#pragma GCC diagnostic pop
 
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
-- 
2.45.1



Re: [PATCH] tree-optimization/114413 - SLP CSE after permute optimization

2024-06-19 Thread Richard Biener
On Wed, 19 Jun 2024, Richard Sandiford wrote:

> Richard Biener  writes:
> > We currently fail to re-CSE SLP nodes after optimizing permutes
> > which results in off cost estimates.  For gcc.dg/vect/bb-slp-32.c
> > this shows in not re-using the SLP node with the load and arithmetic
> > for both the store and the reduction.  The following implements
> > CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.
> >
> > I've tried to make the CSE part of permute materialization but it
> > isn't a very good fit there.  I've not bothered to implement something
> > more complete, also handling external defs or defs without
> > SLP_TREE_SCALAR_STMTS.
> >
> > I realize this might result in more BB SLP which in turn might slow
> > down code given costing for BB SLP is difficult (even that we now
> > vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
> > This is nevertheless feeding more accurate info to costing which is
> > good.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> >
> > Does this look OK?
> >
> > Thanks,
> > Richard.
> >
> > PR tree-optimization/114413
> > * tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
> > New function, split out from ...
> > (vect_analyze_slp): ... here.  Call it.
> > (vect_cse_slp_nodes): New function.
> > (vect_optimize_slp): Call it.
> >
> > * gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
> > ---
> >  gcc/testsuite/gcc.dg/vect/bb-slp-32.c |  6 +++
> >  gcc/tree-vect-slp.cc  | 77 ++-
> >  2 files changed, 71 insertions(+), 12 deletions(-)
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c 
> > b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > index 4f72727b694..475b241c36e 100644
> > --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> > @@ -38,3 +38,9 @@ int main()
> >  abort ();
> >return 0;
> >  }
> > +
> > +/* This is a weak test but we want to re-use the arithmetic for both the
> > +   store and the reduction.  */
> > +/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { 
> > x86_64-*-* i?86-*-* } } } } */
> > +/* On i386 we vectorize both the store and the reduction.  */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 
> > "slp2" { target { x86_64-*-* i?86-*-* } } } } */
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index 2552dacbd69..980d1e7267d 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -1586,6 +1586,23 @@ bst_traits::equal (value_type existing, value_type 
> > candidate)
> >return true;
> >  }
> >  
> > +typedef hash_map <vec<stmt_vec_info>, slp_tree,
> > +                 simple_hashmap_traits <bst_traits, slp_tree> >
> > +  scalar_stmts_to_slp_tree_map_t;
> > +
> > +/* Release BST_MAP.  */
> > +
> > +static void
> > +release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t 
> > *bst_map)
> > +{
> > +  /* The map keeps a reference on SLP nodes built, release that.  */
> > +  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> > +   it != bst_map->end (); ++it)
> > +if ((*it).second)
> > +  vect_free_slp_tree ((*it).second);
> > +  delete bst_map;
> > +}
> > +
> >  /* ???  This was std::pair<std::pair<tree_code, char>, tree>
> > but then vec::insert does memmove and that's not compatible with
> > std::pair.  */
> > @@ -1684,10 +1701,6 @@ vect_slp_linearize_chain (vec_info *vinfo,
> >  }
> >  }
> >  
> > -typedef hash_map <vec<stmt_vec_info>, slp_tree,
> > -                 simple_hashmap_traits <bst_traits, slp_tree> >
> > -  scalar_stmts_to_slp_tree_map_t;
> > -
> >  static slp_tree
> >  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
> >vec stmts, unsigned int group_size,
> > @@ -4308,14 +4321,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
> > max_tree_size)
> > }
> >  }
> >  
> > -
> > -
> > -  /* The map keeps a reference on SLP nodes built, release that.  */
> > -  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> > -   it != bst_map->end (); ++it)
> > -if ((*it).second)
> > -  vect_free_slp_tree ((*it).second);
> > -  delete bst_map;
> > +  release_scalar_stmts_to_slp_tree_map (bst_map);
> >  
> >if (pattern_found && dump_enabled_p ())
> >  {
> > @@ -6373,6 +6379,43 @@ vect_optimize_slp_pass::run ()
> >free_graph (m_slpg);
> >  }
> >  
> > +/* Apply CSE to NODE and its children using BST_MAP.  */
> > +
> > +static void
> > +vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& 
> > node)
> > +{
> > +  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
> > +  && SLP_TREE_CODE (node) != VEC_PERM_EXPR
> > +  /* Besides some VEC_PERM_EXPR, two-operator nodes also
> > +lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
> > +we'd have sth that works for all internal and external nodes.  */
> > +  && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
> > +{
> > +  if (slp_tree *leader = bst_map->get (SLP_TREE_SCAL

Re: [PATCH 5/8] vect: Use an array to replace 3 relevant variables

2024-06-19 Thread Richard Biener
On Sun, Jun 16, 2024 at 9:27 AM Feng Xue OS  wrote:
>
> It's better to place 3 relevant independent variables into array, since we
> have requirement to access them via an index in the following patch. At the
> same time, this change may get some duplicated code be more compact.

OK.  I might have caused a conflict for this patch, so even OK after conflict
resolution.

Thanks,
Richard.

> Thanks,
> Feng
>
> ---
> gcc/
> * tree-vect-loop.cc (vect_transform_reduction): Replace 
> vec_oprnds0/1/2
> with one new array variable vec_oprnds[3].
> ---
>  gcc/tree-vect-loop.cc | 42 +-
>  1 file changed, 17 insertions(+), 25 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 39aa5cb1197..7909d63d4df 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -8605,9 +8605,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>
>/* Transform.  */
>tree new_temp = NULL_TREE;
> -  auto_vec<tree> vec_oprnds0;
> -  auto_vec<tree> vec_oprnds1;
> -  auto_vec<tree> vec_oprnds2;
> +  auto_vec<tree> vec_oprnds[3];
>
>if (dump_enabled_p ())
>  dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> @@ -8657,12 +8655,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  {
>vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  single_defuse_cycle && reduc_index == 0
> -? NULL_TREE : op.ops[0], &vec_oprnds0,
> +? NULL_TREE : op.ops[0], &vec_oprnds[0],
>  single_defuse_cycle && reduc_index == 1
> -? NULL_TREE : op.ops[1], &vec_oprnds1,
> +? NULL_TREE : op.ops[1], &vec_oprnds[1],
>  op.num_ops == 3
>  && !(single_defuse_cycle && reduc_index == 2)
> -? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
>  }
>else
>  {
> @@ -8670,12 +8668,12 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  vectype.  */
>gcc_assert (single_defuse_cycle
>   && (reduc_index == 1 || reduc_index == 2));
> -  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> -op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
> +  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, op.ops[0],
> +truth_type_for (vectype_in), &vec_oprnds[0],
>  reduc_index == 1 ? NULL_TREE : op.ops[1],
> -NULL_TREE, &vec_oprnds1,
> +NULL_TREE, &vec_oprnds[1],
>  reduc_index == 2 ? NULL_TREE : op.ops[2],
> -NULL_TREE, &vec_oprnds2);
> +NULL_TREE, &vec_oprnds[2]);
>  }
>
>/* For single def-use cycles get one copy of the vectorized reduction
> @@ -8683,20 +8681,21 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>if (single_defuse_cycle)
>  {
>vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
> -reduc_index == 0 ? op.ops[0] : NULL_TREE, 
> &vec_oprnds0,
> -reduc_index == 1 ? op.ops[1] : NULL_TREE, 
> &vec_oprnds1,
> +reduc_index == 0 ? op.ops[0] : NULL_TREE,
> +&vec_oprnds[0],
> +reduc_index == 1 ? op.ops[1] : NULL_TREE,
> +&vec_oprnds[1],
>  reduc_index == 2 ? op.ops[2] : NULL_TREE,
> -&vec_oprnds2);
> +&vec_oprnds[2]);
>  }
>
>bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> +  unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
>
> -  unsigned num = (reduc_index == 0
> - ? vec_oprnds1.length () : vec_oprnds0.length ());
>for (unsigned i = 0; i < num; ++i)
>  {
>gimple *new_stmt;
> -  tree vop[3] = { vec_oprnds0[i], vec_oprnds1[i], NULL_TREE };
> +  tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
>if (masked_loop_p && !mask_by_cond_expr)
> {
>   /* No conditional ifns have been defined for dot-product yet.  */
> @@ -8721,7 +8720,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>else
> {
>   if (op.num_ops >= 3)
> -   vop[2] = vec_oprnds2[i];
> +   vop[2] = vec_oprnds[2][i];
>
>   if (masked_loop_p && mask_by_cond_expr)
> {
> @@ -8752,14 +8751,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> }
>
>if (single_defuse_cycle && i < num - 1)
> -   {
> - if (reduc_index == 0)
> -   vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> - else if (reduc_index == 1)
> -   vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
> -  

Re: [PATCH 6/8] vect: Tighten an assertion for lane-reducing in transform

2024-06-19 Thread Richard Biener
On Sun, Jun 16, 2024 at 9:28 AM Feng Xue OS  wrote:
>
> According to logic of code nearby the assertion, all lane-reducing operations
> should not appear, not just DOT_PROD_EXPR. Since "use_mask_by_cond_expr_p"
> treats SAD_EXPR same as DOT_PROD_EXPR, and WIDEN_SUM_EXPR should not be 
> allowed
> by the following assertion "gcc_assert (commutative_binary_op_p (...))", so
> tighten the assertion.

OK.

Thanks,
Richard.

> Thanks,
> Feng
>
> ---
> gcc/
> * tree-vect-loop.cc (vect_transform_reduction): Change assertion to
> cover all lane-reducing ops.
> ---
>  gcc/tree-vect-loop.cc | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 7909d63d4df..e0561feddce 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -8643,7 +8643,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  }
>
>bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> -  gcc_assert (single_defuse_cycle || lane_reducing_op_p (code));
> +  bool lane_reducing = lane_reducing_op_p (code);
> +  gcc_assert (single_defuse_cycle || lane_reducing);
>
>/* Create the destination vector  */
>tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> @@ -8698,8 +8699,9 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
>if (masked_loop_p && !mask_by_cond_expr)
> {
> - /* No conditional ifns have been defined for dot-product yet.  */
> - gcc_assert (code != DOT_PROD_EXPR);
> + /* No conditional ifns have been defined for lane-reducing op
> +yet.  */
> + gcc_assert (!lane_reducing);
>
>   /* Make sure that the reduction accumulator is vop[0].  */
>   if (reduc_index == 1)
> --
> 2.17.1


Re: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing

2024-06-19 Thread Richard Biener
On Sun, Jun 16, 2024 at 9:25 AM Feng Xue OS  wrote:
>
> The input vectype of reduction PHI statement must be determined before
> vect cost computation for the reduction. Since a lane-reducing operation has
> a different input vectype from a normal one, we need to traverse all reduction
> statements to find out the input vectype with the least lanes, and set that to
> the PHI statement.
>
> Thanks,
> Feng
>
> ---
> gcc/
> * tree-vect-loop.cc (vectorizable_reduction): Determine input vectype
> during traversal of reduction statements.
> ---
>  gcc/tree-vect-loop.cc | 72 +--
>  1 file changed, 49 insertions(+), 23 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 0f7b125e72d..39aa5cb1197 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  {
>stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
>stmt_vec_info vdef = vect_stmt_to_vectorize (def);
> -  if (STMT_VINFO_REDUC_IDX (vdef) == -1)
> +  int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
> +
> +  if (reduc_idx == -1)
> {
>   if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -7686,10 +7688,50 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>   return false;
> }
> }
> -  else if (!stmt_info)
> -   /* First non-conversion stmt.  */
> -   stmt_info = vdef;
> -  reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> +  else
> +   {
> + /* First non-conversion stmt.  */
> + if (!stmt_info)
> +   stmt_info = vdef;
> +
> + if (lane_reducing_op_p (op.code))
> +   {
> + unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
> + tree op_type = TREE_TYPE (op.ops[0]);
> + tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
> +op_type,
> +group_size);

I think doing it this way does not adhere to the vector type size constraint
with loop vectorization.  You should use vect_is_simple_use like the
original code did as the actual vector definition determines the vector type
used.

You are always using op.ops[0] here - I think that works because
reduc_idx is the last operand of all lane-reducing ops.  But then
we should assert reduc_idx != 0 here and add a comment.

> +
> + /* The last operand of lane-reducing operation is for
> +reduction.  */
> + gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
> +
> + /* For lane-reducing operation vectorizable analysis needs the
> +reduction PHI information */
> + STMT_VINFO_REDUC_DEF (def) = phi_info;
> +
> + if (!new_vectype_in)
> +   return false;
> +
> + /* Each lane-reducing operation has its own input vectype, while
> +reduction PHI will record the input vectype with the least
> +lanes.  */
> + STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
> +
> + /* To accommodate lane-reducing operations of mixed input
> +vectypes, choose input vectype with the least lanes for the
> +reduction PHI statement, which would result in the most
> +ncopies for vectorized reduction results.  */
> + if (!vectype_in
> + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE 
> (vectype_in)))
> +  < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type
> +   vectype_in = new_vectype_in;

I know this is a fragile area but I always wonder since the accumulating operand
is the largest (all lane-reducing ops are widening), and that will be
equal to the
type of the PHI node, how this condition can be ever true.

ncopies is determined by the VF, so the comment is at least misleading.

> +   }
> + else
> +   vectype_in = STMT_VINFO_VECTYPE (phi_info);

Please initialize vectype_in from phi_info before the loop (that
should never be NULL).

I'll note that with your patch it seems we'd initialize vectype_in to
the biggest
non-accumulation vector type involved in lane-reducing ops but the accumulating
type might still be larger.   Why, when we have multiple lane-reducing
ops, would
we chose the largest input here?  I see we eventually do

  if (slp_node)
ncopies = 1;
  else
ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

but then IIRC we always force a single cycle def for lane-reducing ops(?).
In particular for vect_transform_reduction and SLP we rely on
SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
STMT_VINFO_REDUC_VECTYPE_IN.

So I wonder what breaks when we set vectype_in = vector type of PHI?

Re: [PATCH v2] Arm: Fix ldrd offset range [PR115153]

2024-06-19 Thread Richard Earnshaw (lists)
On 11/06/2024 17:42, Wilco Dijkstra wrote:
> v2: use a new arm_arch_v7ve_neon, fix use of DImode in output_move_neon
> 
> The valid offset range of LDRD in arm_legitimate_index_p is increased to
> -1024..1020 if NEON is enabled since VALID_NEON_DREG_MODE includes DImode.
> Fix this by moving the LDRD check earlier.
> 
> Passes bootstrap & regress, OK for commit?
> 
> gcc:
> PR target/115153
> * config/arm/arm.cc (arm_legitimate_index_p): Move LDRD case before 
> NEON.
> (thumb2_legitimate_index_p): Update comments.
> (output_move_neon): Use DFmode for vldr/vstr.
> * lib/target-supports.exp: Add arm_arch_v7ve_neon target support.
> 
> gcc/testsuite:
> PR target/115153
> * gcc.target/arm/pr115153.c: Add new test.

The Linaro CI is reporting an ICE while building libgfortran with this change.

# 00:14:58 
/home/tcwg-build/workspace/tcwg_gnu_3/abe/snapshots/gcc.git~master/libgfortran/generated/matmul_i1.c:3006:1:
 internal compiler error: in change_address_1, at emit-rtl.cc:2299
# 00:14:58 make[3]: *** [Makefile:4262: generated/matmul_i1.lo] Error 1
# 00:14:58 make[2]: *** [Makefile:1861: all] Error 2
# 00:14:58 make[1]: *** [Makefile:15767: all-target-libgfortran] Error 2
# 00:14:58 make: *** [Makefile:1065: all] Error 2

Could you investigate please?

R.

> 
> ---
> 
> diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
> index 
> ea0c963a4d67ecd70e1571624e84dfe46d757df9..7dec0254f5a953050c9c52aa297fad7f3dfb6c74
>  100644
> --- a/gcc/config/arm/arm.cc
> +++ b/gcc/config/arm/arm.cc
> @@ -8852,6 +8852,28 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
> RTX_CODE outer,
>   && INTVAL (index) > -1024
>   && (INTVAL (index) & 3) == 0);
>  
> +  if (arm_address_register_rtx_p (index, strict_p)
> +  && (GET_MODE_SIZE (mode) <= 4))
> +return 1;
> +
> +  /* This handles DFmode only if !TARGET_HARD_FLOAT.  */
> +  if (mode == DImode || mode == DFmode)
> +{
> +  if (code == CONST_INT)
> + {
> +   HOST_WIDE_INT val = INTVAL (index);
> +
> +   /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
> +  If vldr is selected it uses arm_coproc_mem_operand.  */
> +   if (TARGET_LDRD)
> + return val > -256 && val < 256;
> +   else
> + return val > -4096 && val < 4092;
> + }
> +
> +  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
> +}
> +
>/* For quad modes, we restrict the constant offset to be slightly less
>   than what the instruction format permits.  We do this because for
>   quad mode moves, we will actually decompose them into two separate
> @@ -8864,7 +8886,7 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
> RTX_CODE outer,
>   && (INTVAL (index) & 3) == 0);
>  
>/* We have no such constraint on double mode offsets, so we permit the
> - full range of the instruction format.  */
> + full range of the instruction format.  Note DImode is included here.  */
>if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
>  return (code == CONST_INT
>   && INTVAL (index) < 1024
> @@ -8877,27 +8899,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
> RTX_CODE outer,
>   && INTVAL (index) > -1024
>   && (INTVAL (index) & 3) == 0);
>  
> -  if (arm_address_register_rtx_p (index, strict_p)
> -  && (GET_MODE_SIZE (mode) <= 4))
> -return 1;
> -
> -  if (mode == DImode || mode == DFmode)
> -{
> -  if (code == CONST_INT)
> - {
> -   HOST_WIDE_INT val = INTVAL (index);
> -
> -   /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
> -  If vldr is selected it uses arm_coproc_mem_operand.  */
> -   if (TARGET_LDRD)
> - return val > -256 && val < 256;
> -   else
> - return val > -4096 && val < 4092;
> - }
> -
> -  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
> -}
> -
>if (GET_MODE_SIZE (mode) <= 4
>&& ! (arm_arch4
>   && (mode == HImode
> @@ -9000,7 +9001,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx 
> index, int strict_p)
>   && (INTVAL (index) & 3) == 0);
>  
>/* We have no such constraint on double mode offsets, so we permit the
> - full range of the instruction format.  */
> + full range of the instruction format.  Note DImode is included here.  */
>if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
>  return (code == CONST_INT
>   && INTVAL (index) < 1024
> @@ -9011,6 +9012,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx 
> index, int strict_p)
>&& (GET_MODE_SIZE (mode) <= 4))
>  return 1;
>  
> +  /* This handles DImode if !TARGET_NEON, and DFmode if !TARGET_VFP_BASE.  */
>if (mode == DImode || mode == DFmode)
>  {
>if (code == CONST_INT)
> @@ -20854,7 +20856,7 @@ output_move_neon (rtx *operands)
>   /* We're only using DImode here because it's a convenient
>  size.  */
>   op

Re: [PATCH] tree-optimization/114413 - SLP CSE after permute optimization

2024-06-19 Thread Richard Sandiford
Richard Biener  writes:
> We currently fail to re-CSE SLP nodes after optimizing permutes
> which results in off cost estimates.  For gcc.dg/vect/bb-slp-32.c
> this shows in not re-using the SLP node with the load and arithmetic
> for both the store and the reduction.  The following implements
> CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.
>
> I've tried to make the CSE part of permute materialization but it
> isn't a very good fit there.  I've not bothered to implement something
> more complete, also handling external defs or defs without
> SLP_TREE_SCALAR_STMTS.
>
> I realize this might result in more BB SLP which in turn might slow
> down code given costing for BB SLP is difficult (even that we now
> vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
> This is nevertheless feeding more accurate info to costing which is
> good.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> Does this look OK?
>
> Thanks,
> Richard.
>
>   PR tree-optimization/114413
>   * tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
>   New function, split out from ...
>   (vect_analyze_slp): ... here.  Call it.
>   (vect_cse_slp_nodes): New function.
>   (vect_optimize_slp): Call it.
>
>   * gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
> ---
>  gcc/testsuite/gcc.dg/vect/bb-slp-32.c |  6 +++
>  gcc/tree-vect-slp.cc  | 77 ++-
>  2 files changed, 71 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> index 4f72727b694..475b241c36e 100644
> --- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
> @@ -38,3 +38,9 @@ int main()
>  abort ();
>return 0;
>  }
> +
> +/* This is a weak test but we want to re-use the arithmetic for both the
> +   store and the reduction.  */
> +/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { 
> x86_64-*-* i?86-*-* } } } } */
> +/* On i386 we vectorize both the store and the reduction.  */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 "slp2" 
> { target { x86_64-*-* i?86-*-* } } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 2552dacbd69..980d1e7267d 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -1586,6 +1586,23 @@ bst_traits::equal (value_type existing, value_type 
> candidate)
>return true;
>  }
>  
> +typedef hash_map , slp_tree,
> +   simple_hashmap_traits  >
> +  scalar_stmts_to_slp_tree_map_t;
> +
> +/* Release BST_MAP.  */
> +
> +static void
> +release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t 
> *bst_map)
> +{
> +  /* The map keeps a reference on SLP nodes built, release that.  */
> +  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> +   it != bst_map->end (); ++it)
> +if ((*it).second)
> +  vect_free_slp_tree ((*it).second);
> +  delete bst_map;
> +}
> +
>  /* ???  This was std::pair, tree>
> but then vec::insert does memmove and that's not compatible with
> std::pair.  */
> @@ -1684,10 +1701,6 @@ vect_slp_linearize_chain (vec_info *vinfo,
>  }
>  }
>  
> -typedef hash_map , slp_tree,
> -   simple_hashmap_traits  >
> -  scalar_stmts_to_slp_tree_map_t;
> -
>  static slp_tree
>  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>  vec stmts, unsigned int group_size,
> @@ -4308,14 +4321,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
> max_tree_size)
>   }
>  }
>  
> -
> -
> -  /* The map keeps a reference on SLP nodes built, release that.  */
> -  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
> -   it != bst_map->end (); ++it)
> -if ((*it).second)
> -  vect_free_slp_tree ((*it).second);
> -  delete bst_map;
> +  release_scalar_stmts_to_slp_tree_map (bst_map);
>  
>if (pattern_found && dump_enabled_p ())
>  {
> @@ -6373,6 +6379,43 @@ vect_optimize_slp_pass::run ()
>free_graph (m_slpg);
>  }
>  
> +/* Apply CSE to NODE and its children using BST_MAP.  */
> +
> +static void
> +vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
> +{
> +  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
> +  && SLP_TREE_CODE (node) != VEC_PERM_EXPR
> +  /* Besides some VEC_PERM_EXPR, two-operator nodes also
> +  lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
> +  we'd have sth that works for all internal and external nodes.  */
> +  && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
> +{
> +  if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
> + {
> +   if (*leader != node)
> + {
> +   if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> +  "re-using SLP tree %p for %p\n",
> +  (void *)*leader, (v

Re: [PING^3][PATCH v2] rs6000: Fix issue in specifying PTImode as an attribute [PR106895]

2024-06-19 Thread jeevitha
Ping!

Please review. Are there any more changes required?

Thanks & Regards
Jeevitha

On 21/05/24 10:28 am, jeevitha wrote:
> Ping!
> 
> please review.
> 
> Thanks & Regards
> Jeevitha
> 
> 
> On 17/04/24 2:44 pm, jeevitha wrote:
>> Ping!
>>
>> I've incorporated all the suggested changes. Please review.
>>
>> Thanks & Regards
>> Jeevitha
>>
>> On 21/03/24 6:21 pm, jeevitha wrote:
>>> Hi All,
>>>
>>> The following patch has been bootstrapped and regtested on 
>>> powerpc64le-linux.
>>>
>>> PTImode assists in generating even/odd register pairs on 128 bits. When the 
>>> user 
>>> specifies PTImode as an attribute, it breaks because there is no internal 
>>> type 
>>> to handle this mode. To address this, we have created a tree node with 
>>> dummy type
>>> to handle PTImode. We are not documenting this dummy type since users are 
>>> not
>>> allowed to use this type externally.
>>>
>>> 2024-03-21  Jeevitha Palanisamy  
>>>
>>> gcc/
>>> PR target/110411
>>> * config/rs6000/rs6000.h (enum rs6000_builtin_type_index): Add
>>> RS6000_BTI_INTPTI.
>>> * config/rs6000/rs6000-builtin.cc (rs6000_init_builtins): Add node
>>> for PTImode type.
>>>
>>> gcc/testsuite/
>>> PR target/106895
>>> * gcc.target/powerpc/pr106895.c: New testcase.
>>>
>>> diff --git a/gcc/config/rs6000/rs6000-builtin.cc 
>>> b/gcc/config/rs6000/rs6000-builtin.cc
>>> index 6698274031b..f553c72779e 100644
>>> --- a/gcc/config/rs6000/rs6000-builtin.cc
>>> +++ b/gcc/config/rs6000/rs6000-builtin.cc
>>> @@ -756,6 +756,15 @@ rs6000_init_builtins (void)
>>>else
>>>  ieee128_float_type_node = NULL_TREE;
>>>  
>>> +  /* PTImode to get even/odd register pairs.  */
>>> +  intPTI_type_internal_node = make_node(INTEGER_TYPE);
>>> +  TYPE_PRECISION (intPTI_type_internal_node) = GET_MODE_BITSIZE (PTImode);
>>> +  layout_type (intPTI_type_internal_node);
>>> +  SET_TYPE_MODE (intPTI_type_internal_node, PTImode);
>>> +  t = build_qualified_type (intPTI_type_internal_node, TYPE_QUAL_CONST);
>>> +  lang_hooks.types.register_builtin_type (intPTI_type_internal_node,
>>> + "__dummypti");
>>> +
>>>/* Vector pair and vector quad support.  */
>>>vector_pair_type_node = make_node (OPAQUE_TYPE);
>>>SET_TYPE_MODE (vector_pair_type_node, OOmode);
>>> diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
>>> index 68bc45d65ba..b6078077b20 100644
>>> --- a/gcc/config/rs6000/rs6000.h
>>> +++ b/gcc/config/rs6000/rs6000.h
>>> @@ -2302,6 +2302,7 @@ enum rs6000_builtin_type_index
>>>RS6000_BTI_ptr_vector_quad,
>>>RS6000_BTI_ptr_long_long,
>>>RS6000_BTI_ptr_long_long_unsigned,
>>> +  RS6000_BTI_INTPTI,
>>>RS6000_BTI_MAX
>>>  };
>>>  
>>> @@ -2346,6 +2347,7 @@ enum rs6000_builtin_type_index
>>>  #define uintDI_type_internal_node   
>>> (rs6000_builtin_types[RS6000_BTI_UINTDI])
>>>  #define intTI_type_internal_node
>>> (rs6000_builtin_types[RS6000_BTI_INTTI])
>>>  #define uintTI_type_internal_node   
>>> (rs6000_builtin_types[RS6000_BTI_UINTTI])
>>> +#define intPTI_type_internal_node   
>>> (rs6000_builtin_types[RS6000_BTI_INTPTI])
>>>  #define float_type_internal_node
>>> (rs6000_builtin_types[RS6000_BTI_float])
>>>  #define double_type_internal_node   
>>> (rs6000_builtin_types[RS6000_BTI_double])
>>>  #define long_double_type_internal_node  
>>> (rs6000_builtin_types[RS6000_BTI_long_double])
>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106895.c 
>>> b/gcc/testsuite/gcc.target/powerpc/pr106895.c
>>> new file mode 100644
>>> index 000..56547b7fa9d
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr106895.c
>>> @@ -0,0 +1,15 @@
>>> +/* PR target/106895 */
>>> +/* { dg-require-effective-target int128 } */
>>> +/* { dg-options "-O2" } */
>>> +
>>> +/* Verify the following generates even/odd register pairs.  */
>>> +
>>> +typedef __int128 pti __attribute__((mode(PTI)));
>>> +
>>> +void
>>> +set128 (pti val, pti *mem)
>>> +{
>>> +asm("stq %1,%0" : "=m"(*mem) : "r"(val));
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "stq \[123\]?\[02468\]" } } */
>>>
>>>


Re: [PATCH 0/2] arm, doloop: Add support for MVE Tail-Predicated Low Overhead Loops

2024-06-19 Thread Richard Earnshaw (lists)
On 23/05/2024 15:37, Andre Vieira wrote:
> 
> Hi,
> 
>   We held these two patches back in stage 4 because they touched 
> target-agnostic code, though I am quite confident they will not affect other 
> targets. Given stage one has reopened, I am reposting them, I rebased them 
> but they seem to apply cleanly on trunk.
>   No changes from previously reviewed patches.
> 
>   OK for trunk?
> 
> Andre Vieira (2):
>   doloop: Add support for predicated vectorized loops
>   arm: Add support for MVE Tail-Predicated Low Overhead Loops
> 
>  gcc/config/arm/arm-protos.h   |4 +-
>  gcc/config/arm/arm.cc | 1249 -
>  gcc/config/arm/arm.opt|3 +
>  gcc/config/arm/iterators.md   |   15 +
>  gcc/config/arm/mve.md |   50 +
>  gcc/config/arm/thumb2.md  |  138 +-
>  gcc/config/arm/types.md   |6 +-
>  gcc/config/arm/unspecs.md |   14 +-
>  gcc/df-core.cc|   15 +
>  gcc/df.h  |1 +
>  gcc/loop-doloop.cc|  164 ++-
>  gcc/testsuite/gcc.target/arm/lob.h|  128 +-
>  gcc/testsuite/gcc.target/arm/lob1.c   |   23 +-
>  gcc/testsuite/gcc.target/arm/lob6.c   |8 +-
>  .../gcc.target/arm/mve/dlstp-compile-asm-1.c  |  146 ++
>  .../gcc.target/arm/mve/dlstp-compile-asm-2.c  |  749 ++
>  .../gcc.target/arm/mve/dlstp-compile-asm-3.c  |   46 +
>  .../gcc.target/arm/mve/dlstp-int16x8-run.c|   44 +
>  .../gcc.target/arm/mve/dlstp-int16x8.c|   31 +
>  .../gcc.target/arm/mve/dlstp-int32x4-run.c|   45 +
>  .../gcc.target/arm/mve/dlstp-int32x4.c|   31 +
>  .../gcc.target/arm/mve/dlstp-int64x2-run.c|   48 +
>  .../gcc.target/arm/mve/dlstp-int64x2.c|   28 +
>  .../gcc.target/arm/mve/dlstp-int8x16-run.c|   44 +
>  .../gcc.target/arm/mve/dlstp-int8x16.c|   32 +
>  .../gcc.target/arm/mve/dlstp-invalid-asm.c|  521 +++
>  26 files changed, 3434 insertions(+), 149 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
> 

These are OK.  Please watch out for any fallout on other architectures.

R.


[PATCH V2] rs6000: load high and low part of 128bit vector independently [PR110040]

2024-06-19 Thread jeevitha
Hi All,

Updated the patch based on review comments. This patch passed bootstrap
and regression testing on powerpc64le-linux with no regressions.

PR110040 exposes an issue concerning moves from vector registers to GPRs.
There are two moves, one for upper 64 bits and the other for the lower
64 bits.  In the problematic test case, we are only interested in storing
the lower 64 bits.  However, the instruction for copying the upper 64 bits
is still emitted and is dead code.  This patch adds a splitter that splits
apart the two move instructions so that DCE can remove the dead code after
splitting.

2024-06-19  Jeevitha Palanisamy  

gcc/
PR target/110040
* config/rs6000/vsx.md (split pattern for V1TI to DI move): Defined.

gcc/testsuite/
PR target/110040
* gcc.target/powerpc/pr110040-1.c: New testcase.
* gcc.target/powerpc/pr110040-2.c: New testcase.


diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index f135fa079bd..f1979815df6 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -6706,3 +6706,20 @@
   "vmsumcud %0,%1,%2,%3"
   [(set_attr "type" "veccomplex")]
 )
+
+(define_split
+  [(set (match_operand:V1TI 0 "gpc_reg_operand")
+   (match_operand:V1TI 1 "vsx_register_operand"))]
+  "reload_completed
+   && TARGET_DIRECT_MOVE_64BIT
+   && int_reg_operand (operands[0], V1TImode)
+   && vsx_register_operand (operands[1], V1TImode)"
+   [(pc)]
+{
+  rtx src_op = gen_rtx_REG (V2DImode, REGNO (operands[1]));
+  rtx dest_op0 = gen_rtx_REG (DImode, REGNO (operands[0]));
+  rtx dest_op1 = gen_rtx_REG (DImode, REGNO (operands[0]) + 1);
+  emit_insn (gen_vsx_extract_v2di (dest_op0, src_op, const0_rtx));
+  emit_insn (gen_vsx_extract_v2di (dest_op1, src_op, const1_rtx));
+  DONE;
+})
diff --git a/gcc/testsuite/gcc.target/powerpc/pr110040-1.c 
b/gcc/testsuite/gcc.target/powerpc/pr110040-1.c
new file mode 100644
index 000..0a521e9e51d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr110040-1.c
@@ -0,0 +1,15 @@
+/* PR target/110040 */
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target powerpc_vsx } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */
+
+#include 
+
+void
+foo (signed long *dst, vector signed __int128 src)
+{
+  *dst = (signed long) src[0];
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr110040-2.c 
b/gcc/testsuite/gcc.target/powerpc/pr110040-2.c
new file mode 100644
index 000..d2ef471d666
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr110040-2.c
@@ -0,0 +1,16 @@
+/* PR target/110040 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mdejagnu-cpu=power10" } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target powerpc_vsx } */
+/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */
+
+/* Note: __builtin_altivec_tr_stxvrwx requires the -mcpu=power10 option */
+
+#include 
+
+void
+foo (signed int *dst, vector signed __int128 src)
+{
+  __builtin_vec_xst_trunc (src, 0, dst);
+}




Re: [PATCH 3/8] vect: Use one reduction_type local variable

2024-06-19 Thread Richard Biener
On Sun, Jun 16, 2024 at 9:23 AM Feng Xue OS  wrote:
>
> Two local variables were defined to refer same STMT_VINFO_REDUC_TYPE, better
> to keep only one.

OK.

Richard.

> Thanks,
> Feng
>
> ---
> gcc/
> * tree-vect-loop.cc (vectorizable_reduction): Remove v_reduc_type, and
> replace it to another local variable reduction_type.
> ---
>  gcc/tree-vect-loop.cc | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 6e8b3639daf..0f7b125e72d 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7868,10 +7868,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>if (lane_reducing)
>  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
>
> -  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
> -  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
> +  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
> +  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
>/* If we have a condition reduction, see if we can simplify it further.  */
> -  if (v_reduc_type == COND_REDUCTION)
> +  if (reduction_type == COND_REDUCTION)
>  {
>if (slp_node && SLP_TREE_LANES (slp_node) != 1)
> return false;
> @@ -8038,7 +8038,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>
>STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>
> -  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> +  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>if (reduction_type == TREE_CODE_REDUCTION)
>  {
>/* Check whether it's ok to change the order of the computation.
> --
> 2.17.1


Re: [PATCH 2/8] vect: Remove duplicated check on reduction operand

2024-06-19 Thread Richard Biener
On Sun, Jun 16, 2024 at 9:22 AM Feng Xue OS  wrote:
>
> In vectorizable_reduction, one check on a reduction operand via index could be
> contained by another one check via pointer, so remove the former.

OK.

Thanks,
Richard.

> Thanks,
> Feng
>
> ---
> gcc/
> * tree-vect-loop.cc (vectorizable_reduction): Remove the duplicated
> check.
> ---
>  gcc/tree-vect-loop.cc | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index d9a2ad69484..6e8b3639daf 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7815,11 +7815,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  "use not simple.\n");
>   return false;
> }
> -  if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> -   continue;
>
> -  /* For an IFN_COND_OP we might hit the reduction definition operand
> -twice (once as definition, once as else).  */
> +  /* Skip reduction operands, and for an IFN_COND_OP we might hit the
> +reduction operand twice (once as definition, once as else).  */
>if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> continue;
>
> --
> 2.17.1


Re: [PATH 1/8] vect: Add a function to check lane-reducing stmt

2024-06-19 Thread Richard Biener
On Sun, Jun 16, 2024 at 9:21 AM Feng Xue OS  wrote:
>
> The series of patches are meant to support multiple lane-reducing reduction 
> statements. Since the original ones conflicted with the new single-lane slp 
> node patches, I have reworked most of the patches, and split them as small as 
> possible, which may make code review easier.

Thanks for that - as always please feel free to push approved parts of
the series if dependences allow.

> In the 1st one, I add a utility function to check if a statement is 
> lane-reducing operation,
> which could simplify some existing code.

OK.

Thanks,
Richard.

> Thanks,
> Feng
>
> ---
> gcc/
> * tree-vectorizer.h (lane_reducing_stmt_p): New function.
> * tree-vect-slp.cc (vect_analyze_slp): Use new function
> lane_reducing_stmt_p to check statement.
> ---
>  gcc/tree-vect-slp.cc  |  4 +---
>  gcc/tree-vectorizer.h | 12 
>  2 files changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 7e3d0107b4e..b4ea2e18f00 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -3919,7 +3919,6 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
> max_tree_size)
>   scalar_stmts.create (loop_vinfo->reductions.length ());
>   for (auto next_info : loop_vinfo->reductions)
> {
> - gassign *g;
>   next_info = vect_stmt_to_vectorize (next_info);
>   if ((STMT_VINFO_RELEVANT_P (next_info)
>|| STMT_VINFO_LIVE_P (next_info))
> @@ -3931,8 +3930,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
> max_tree_size)
> {
>   /* Do not discover SLP reductions combining lane-reducing
>  ops, that will fail later.  */
> - if (!(g = dyn_cast  (STMT_VINFO_STMT 
> (next_info)))
> - || !lane_reducing_op_p (gimple_assign_rhs_code (g)))
> + if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
> scalar_stmts.quick_push (next_info);
>   else
> {
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 6bb0f5c3a56..60224f4e284 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2169,12 +2169,24 @@ vect_apply_runtime_profitability_check_p 
> (loop_vec_info loop_vinfo)
>   && th >= vect_vf_for_cost (loop_vinfo));
>  }
>
> +/* Return true if CODE is a lane-reducing opcode.  */
> +
>  inline bool
>  lane_reducing_op_p (code_helper code)
>  {
>return code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR;
>  }
>
> +/* Return true if STMT is a lane-reducing statement.  */
> +
> +inline bool
> +lane_reducing_stmt_p (gimple *stmt)
> +{
> +  if (auto *assign = dyn_cast  (stmt))
> +return lane_reducing_op_p (gimple_assign_rhs_code (assign));
> +  return false;
> +}
> +
>  /* Source location + hotness information. */
>  extern dump_user_location_t vect_location;
>
> --
> 2.17.1


[PATCH v2] RISC-V: Remove float vector eqne pattern

2024-06-19 Thread demin.han
We can unify eqne and other comparison operations.

Tested on RV32 and RV64

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc: Remove eqne cond
* config/riscv/vector.md (@pred_eqne_scalar): Remove patterns
(*pred_eqne_scalar_merge_tie_mask): Ditto
(*pred_eqne_scalar): Ditto
(*pred_eqne_scalar_narrow): Ditto

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-cmp-eqne.c: New test.

Signed-off-by: demin.han 
---

v2 changes:
  1. add test

  Only intrinsics utilize those removed vf patterns.
  Auto vectorization uses the vv format now.
  NaNs will be optimized out before expand in autovec, as I tested.

 .../riscv/riscv-vector-builtins-bases.cc  |  4 -
 gcc/config/riscv/vector.md| 86 ---
 .../riscv/rvv/base/float-point-cmp-eqne.c | 54 
 3 files changed, 54 insertions(+), 90 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-cmp-eqne.c

diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index b6f6e4ff37e..d414721ede8 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -1420,10 +1420,6 @@ public:
 switch (e.op_info->op)
   {
case OP_TYPE_vf: {
- if (CODE == EQ || CODE == NE)
-   return e.use_compare_insn (CODE, code_for_pred_eqne_scalar (
-  e.vector_mode ()));
- else
return e.use_compare_insn (CODE, code_for_pred_cmp_scalar (
   e.vector_mode ()));
}
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index fbcdf96f038..f8fae6557d9 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -7545,92 +7545,6 @@ (define_insn "*pred_cmp_scalar_narrow"
(set_attr "mode" "<MODE>")
(set_attr "spec_restriction" "none,thv,thv,none,none")])
 
-(define_expand "@pred_eqne<mode>_scalar"
-  [(set (match_operand: 0 "register_operand")
-   (if_then_else:
- (unspec:
-   [(match_operand: 1 "vector_mask_operand")
-(match_operand 6 "vector_length_operand")
-(match_operand 7 "const_int_operand")
-(match_operand 8 "const_int_operand")
-(reg:SI VL_REGNUM)
-(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (match_operator: 3 "equality_operator"
-[(vec_duplicate:V_VLSF
-   (match_operand: 5 "register_operand"))
- (match_operand:V_VLSF 4 "register_operand")])
- (match_operand: 2 "vector_merge_operand")))]
-  "TARGET_VECTOR"
-  {})
-
-(define_insn "*pred_eqne<mode>_scalar_merge_tie_mask"
-  [(set (match_operand: 0 "register_operand"  "=vm")
-   (if_then_else:
- (unspec:
-   [(match_operand: 1 "register_operand" "  0")
-(match_operand 5 "vector_length_operand" " rK")
-(match_operand 6 "const_int_operand" "  i")
-(match_operand 7 "const_int_operand" "  i")
-(reg:SI VL_REGNUM)
-(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (match_operator: 2 "equality_operator"
-[(vec_duplicate:V_VLSF
-   (match_operand: 4 "register_operand" "  f"))
- (match_operand:V_VLSF 3 "register_operand"  " vr")])
- (match_dup 1)))]
-  "TARGET_VECTOR"
-  "vmf%B2.vf\t%0,%3,%4,v0.t"
-  [(set_attr "type" "vfcmp")
-   (set_attr "mode" "<MODE>")
-   (set_attr "merge_op_idx" "1")
-   (set_attr "vl_op_idx" "5")
-   (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])"))
-   (set (attr "avl_type_idx") (const_int 7))])
-
-;; We don't use early-clobber for LMUL <= 1 to get better codegen.
-(define_insn "*pred_eqne<mode>_scalar"
-  [(set (match_operand: 0 "register_operand""=vr,   vr,   
&vr,   &vr")
-   (if_then_else:
- (unspec:
-   [(match_operand: 1 "vector_mask_operand"  
"vmWc1,vmWc1,vmWc1,vmWc1")
-(match_operand 6 "vector_length_operand" "   rK,   rK,   
rK,   rK")
-(match_operand 7 "const_int_operand" "i,i,
i,i")
-(match_operand 8 "const_int_operand" "i,i,
i,i")
-(reg:SI VL_REGNUM)
-(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (match_operator: 3 "equality_operator"
-[(vec_duplicate:V_VLSF
-   (match_operand: 5 "register_operand" "f,f,
f,f"))
- (match_operand:V_VLSF 4 "register_operand"  "   vr,   vr,   
vr,   vr")])
- (match_operand: 2 "vector_merge_operand""   vu,0,
vu,0")))]
-  "TARGET_VECTOR && riscv_vector::cmp_lmul_le_one (<MODE>mode)"
-  "vmf%B3.vf\t%0,%4,%5%p1"
-  [(set_attr "type" "vfcmp")
-   (set_attr "mode" "<MODE>")
-   (set_attr "spec_restriction" "thv,thv

  1   2   >