Re: [PATCH 09/10] tree-vect-slp-patterns.cc: add 'final' and 'override' to vect_pattern::build impls

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, May 23, 2022 at 9:36 PM David Malcolm via Gcc-patches
 wrote:
>
> gcc/ChangeLog:
> * tree-vect-slp-patterns.cc: Add "final" and "override" to
> vect_pattern::build impls as appropriate.

OK.

> Signed-off-by: David Malcolm 
> ---
>  gcc/tree-vect-slp-patterns.cc | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/tree-vect-slp-patterns.cc b/gcc/tree-vect-slp-patterns.cc
> index a6b0d106d5f..e6a6db8beba 100644
> --- a/gcc/tree-vect-slp-patterns.cc
> +++ b/gcc/tree-vect-slp-patterns.cc
> @@ -492,7 +492,7 @@ class complex_pattern : public vect_pattern
>  }
>
>public:
> -void build (vec_info *);
> +void build (vec_info *) override;
>
>  static internal_fn
>  matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree 
> *,
> @@ -595,7 +595,7 @@ class complex_add_pattern : public complex_pattern
>  }
>
>public:
> -void build (vec_info *);
> +void build (vec_info *) final override;
>  static internal_fn
>  matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
>  slp_compat_nodes_map_t *, slp_tree *, vec *);
> @@ -977,7 +977,7 @@ class complex_mul_pattern : public complex_pattern
>  }
>
>public:
> -void build (vec_info *);
> +void build (vec_info *) final override;
>  static internal_fn
>  matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
>  slp_compat_nodes_map_t *, slp_tree *, vec *);
> @@ -1204,7 +1204,7 @@ class complex_fms_pattern : public complex_pattern
>  }
>
>public:
> -void build (vec_info *);
> +void build (vec_info *) final override;
>  static internal_fn
>  matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
>  slp_compat_nodes_map_t *, slp_tree *, vec *);
> @@ -1380,7 +1380,7 @@ class complex_operations_pattern : public 
> complex_pattern
>  }
>
>public:
> -void build (vec_info *);
> +void build (vec_info *) final override;
>  static internal_fn
>  matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
>  slp_compat_nodes_map_t *, slp_tree *, vec *);
> @@ -1446,7 +1446,7 @@ class addsub_pattern : public vect_pattern
>  addsub_pattern (slp_tree *node, internal_fn ifn)
> : vect_pattern (node, NULL, ifn) {};
>
> -void build (vec_info *);
> +void build (vec_info *) final override;
>
>  static vect_pattern*
>  recognize (slp_tree_to_load_perm_map_t *, slp_compat_nodes_map_t *,
> --
> 2.26.3
>


Re: [PATCH v2] DSE: Use the constant store source if possible

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, May 23, 2022 at 8:34 PM H.J. Lu  wrote:
>
> On Mon, May 23, 2022 at 12:38:06PM +0200, Richard Biener wrote:
> > On Sat, May 21, 2022 at 5:02 AM H.J. Lu via Gcc-patches
> >  wrote:
> > >
> > > When recording store for RTL dead store elimination, check if the source
> > > register is set only once to a constant.  If yes, record the constant
> > > as the store source.  It eliminates unrolled zero stores after memset 0
> > > in a loop where a vector register is used as the zero store source.
> > >
> > > gcc/
> > >
> > > PR rtl-optimization/105638
> > > * dse.cc (record_store): Use the constant source if the source
> > > register is set only once.
> > >
> > > gcc/testsuite/
> > >
> > > PR rtl-optimization/105638
> > > * g++.target/i386/pr105638.C: New test.
> > > ---
> > >  gcc/dse.cc   | 19 ++
> > >  gcc/testsuite/g++.target/i386/pr105638.C | 44 
> > >  2 files changed, 63 insertions(+)
> > >  create mode 100644 gcc/testsuite/g++.target/i386/pr105638.C
> > >
> > > diff --git a/gcc/dse.cc b/gcc/dse.cc
> > > index 30c11cee034..0433dd3d846 100644
> > > --- a/gcc/dse.cc
> > > +++ b/gcc/dse.cc
> > > @@ -1508,6 +1508,25 @@ record_store (rtx body, bb_info_t bb_info)
> > >
> > >   if (tem && CONSTANT_P (tem))
> > > const_rhs = tem;
> > > + else
> > > +   {
> > > + /* If RHS is set only once to a constant, set CONST_RHS
> > > +to the constant.  */
> > > + df_ref def = DF_REG_DEF_CHAIN (REGNO (rhs));
> > > + if (def != nullptr
> > > + && !DF_REF_IS_ARTIFICIAL (def)
> > > + && !DF_REF_NEXT_REG (def))
> > > +   {
> > > + rtx_insn *def_insn = DF_REF_INSN (def);
> > > + rtx def_body = PATTERN (def_insn);
> > > + if (GET_CODE (def_body) == SET)
> > > +   {
> > > + rtx def_src = SET_SRC (def_body);
> > > + if (CONSTANT_P (def_src))
> > > +   const_rhs = def_src;
> >
> > doesn't DSE have its own tracking of stored values?  Shouldn't we
>
> It tracks stored values only within the basic block.  When RTL loop
> invariant motion hoists a constant initialization out of the loop into
> a separate basic block, the constant store value becomes unknown
> within the original basic block.
>
> > improve _that_ if it is not enough?  I also wonder if you need to
>
> My patch extends DSE stored value tracking to include the constant which
> is set only once in another basic block.
>
> > verify the SET isn't partial?
> >
>
> Here is the v2 patch to check that the constant is set by a non-partial
> unconditional load.
>
> OK for master?
>
> Thanks.
>
> H.J.
> ---
> RTL DSE tracks redundant constant stores within a basic block.  When RTL
> loop invariant motion hoists a constant initialization out of the loop
> into a separate basic block, the constant store value becomes unknown
> within the original basic block.  When recording store for RTL DSE, check
> if the source register is set only once to a constant by a non-partial
> unconditional load.  If yes, record the constant as the constant store
> source.  It eliminates unrolled zero stores after memset 0 in a loop
> where a vector register is used as the zero store source.
>
> gcc/
>
> PR rtl-optimization/105638
> * dse.cc (record_store): Use the constant source if the source
> register is set only once.
>
> gcc/testsuite/
>
> PR rtl-optimization/105638
> * g++.target/i386/pr105638.C: New test.
> ---
>  gcc/dse.cc   | 22 
>  gcc/testsuite/g++.target/i386/pr105638.C | 44 
>  2 files changed, 66 insertions(+)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr105638.C
>
> diff --git a/gcc/dse.cc b/gcc/dse.cc
> index 30c11cee034..af8e88dac32 100644
> --- a/gcc/dse.cc
> +++ b/gcc/dse.cc
> @@ -1508,6 +1508,28 @@ record_store (rtx body, bb_info_t bb_info)
>
>   if (tem && CONSTANT_P (tem))
> const_rhs = tem;
> + else
> +   {
> + /* If RHS is set only once to a constant, set CONST_RHS
> +to the constant.  */
> + df_ref def = DF_REG_DEF_CHAIN (REGNO (rhs));
> + if (def != nullptr
> + && !DF_REF_IS_ARTIFICIAL (def)
> + && !(DF_REF_FLAGS (def)
> +  & (DF_REF_PARTIAL | DF_REF_CONDITIONAL))
> + && !DF_REF_NEXT_REG (def))

Can we really use df-chain here and rely that a single definition is
the only one?  If rhs is a hardreg does df-chain include implicit
sets of function argument registers for example?  Don't we need RD
here or at least verify the single df-chain definition dominates the
use here (if we can rely on the reg otherwise be uninitialized and thus
the use invoking undefined beh

Re: [ping2][PATCH 0/8][RFC] Support BTF decl_tag and type_tag annotations

2022-05-23 Thread Yonghong Song via Gcc-patches




On 5/11/22 11:44 AM, David Faust wrote:



On 5/10/22 22:05, Yonghong Song wrote:



On 5/10/22 8:43 PM, Yonghong Song wrote:



On 5/6/22 2:18 PM, David Faust wrote:



On 5/5/22 16:00, Yonghong Song wrote:



On 5/4/22 10:03 AM, David Faust wrote:



On 5/3/22 15:32, Joseph Myers wrote:

On Mon, 2 May 2022, David Faust via Gcc-patches wrote:


Consider the following example:

  #define __typetag1 __attribute__((btf_type_tag("tag1")))
  #define __typetag2 __attribute__((btf_type_tag("tag2")))
  #define __typetag3 __attribute__((btf_type_tag("tag3")))

  int __typetag1 * __typetag2 __typetag3 * g;

The expected behavior is that 'g' is "a pointer with tags 'tag2' 
and

'tag3',
to a pointer with tag 'tag1' to an int". i.e.:


That's not a correct expectation for either GNU __attribute__ or
C2x [[]]
attribute syntax.  In either syntax, __typetag2 __typetag3 should
apply to
the type to which g points, not to g or its type, just as if you 
had a
type qualifier there.  You'd need to put the attributes (or 
qualifier)
after the *, not before, to make them apply to the pointer type.  
See
"Attribute Syntax" in the GCC manual for how the syntax is 
defined for

GNU
attributes and deduce in turn, for each subsequence of the tokens
matching
the syntax for some kind of declarator, what the type for "T D1"
would be
as defined there and in the C standard, as deduced from the type for
"T D"
for a sub-declarator D.
   >> But GCC's attribute parsing produces a variable 'g' which 
is "a

pointer with
tag 'tag1' to a pointer with tags 'tag2' and 'tag3' to an int", 
i.e.


In GNU syntax, __typetag1 applies to the declaration, whereas in C2x
syntax it applies to int.  Again, if you wanted it to apply to the
pointer
type it would need to go after the * not before.

If you are concerned with the fine details of what construct an
attribute
appertains to, I recommend using C2x syntax not GNU syntax.



Joseph, thank you! This is very helpful. My understanding of the 
syntax

was not correct.

(Actually, I made a bad mistake in paraphrasing this example from the
discussion of it in the series cover letter. But, the reason why 
it is

incorrect is the same.)


Yonghong, is the specific ordering an expectation in BPF programs or
other users of the tags?


This is probably a language writing issue. We are saying tags only
apply to pointer. We probably should say it only apply to pointee.

$ cat t.c
int const *ptr;

the llvm ir debuginfo:

!5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64)
!6 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !7)
!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)

We could replace 'const' with a tag like below:

int __attribute__((btf_type_tag("tag"))) *ptr;

!5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64,
annotations: !7)
!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!7 = !{!8}
!8 = !{!"btf_type_tag", !"tag"}

In the above IR, we generate annotations to pointer_type because
we didn't invent a new DI type for encode btf_type_tag. But it is
totally okay to have IR looks like

!5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
!11 = !DIBtfTypeTagType(..., baseType: !6, name: !"Tag")
!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)


OK, thanks.

There is still the question of why the DWARF generated for this case
that I have been concerned about:

    int __typetag1 * __typetag2 __typetag3 * g;

differs between GCC (with this series) and clang. After studying it,
GCC is doing with the attributes exactly as is described in the
Attribute Syntax portion of the GCC manual where the GNU syntax is
described. I do not think there is any problem here.

So the difference in DWARF suggests to me that clang is not handling
the GNU attribute syntax in this particular case correctly, since it
seems to be associating __typetag2 and __typetag3 to g's type rather
than the type to which it points.

I am not sure whether for the use purposes of the tags this difference
is very important, but it is worth noting.


As Joseph suggested, it may be better to encourage users of these tags
to use the C2x attribute syntax if they are concerned with precisely
which construct the tag applies.

This would also be a way around any issues in handling the attributes
due to the GNU syntax.

I tried a few test cases using C2x syntax BTF type tags with a
clang-15 build, but ran into some issues (in particular, some of the
tag attributes being ignored altogether). I couldn't find confirmation
whether C2x attribute syntax is fully supported in clang yet, so maybe
this isn't expected to work. Do you know whether the C2x syntax is
fully supported in clang yet?


Actually, I don't know either. But since the btf decl_tag and type_tag
are also used to compile linux kernel and the minimum compiler version
to compile kernel is gcc5.1 and clang11. I am not sure whether gcc5.1
supports c2x or not, I guess probably not.

Re: [PATCH] middle-end/105604 - snprintf diagnostics and non-constant sizes/offsets

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, 23 May 2022, Martin Sebor wrote:

> On 5/19/22 05:39, Richard Biener wrote:
> > On Wed, 18 May 2022, Martin Sebor wrote:
> > 
> >> On 5/18/22 00:26, Richard Biener wrote:
> >>> On Tue, 17 May 2022, Martin Sebor wrote:
> >>>
>  On 5/16/22 03:16, Richard Biener wrote:
> > The following tries to correct get_origin_and_offset_r not handling
> > non-constant sizes of array elements in ARRAY_REFs and non-constant
> > offsets of COMPONENT_REFs.  It isn't exactly clear how such failures
> > should be treated in this API and existing handling isn't consistent
> > here either.  The following applies two different variants, treating
> > non-constant array sizes like non-constant array indices and
> > treating non-constant offsets of COMPONENT_REFs by terminating
> > the recursion (not sure what that means to the callers).
> >
> > Basically the code failed to use component_ref_field_offset and
> > array_ref_element_size and instead relies on inappropriate
> > helpers (that shouldn't exist in the first place ...).  The code
> > is also not safe-guarded against overflows in the final offset/size
> > computations but I'm not trying to rectify that.
> >
> > Martin - can you comment on how the API should handle such
> > situations?
> 
>  It looks like the -Wrestrict warning here ignores offsets equal to
>  HOST_WIDE_INT_MIN so presumably setting dst_offset (via *fldoff) to
>  that should avoid it.  Or maybe to HWI_MAX as it does for variable
>  offsets.
> >>>
> >>> Can you suggest wording for the function comment as to how it handles
> >>> the case when offset or size cannot be determined exactly?   The
> >>> comment currently only suggests that the caller possibly cannot
> >>> trust fldsize or off when the function returns NULL but the actual
> >>> implementation differs from that.
> >>
> >>
> >>
> >>>
>  It also looks like the function only handles constant offsets and
>  sizes, and I have a vague recollection of enhancing it to work with
>  ranges.  That should avoid the overflow problem too.
> >>>
> >>> So the correct thing is to return NULL?
> >>
> >> No, I don't think so.  The recursive get_origin_and_offset_r() assumes
> >> its own invocations never return null (the one place it does that should
> >> probably be moved to the nonrecursive caller).
> >>
> >>>
> >>> Is the patch OK as-is?
> >>
> >> It's an improvement but it's not complete as the following also ICEs
> >> (albeit somewhere else):
> >>
> >> void* f (void);
> >>
> >> void g (int n)
> >> {
> >>struct {
> >>  char a[n], b[];
> >>} *p = f ();
> >>
> >>__builtin_sprintf (p->b, "%s", p->a);
> >> }
> >>
> >> With the ICE fixed the warning triggers.  That's not ideal but it's
> >> unavoidable given the IR (I believe I mentioned this caveat some time
> >> back).  This is the same as for:
> >>
> >>struct {
> >>  char a[8], b[8];
> >>} *p = f ();
> >>
> >>__builtin_sprintf (&p->b[n], "%s", p->a);
> >>
> >> because the IR looks more or less the same for &p->a[n] as it is for
> >> &p->b[n].
> >>
> >>> As said, I'm not sure how the caller interprets
> >>> the result and how it can distinguish the exact vs. non-exact cases
> >>> or what a "conservative" inexact answer would be.
> >>
> >> The warning triggers in both the certain cases and the inexact
> >> ones like the one above when an overlap cannot be ruled out.  To
> >> differentiate the two it's phrased as "may overlap".  The handling
> >> is in maybe_warn_overlap().
> >>
> >>>
> >>> Please help properly documenting this API.
> >>
> >> I can spend some time in the next few days to page it all in, see
> >> if I can clean it up a bit in addition to fixing the ICEs and
> >> improve the comment.  Let me know if you have a different
> >> preference.
> > 
> > That works for me - thanks for taking it from here.
> Attached is a slightly enhanced patch that fixes both of the ICEs,
> improves the comments, and adds more tests.  I tested it on x86_64.
> Let me know if there's something else you'd like me to do here.

Looks good to me!

Thanks for fixing.
Richard.


Re: [PATCH] x86: Avoid uninitialized variable in PR target/104441 test

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, 23 May 2022, H.J. Lu wrote:

>   PR target/104441
>   * gcc.target/i386/pr104441-1a.c (load8bit_4x4_avx2): Initialize
>   src23.

OK.

Thanks for fixing.
Richard.

> ---
>  gcc/testsuite/gcc.target/i386/pr104441-1a.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1a.c 
> b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> index 83734f710bd..0931029f2bb 100644
> --- a/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> +++ b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> @@ -8,7 +8,7 @@ __attribute__((always_inline, target("avx2")))
>  static __m256i
>  load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
>  {
> -  __m128i src01, src23;
> +  __m128i src01, src23 = _mm_setzero_si128();
>src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
>src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
>return _mm256_setr_m128i(src01, src23);
> 


Re: [PATCH v3] x86: Document -mcet-switch

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, 23 May 2022, H.J. Lu wrote:

> When -fcf-protection=branch is used, the compiler will generate jump
> tables for switch statements where the indirect jump is prefixed with
> the NOTRACK prefix, so it can jump to non-ENDBR targets.  Since the
> indirect jump targets are generated by the compiler and stored in
> read-only memory, this does not result in a direct loss of hardening.
> But if the jump table index is attacker-controlled, the indirect jump
> may not be constrained by CET.
> 
> Document -mcet-switch to generate jump tables for switch statements with
> ENDBR and skip the NOTRACK prefix for indirect jump.  This option should
> be used when the NOTRACK prefix is disabled.

OK.

>   PR target/104816
>   * config/i386/i386.opt: Remove Undocumented.
>   * doc/invoke.texi: Document -mcet-switch.
> ---
>  gcc/config/i386/i386.opt |  2 +-
>  gcc/doc/invoke.texi  | 14 +-
>  2 files changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index a6b0e28f238..0dbaacb57ed 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1047,7 +1047,7 @@ Enable shadow stack built-in functions from 
> Control-flow Enforcement
>  Technology (CET).
>  
>  mcet-switch
> -Target Undocumented Var(flag_cet_switch) Init(0)
> +Target Var(flag_cet_switch) Init(0)
>  Turn on CET instrumentation for switch statements that use a jump table and
>  an indirect jump.
>  
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index d8095e3128f..1f38e91b50b 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -1425,7 +1425,8 @@ See RS/6000 and PowerPC Options.
>  -msse4a  -m3dnow  -m3dnowa  -mpopcnt  -mabm  -mbmi  -mtbm  -mfma4  -mxop @gol
>  -madx  -mlzcnt  -mbmi2  -mfxsr  -mxsave  -mxsaveopt  -mrtm  -mhle  -mlwp @gol
>  -mmwaitx  -mclzero  -mpku  -mthreads  -mgfni  -mvaes  -mwaitpkg @gol
> --mshstk -mmanual-endbr -mforce-indirect-call  -mavx512vbmi2 -mavx512bf16 
> -menqcmd @gol
> +-mshstk -mmanual-endbr -mcet-switch -mforce-indirect-call @gol
> +-mavx512vbmi2 -mavx512bf16 -menqcmd @gol
>  -mvpclmulqdq  -mavx512bitalg  -mmovdiri  -mmovdir64b  -mavx512vpopcntdq @gol
>  -mavx5124fmaps  -mavx512vnni  -mavx5124vnniw  -mprfchw  -mrdpid @gol
>  -mrdseed  -msgx -mavx512vp2intersect -mserialize -mtsxldtrk@gol
> @@ -32719,6 +32720,17 @@ function attribute. This is useful when used with 
> the option
>  @option{-fcf-protection=branch} to control ENDBR insertion at the
>  function entry.
>  
> +@item -mcet-switch
> +@opindex mcet-switch
> +By default, CET instrumentation is turned off on switch statements that
> +use a jump table and indirect branch track is disabled.  Since jump
> +tables are stored in read-only memory, this does not result in a direct
> +loss of hardening.  But if the jump table index is attacker-controlled,
> +the indirect jump may not be constrained by CET.  This option turns on
> +CET instrumentation to enable indirect branch track for switch statements
> +with jump tables which leads to the jump targets reachable via any indirect
> +jumps.
> +
>  @item -mcall-ms2sysv-xlogues
>  @opindex mcall-ms2sysv-xlogues
>  @opindex mno-call-ms2sysv-xlogues
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)


[PATCH, committed] testsuite/rs6000: Adjust gcc.target/powerpc/pr78604.c [PR105706]

2022-05-23 Thread Kewen.Lin via Gcc-patches
Hi,

Tested on powerpc64le-linux-gnu P8.

Pushed this as r13-721-g8fa8bca9f53fcfdedc2b4fa55093dbd1ab7abbd1.

BR,
Kewen
-
Commit r13-707 adjusts the below gimple:

  iftmp.7_4 = _1 < _2 ? val2_7(D) : val1_8(D);

to

  _3 = _1 >= _2;
  iftmp.7_4 = _3 ? val1_8(D) : val2_7(D);

and result in one more vect_model_simple_cost dumping for each
function.  Need to adjust the match count accordingly.

PR testsuite/105706

gcc/testsuite/ChangeLog:

* gcc.target/powerpc/pr78604.c: Adjust.
---
 gcc/testsuite/gcc.target/powerpc/pr78604.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c 
b/gcc/testsuite/gcc.target/powerpc/pr78604.c
index 35bfdb35412..7a371af8c28 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr78604.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c
@@ -109,4 +109,6 @@ uns_gte (UNS_TYPE val1, UNS_TYPE val2)
 /* { dg-final { scan-assembler-times {\mvcmpgtsd\M} 4 } } */
 /* { dg-final { scan-assembler-times {\mvcmpgtud\M} 4 } } */
 /* { dg-final { scan-assembler-not   {\mvcmpequd\M} } } */
-/* { dg-final { scan-tree-dump-times "vect_model_simple_cost" 8 "vect" } } */
+/* For each function, one is for the comparison statement and the other
+   is for the condition statement which consumes the compared result.  */
+/* { dg-final { scan-tree-dump-times "vect_model_simple_cost" 16 "vect" } } */
--
2.27.0


[PATCH] tree-optimization/100221 - improve DSE a bit

2022-05-23 Thread Richard Biener via Gcc-patches
When facing multiple PHI defs and one feeding the other we can
postpone processing uses of one and thus can proceed.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2022-05-20  Richard Biener  

PR tree-optimization/100221
* tree-ssa-dse.cc (contains_phi_arg): New function.
(dse_classify_store): Postpone PHI defs that feed another PHI in defs.

* gcc.dg/tree-ssa/ssa-dse-44.c: New testcase.
* gcc.dg/tree-ssa/ssa-dse-45.c: Likewise.
---
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-44.c | 19 +
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-45.c | 24 +++
 gcc/tree-ssa-dse.cc| 46 +++---
 3 files changed, 84 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-44.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-45.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-44.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-44.c
new file mode 100644
index 000..aaec41d7bdf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-44.c
@@ -0,0 +1,19 @@
+/* { dg-do link } */
+/* { dg-options "-O -fdump-tree-dse1-details" } */
+
+extern void foo(void);
+int a, b;
+static int c;
+int main()
+{
+  if (c)
+foo ();
+  int *g = &c;
+  int **h = &g;
+  int ***h1 = &h;
+  if (a)
+while (b)
+  b = 0;
+}
+
+/* { dg-final { scan-tree-dump "Deleted dead store: g = &c;" "dse1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-45.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-45.c
new file mode 100644
index 000..fd92d7b599a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-45.c
@@ -0,0 +1,24 @@
+/* { dg-do link } */
+/* { dg-options "-O" } */
+
+extern void foo(void);
+int a, b;
+static int c;
+static void f() {
+  while (a)
+for (; b; b--)
+  ;
+}
+void i() {
+  if (c)
+foo();
+  int *g = &c;
+  {
+int **h[1] = {&g};
+f();
+  }
+}
+int main() {
+  i();
+  return 0;
+}
diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
index 881a2d0f98d..ea50de789b1 100644
--- a/gcc/tree-ssa-dse.cc
+++ b/gcc/tree-ssa-dse.cc
@@ -898,6 +898,17 @@ dse_optimize_redundant_stores (gimple *stmt)
 }
 }
 
+/* Return whether PHI contains ARG as an argument.  */
+
+static bool
+contains_phi_arg (gphi *phi, tree arg)
+{
+  for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
+if (gimple_phi_arg_def (phi, i) == arg)
+  return true;
+  return false;
+}
+
 /* A helper of dse_optimize_stmt.
Given a GIMPLE_ASSIGN in STMT that writes to REF, classify it
according to downstream uses and defs.  Sets *BY_CLOBBER_P to true
@@ -949,8 +960,8 @@ dse_classify_store (ao_ref *ref, gimple *stmt,
return DSE_STORE_LIVE;
 
   auto_vec defs;
-  gimple *first_phi_def = NULL;
-  gimple *last_phi_def = NULL;
+  gphi *first_phi_def = NULL;
+  gphi *last_phi_def = NULL;
   FOR_EACH_IMM_USE_STMT (use_stmt, ui, defvar)
{
  /* Limit stmt walking.  */
@@ -973,8 +984,8 @@ dse_classify_store (ao_ref *ref, gimple *stmt,
{
  defs.safe_push (use_stmt);
  if (!first_phi_def)
-   first_phi_def = use_stmt;
- last_phi_def = use_stmt;
+   first_phi_def = as_a  (use_stmt);
+ last_phi_def = as_a  (use_stmt);
}
}
  /* If the statement is a use the store is not dead.  */
@@ -1046,6 +1057,7 @@ dse_classify_store (ao_ref *ref, gimple *stmt,
  use_operand_p use_p;
  tree vdef = (gimple_code (def) == GIMPLE_PHI
   ? gimple_phi_result (def) : gimple_vdef (def));
+ gphi *phi_def;
  /* If the path to check starts with a kill we do not need to
 process it further.
 ???  With byte tracking we need only kill the bytes currently
@@ -1079,7 +1091,31 @@ dse_classify_store (ao_ref *ref, gimple *stmt,
   && bitmap_bit_p (visited,
SSA_NAME_VERSION
  (PHI_RESULT (use_stmt))
-   defs.unordered_remove (i);
+   {
+ defs.unordered_remove (i);
+ if (def == first_phi_def)
+   first_phi_def = NULL;
+ else if (def == last_phi_def)
+   last_phi_def = NULL;
+   }
+ /* If def is a PHI and one of its arguments is another PHI node still
+in consideration we can defer processing it.  */
+ else if ((phi_def = dyn_cast  (def))
+  && ((last_phi_def
+   && phi_def != last_phi_def
+   && contains_phi_arg (phi_def,
+gimple_phi_result (last_phi_def)))
+  || (first_phi_def
+  && phi_def != first_phi_def
+  && contains_phi_arg
+   (phi_def, g

Re: Back porting to GCC11/GCC12: Re: [patch][gcc13][i386][pr101891]Adjust -fzero-call-used-regs to always use XOR

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, 23 May 2022, Qing Zhao wrote:

> Hi,
> 
> I have added the patch to GCC11 and GCC12 in my local area and bootstrapped 
> and regress tested on both x86 and aarch64, no any issues.
> 
> Can I committed them to both GCC11 and GCC12 branches?

Yes.

Thanks,
Richard.

> Thanks.
> 
> 
> 
> 
> > On May 10, 2022, at 8:38 AM, Qing Zhao via Gcc-patches 
> >  wrote:
> >
> >
> >
> >> On May 10, 2022, at 1:12 AM, Richard Biener  wrote:
> >>
> >> On Mon, 9 May 2022, Uros Bizjak wrote:
> >>
> >>> On Mon, May 9, 2022 at 5:44 PM Qing Zhao  wrote:
> 
>  Another question:
> 
>  I think that this patch might need to be back ported to Gcc12 and GCC11.
> 
> > What's your opinion on this?
> >>>
> >>> It is not a regression, so following general rules, the patch should
> >>> not be backported. OTOH, the patch creates functionally equivalent
> >>> code, better in some security aspects. The functionality is also
> >>> hidden behind some non-default flag, so I think if release managers
> >>> (CC'd) are OK with the backport, I'd give it a technical approval.
> >>>
>  If so, when can I backport it?
> >>>
> >>> Let's keep it in the mainline for a week or two, before backporting it
> >>> to non-EoL branches.
> >>
> >> OK from my POV after a week or two on trunk.
> >
> > Sure, I will do the back porting after two weeks.
> >
> > thanks.
> >
> > Qing
> >>
> >> Richard.
> >>
> >>> Uros.
> >>>
> 
>  thanks.
> 
>  Qing
> 
> > On May 7, 2022, at 4:06 AM, Uros Bizjak  wrote:
> >
> > On Fri, May 6, 2022 at 6:42 PM Qing Zhao  wrote:
> >>
> >>
> >>
> >>> On May 6, 2022, at 10:58 AM, Uros Bizjak  wrote:
> >>>
> >>> On Fri, May 6, 2022 at 4:29 PM Qing Zhao  wrote:
> 
>  Hi,
> 
> > As Kees requested in this PR: 
>  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101891
> 
>  =
> 
>  Currently -fzero-call-used-regs will use a pattern of:
> 
>  XOR regA,regA
>  MOV regA,regB
>  MOV regA,regC
>  ...
>  RET
> 
>  However, this introduces both a register ordering dependency (e.g. 
>  the CPU cannot clear regB without clearing regA first), and while 
>  greatly reduces available ROP gadgets, it does technically leave a 
>  set of "MOV" ROP gadgets at the end of functions (e.g. "MOV 
>  regA,regC; RET").
> 
>  Please switch to always using XOR:
> 
>  XOR regA,regA
>  XOR regB,regB
>  XOR regC,regC
>  ...
>  RET
> 
>  ===
> 
>  This patch switch all MOV to XOR on i386.
> 
>  Bootstrapped and regresstion tested on x86_64-linux-gnu.
> 
>  Okay for gcc13?
> 
>  Thanks.
> 
>  Qing
> 
>  ==
> >>>
>  gcc/ChangeLog:
> 
>  * config/i386/i386.cc (zero_all_mm_registers): Use SET to zero 
>  instead
>  of MOV for zeroing scratch registers.
>  (ix86_zero_call_used_regs): Likewise.
> 
>  gcc/testsuite/ChangeLog:
> 
>  * gcc.target/i386/zero-scratch-regs-1.c: Add -fno-stack-protector
>  -fno-PIC.
>  * gcc.target/i386/zero-scratch-regs-10.c: Adjust mov to xor.
>  * gcc.target/i386/zero-scratch-regs-13.c: Add -msse.
>  * gcc.target/i386/zero-scratch-regs-14.c: Adjust mov to xor.
>  * gcc.target/i386/zero-scratch-regs-15.c: Add -fno-stack-protector
>  -fno-PIC.
>  * gcc.target/i386/zero-scratch-regs-16.c: Likewise.
>  * gcc.target/i386/zero-scratch-regs-17.c: Likewise.
>  * gcc.target/i386/zero-scratch-regs-18.c: Add -fno-stack-protector
>  -fno-PIC, adjust mov to xor.
>  * gcc.target/i386/zero-scratch-regs-19.c: Add -fno-stack-protector
>  -fno-PIC.
>  * gcc.target/i386/zero-scratch-regs-2.c: Adjust mov to xor.
>  * gcc.target/i386/zero-scratch-regs-20.c: Add -msse.
>  * gcc.target/i386/zero-scratch-regs-21.c: Add -fno-stack-protector
>  -fno-PIC, Adjust mov to xor.
>  * gcc.target/i386/zero-scratch-regs-22.c: Adjust mov to xor.
>  * gcc.target/i386/zero-scratch-regs-23.c: Likewise.
>  * gcc.target/i386/zero-scratch-regs-26.c: Likewise.
>  * gcc.target/i386/zero-scratch-regs-27.c: Likewise.
>  * gcc.target/i386/zero-scratch-regs-28.c: Likewise.
>  * gcc.target/i386/zero-scratch-regs-3.c: Add -fno-stack-protector.
>  * gcc.target/i386/zero-scratch-regs-31.c: Adjust mov to xor.
>  * gcc.target/i386/zero-scratch-regs-4.c: Add -fno-stack-protector
>  -fno-PIC.
>  * gcc.target/i386/zero-scratch-regs-5.c: Adjust mov to xor.
>  * gcc.target/i386/zero-scratch-regs-6.c: Add -fno-stack-protector.
>  * gcc.targ

[PATCH, committed] rs6000: Skip debug insns for union [PR105627]

2022-05-23 Thread Kewen.Lin via Gcc-patches
Hi,

Bootstrapped and regress-tested on powerpc64-linux-gnu P8 and
powerpc64le-linux-gnu P8, P9 and P10.

Pushed this as r13-720-g149d04ccbb908b3a251485b43faf204752942b9f.

---
As PR105627 exposes, pass analyze_swaps should skip debug
insn when doing unionfind_union.  One debug insn can use
several pseudos, if we take debug insn into account, we can
union those insns defining them and generate some unexpected
unions.

Based on the assumption that it's impossible to have one
pseudo which is defined by one debug insn but is used by one
nondebug insn, we just asserts debug insn never shows up in
function union_defs.

PR target/105627

gcc/ChangeLog:

* config/rs6000/rs6000-p8swap.cc (union_defs): Assert def_insn can't
be a debug insn.
(union_uses): Skip debug use_insn.

gcc/testsuite/ChangeLog:

* gcc.target/powerpc/pr105627.c: New test.
---
 gcc/config/rs6000/rs6000-p8swap.cc  | 10 
 gcc/testsuite/gcc.target/powerpc/pr105627.c | 26 +
 2 files changed, 32 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr105627.c

diff --git a/gcc/config/rs6000/rs6000-p8swap.cc 
b/gcc/config/rs6000/rs6000-p8swap.cc
index d301bc3fe59..275702fee1b 100644
--- a/gcc/config/rs6000/rs6000-p8swap.cc
+++ b/gcc/config/rs6000/rs6000-p8swap.cc
@@ -214,8 +214,9 @@ union_defs (swap_web_entry *insn_entry, rtx insn, df_ref 
use)
   if (DF_REF_INSN_INFO (link->ref))
{
  rtx def_insn = DF_REF_INSN (link->ref);
- (void)unionfind_union (insn_entry + INSN_UID (insn),
-insn_entry + INSN_UID (def_insn));
+ gcc_assert (NONDEBUG_INSN_P (def_insn));
+ unionfind_union (insn_entry + INSN_UID (insn),
+  insn_entry + INSN_UID (def_insn));
}

   link = link->next;
@@ -242,8 +243,9 @@ union_uses (swap_web_entry *insn_entry, rtx insn, df_ref 
def)
   if (DF_REF_INSN_INFO (link->ref))
{
  rtx use_insn = DF_REF_INSN (link->ref);
- (void)unionfind_union (insn_entry + INSN_UID (insn),
-insn_entry + INSN_UID (use_insn));
+ if (NONDEBUG_INSN_P (use_insn))
+   unionfind_union (insn_entry + INSN_UID (insn),
+insn_entry + INSN_UID (use_insn));
}

   link = link->next;
diff --git a/gcc/testsuite/gcc.target/powerpc/pr105627.c 
b/gcc/testsuite/gcc.target/powerpc/pr105627.c
new file mode 100644
index 000..bafb31ff061
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr105627.c
@@ -0,0 +1,26 @@
+/* Specify -w to disable some warnings, such as: -Wpsabi.  */
+/* { dg-options "-Og -fcompare-debug -mdejagnu-cpu=power8 -w" } */
+
+typedef unsigned char __attribute__ ((__vector_size__ (8))) U;
+typedef unsigned char __attribute__ ((__vector_size__ (64))) V;
+
+U u;
+char c;
+V v;
+
+V
+foo (void)
+{
+  V w = c
+   & __builtin_shufflevector (u, (V){0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 5},
+  24, 24, 41, 45, 53, 60, 22, 35, 45, 12, 61,
+  9, 52, 15, 44, 46, 5, 5, 1, 0, 4, 9, 0, 8, 5,
+  7, 2, 5, 9, 2, 7, 7, 5, 6, 0, 2, 6, 1, 7, 7,
+  0, 4, 0, 1, 7, 2, 5, 3, 2, 3, 5, 6, 6, 6, 0,
+  6, 1, 9, 0, 5, 4, 3, 5, 4);
+  w = w + v;
+  return w;
+}
+
--
2.27.0


Re: [PATCH] middle-end/105604 - snprintf dianostics and non-constant sizes/offsets

2022-05-23 Thread Martin Sebor via Gcc-patches

On 5/19/22 05:39, Richard Biener wrote:

On Wed, 18 May 2022, Martin Sebor wrote:


On 5/18/22 00:26, Richard Biener wrote:

On Tue, 17 May 2022, Martin Sebor wrote:


On 5/16/22 03:16, Richard Biener wrote:

The following tries to correct get_origin_and_offset_r not handling
non-constant sizes of array elements in ARRAY_REFs and non-constant
offsets of COMPONENT_REFs.  It isn't exactly clear how such failures
should be treated in this API and existing handling isn't consistent
here either.  The following applies two different variants, treating
non-constant array sizes like non-constant array indices and
treating non-constant offsets of COMPONENT_REFs by terminating
the recursion (not sure what that means to the callers).

Basically the code failed to use component_ref_field_offset and
array_ref_element_size and instead relies on inappropriate
helpers (that shouldn't exist in the first place ...).  The code
is also not safe-guarded against overflows in the final offset/size
computations but I'm not trying to rectify that.

Martin - can you comment on how the API should handle such
situations?


It looks like the -Wrestrict warning here ignores offsets equal to
HOST_WIDE_INT_MIN so presumably setting dst_offset (via *fldoff) to
that should avoid it.  Or maybe to HWI_MAX as it does for variable
offsets.


Can you suggest wording for the function comment as to how it handles
the case when offset or size cannot be determined exactly?   The
comment currently only suggests that the caller possibly cannot
trust fldsize or off when the function returns NULL but the actual
implementation differs from that.







It also looks like the function only handles constant offsets and
sizes, and I have a vague recollection of enhancing it to work with
ranges.  That should avoid the overflow problem too.


So the correct thing is to return NULL?


No, I don't think so.  The recursive get_origin_and_offset_r() assumes
its own invocations never return null (the one place it does that should
probably be moved to the nonrecursive caller).



Is the patch OK as-is?


It's an improvement but it's not complete as the following also ICEs
(albeit somewhere else):

void* f (void);

void g (int n)
{
   struct {
 char a[n], b[];
   } *p = f ();

   __builtin_sprintf (p->b, "%s", p->a);
}

With the ICE fixed the warning triggers.  That's not ideal but it's
unavoidable given the IR (I believe I mentioned this caveat some time
back).  This is the same as for:

   struct {
 char a[8], b[8];
   } *p = f ();

   __builtin_sprintf (&p->b[n], "%s", p->a);

because the IR looks more or less the same for &p->a[n] as it is for
&p->b[n].


As said, I'm not sure how the caller interprets
the result and how it can distinguish the exact vs. non-exact cases
or what a "conservative" inexact answer would be.


The warning triggers in both the certain cases and the inexact
ones like the one above when an overlap cannot be ruled out.  To
differentiate the two it's phrased as "may overlap".  The handling
is in maybe_warn_overlap().



Please help properly documenting this API.


I can spend some time in the next few days to page it all in, see
if I can clean it up a bit in addition to fixing the ICEs and
improve the comment.  Let me know if you have a different
preference.


That works for me - thanks for taking it from here.

Attached is a slightly enhanced patch that fixes both of the ICEs,
improves the comments, and adds more tests.  I tested it on x86_64.
Let me know if there's something else you'd like me to do here.

Martin

PR middle-end/105604 - ICE: in tree_to_shwi with vla in struct and sprintf

gcc/ChangeLog:

	PR middle-end/105604
	* gimple-ssa-sprintf.cc (set_aggregate_size_and_offset): Add comments.
	(get_origin_and_offset_r): Remove null handling.  Handle variable array
	sizes.
	(get_origin_and_offset): Handle null argument here.  Simplify.
	(alias_offset):
	* pointer-query.cc (field_at_offset): Update comment.

gcc/testsuite/ChangeLog:

	PR middle-end/105604
	* gcc.dg/Wrestrict-24.c: New test.
	* gcc.dg/Wrestrict-25.c: New test.
	* gcc.dg/Wrestrict-26.c: New test.

diff --git a/gcc/gimple-ssa-sprintf.cc b/gcc/gimple-ssa-sprintf.cc
index 8202129667e..6bd27302213 100644
--- a/gcc/gimple-ssa-sprintf.cc
+++ b/gcc/gimple-ssa-sprintf.cc
@@ -2232,8 +2232,9 @@ format_character (const directive &dir, tree arg, pointer_query &ptr_qry)
 }
 
 /* If TYPE is an array or struct or union, increment *FLDOFF by the starting
-   offset of the member that *OFF point into and set *FLDSIZE to its size
-   in bytes and decrement *OFF by the same.  Otherwise do nothing.  */
+   offset of the member that *OFF points into if one can be determined and
+   set *FLDSIZE to its size in bytes and decrement *OFF by the same.
+   Otherwise do nothing.  */
 
 static void
 set_aggregate_size_and_offset (tree type, HOST_WIDE_INT *fldoff,
@@ -2249,9 +2250,9 @@ set_aggregate_size_and_offset (tree type, HOST_WIDE_INT *fldoff,
   if (array_elt_at_offset (type, *off, 

[PATCH] libiberty: remove FINAL and OVERRIDE from ansidecl.h

2022-05-23 Thread David Malcolm via Gcc-patches
libiberty's ansidecl.h provides macros FINAL and OVERRIDE to allow
virtual functions to be labelled with the C++11 "final" and "override"
specifiers, but with empty implementations on pre-C++11 C++ compilers.

We've used the macros in many places in GCC, but as of as of GCC 11
onwards GCC has required a C++11 compiler, such as GCC 4.8 or later.
On the assumption that any such compiler correctly implements "final"
and "override", I've recently simplified GCC's codebase by replacing all
uses of the FINAL and OVERRIDE macros in GCC's source tree with the
lower-case specifiers (via commits r13-690-gff171cb13df671 and
r13-716-g8473ef7be60443).

Here's a patch to eliminate the macros from ansidecl.h which I was
hoping to apply to GCC to complete this transition - but ansidecl.h is
shared with other projects.

I've successfully bootstrapped & regrtested GCC trunk on
x86_64-pc-linux-gnu with this patch.

Of the various other GNU projects using libiberty implemented in C++,
does anyone support being built with a pre-C++11 compiler, or does
everyone assume C++11 or later?  Is anyone else still using these
macros?

Any objections, or is there a reason to keep these macros that I'm
not aware of?  (and did I send this to all the pertinent lists?)

Thanks
Dave

include/ChangeLog:
* ansidecl.h: Drop macros OVERRIDE and FINAL.

Signed-off-by: David Malcolm 
---
 include/ansidecl.h | 41 -
 1 file changed, 41 deletions(-)

diff --git a/include/ansidecl.h b/include/ansidecl.h
index 46fe3ffabd9..056a03ebb6e 100644
--- a/include/ansidecl.h
+++ b/include/ansidecl.h
@@ -321,47 +321,6 @@ So instead we use the macro below and test it against 
specific values.  */
 #define CONSTEXPR
 #endif
 
-/* C++11 adds the ability to add "override" after an implementation of a
-   virtual function in a subclass, to:
- (A) document that this is an override of a virtual function
- (B) allow the compiler to issue a warning if it isn't (e.g. a mismatch
- of the type signature).
-
-   Similarly, it allows us to add a "final" to indicate that no subclass
-   may subsequently override the vfunc.
-
-   Provide OVERRIDE and FINAL as macros, allowing us to get these benefits
-   when compiling with C++11 support, but without requiring C++11.
-
-   For gcc, use "-std=c++11" to enable C++11 support; gcc 6 onwards enables
-   this by default (actually GNU++14).  */
-
-#if defined __cplusplus
-# if __cplusplus >= 201103
-   /* C++11 claims to be available: use it.  Final/override were only
-  implemented in 4.7, though.  */
-#  if GCC_VERSION < 4007
-#   define OVERRIDE
-#   define FINAL
-#  else
-#   define OVERRIDE override
-#   define FINAL final
-#  endif
-# elif GCC_VERSION >= 4007
-   /* G++ 4.7 supports __final in C++98.  */
-#  define OVERRIDE
-#  define FINAL __final
-# else
-   /* No C++11 support; leave the macros empty.  */
-#  define OVERRIDE
-#  define FINAL
-# endif
-#else
-  /* No C++11 support; leave the macros empty.  */
-# define OVERRIDE
-# define FINAL
-#endif
-
 /* A macro to disable the copy constructor and assignment operator.
When building with C++11 and above, the methods are explicitly
deleted, causing a compile-time error if something tries to copy.
-- 
2.26.3



Re: [PATCH] RISC-V: Enable TARGET_SUPPORTS_WIDE_INT

2022-05-23 Thread Palmer Dabbelt

On Mon, 23 May 2022 14:58:29 PDT (-0700), Vineet Gupta wrote:

Ping ! With commit restrictions relaxed now, can this be added to trunk
now ?


Committed, with some fixups to indentation and to handle the .c -> .cc 
move (which git didn't figure out for this one, not exactly sure why).




Thx,
-Vineet

On 2/6/22 22:06, Vineet Gupta wrote:

This is at par with other major arches such as aarch64, i386, s390 ...

No testsuite regressions: same numbers w/ w/o

|   === gcc Summary ===
|
|# of expected passes   113392
|# of unexpected failures   27
|# of unexpected successes  3
|# of expected failures 605
|# of unsupported tests 2523
|
|   === g++ Summary ===
|
|# of expected passes   172997
|# of unexpected failures   26
|# of expected failures 706
|# of unsupported tests 9566

Signed-off-by: Vineet Gupta 
---
  gcc/config/riscv/predicates.md | 2 +-
  gcc/config/riscv/riscv.c   | 6 ++
  gcc/config/riscv/riscv.h   | 2 ++
  3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 3da6fd4c0491..cf902229954b 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -52,7 +52,7 @@
 (match_test "INTVAL (op) + 1 != 0")))

  (define_predicate "const_0_operand"
-  (and (match_code "const_int,const_wide_int,const_double,const_vector")
+  (and (match_code "const_int,const_wide_int,const_vector")
 (match_test "op == CONST0_RTX (GET_MODE (op))")))

  (define_predicate "reg_or_0_operand"
diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
index c830cd8f4ad1..d2f2d9e0276f 100644
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -1774,6 +1774,12 @@ riscv_rtx_costs (rtx x, machine_mode mode, int 
outer_code, int opno ATTRIBUTE_UN
  case SYMBOL_REF:
  case LABEL_REF:
  case CONST_DOUBLE:
+  /* With TARGET_SUPPORTS_WIDE_INT const int can't be in CONST_DOUBLE
+ rtl object. Weird recheck due to switch-case fall through above.  */
+  if (GET_CODE (x) == CONST_DOUBLE)
+gcc_assert (GET_MODE (x) != VOIDmode);
+  /* Fall through.  */
+
  case CONST:
if ((cost = riscv_const_insns (x)) > 0)
{
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index ff6729aedac2..91cfc82b4aa4 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -997,4 +997,6 @@ extern void riscv_remove_unneeded_save_restore_calls (void);

  #define HARD_REGNO_RENAME_OK(FROM, TO) riscv_hard_regno_rename_ok (FROM, TO)

+#define TARGET_SUPPORTS_WIDE_INT 1
+
  #endif /* ! GCC_RISCV_H */


[committed] test plugins: use "final" and "override" directly, rather than via macros

2022-05-23 Thread David Malcolm via Gcc-patches
Tested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-716-g8473ef7be60443.

gcc/testsuite/ChangeLog:
* gcc.dg/plugin/analyzer_gil_plugin.c: Replace uses of "FINAL" and
"OVERRIDE" with "final" and "override".

Signed-off-by: David Malcolm 
---
 .../gcc.dg/plugin/analyzer_gil_plugin.c   | 36 +--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/plugin/analyzer_gil_plugin.c 
b/gcc/testsuite/gcc.dg/plugin/analyzer_gil_plugin.c
index 12d1c8d0ba2..b5ae128e2e2 100644
--- a/gcc/testsuite/gcc.dg/plugin/analyzer_gil_plugin.c
+++ b/gcc/testsuite/gcc.dg/plugin/analyzer_gil_plugin.c
@@ -47,13 +47,13 @@ class gil_state_machine : public state_machine
 public:
   gil_state_machine (logger *logger);
 
-  bool inherited_state_p () const FINAL OVERRIDE { return false; }
+  bool inherited_state_p () const final override { return false; }
 
   bool on_stmt (sm_context *sm_ctxt,
const supernode *node,
-   const gimple *stmt) const FINAL OVERRIDE;
+   const gimple *stmt) const final override;
 
-  bool can_purge_p (state_t s) const FINAL OVERRIDE;
+  bool can_purge_p (state_t s) const final override;
 
   void check_for_pyobject_usage_without_gil (sm_context *sm_ctxt,
 const supernode *node,
@@ -82,12 +82,12 @@ class gil_diagnostic : public pending_diagnostic
 {
 public:
   /* There isn't a warning ID for us to use.  */
-  int get_controlling_option () const FINAL OVERRIDE
+  int get_controlling_option () const final override
   {
 return 0;
   }
 
-  location_t fixup_location (location_t loc) const FINAL OVERRIDE
+  location_t fixup_location (location_t loc) const final override
   {
 /* Ideally we'd check for specific macros here, and only
resolve certain macros.  */
@@ -98,7 +98,7 @@ public:
   }
 
   label_text describe_state_change (const evdesc::state_change &change)
-FINAL OVERRIDE
+final override
   {
 if (change.is_global_p ()
&& change.m_new_state == m_sm.m_released_gil)
@@ -125,25 +125,25 @@ class double_save_thread : public gil_diagnostic
   : gil_diagnostic (sm), m_call (call)
   {}
 
-  const char *get_kind () const FINAL OVERRIDE
+  const char *get_kind () const final override
   {
 return "double_save_thread";
   }
 
-  bool subclass_equal_p (const pending_diagnostic &base_other) const OVERRIDE
+  bool subclass_equal_p (const pending_diagnostic &base_other) const override
   {
 const double_save_thread &sub_other
   = (const double_save_thread &)base_other;
 return m_call == sub_other.m_call;
   }
 
-  bool emit (rich_location *rich_loc) FINAL OVERRIDE
+  bool emit (rich_location *rich_loc) final override
   {
 return warning_at (rich_loc, get_controlling_option (),
   "nested usage of %qs", "Py_BEGIN_ALLOW_THREADS");
   }
 
-  label_text describe_final_event (const evdesc::final_event &ev) FINAL 
OVERRIDE
+  label_text describe_final_event (const evdesc::final_event &ev) final 
override
   {
 return ev.formatted_print ("nested usage of %qs here",
   "Py_BEGIN_ALLOW_THREADS");
@@ -162,12 +162,12 @@ class fncall_without_gil : public gil_diagnostic
 m_arg_idx (arg_idx)
   {}
 
-  const char *get_kind () const FINAL OVERRIDE
+  const char *get_kind () const final override
   {
 return "fncall_without_gil";
   }
 
-  bool subclass_equal_p (const pending_diagnostic &base_other) const OVERRIDE
+  bool subclass_equal_p (const pending_diagnostic &base_other) const override
   {
 const fncall_without_gil &sub_other
   = (const fncall_without_gil &)base_other;
@@ -176,7 +176,7 @@ class fncall_without_gil : public gil_diagnostic
&& m_arg_idx == sub_other.m_arg_idx);
   }
 
-  bool emit (rich_location *rich_loc) FINAL OVERRIDE
+  bool emit (rich_location *rich_loc) final override
   {
 auto_diagnostic_group d;
 if (m_callee_fndecl)
@@ -191,7 +191,7 @@ class fncall_without_gil : public gil_diagnostic
 m_arg_idx + 1, m_callee_fndecl);
   }
 
-  label_text describe_final_event (const evdesc::final_event &ev) FINAL 
OVERRIDE
+  label_text describe_final_event (const evdesc::final_event &ev) final 
override
   {
 if (m_callee_fndecl)
   return ev.formatted_print ("use of PyObject as argument %i of %qE here"
@@ -216,25 +216,25 @@ class pyobject_usage_without_gil : public gil_diagnostic
   : gil_diagnostic (sm), m_expr (expr)
   {}
 
-  const char *get_kind () const FINAL OVERRIDE
+  const char *get_kind () const final override
   {
 return "pyobject_usage_without_gil";
   }
 
-  bool subclass_equal_p (const pending_diagnostic &base_other) const OVERRIDE
+  bool subclass_equal_p (const pending_diagnostic &base_other) const override
   {
 return same_tree_p (m_expr,
((const pyobject_usage_without_gil&)base_other).m_expr);
   }
 
-  bool emit (rich_location *rich_loc) FINAL OV

Re: [PATCH] RISC-V: Enable TARGET_SUPPORTS_WIDE_INT

2022-05-23 Thread Vineet Gupta
Ping ! With commit restrictions relaxed now, can this be added to trunk 
now ?


Thx,
-Vineet

On 2/6/22 22:06, Vineet Gupta wrote:

This is at par with other major arches such as aarch64, i386, s390 ...

No testsuite regressions: same numbers w/ w/o

|   === gcc Summary ===
|
|# of expected passes   113392
|# of unexpected failures   27
|# of unexpected successes  3
|# of expected failures 605
|# of unsupported tests 2523
|
|   === g++ Summary ===
|
|# of expected passes   172997
|# of unexpected failures   26
|# of expected failures 706
|# of unsupported tests 9566

Signed-off-by: Vineet Gupta 
---
  gcc/config/riscv/predicates.md | 2 +-
  gcc/config/riscv/riscv.c   | 6 ++
  gcc/config/riscv/riscv.h   | 2 ++
  3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 3da6fd4c0491..cf902229954b 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -52,7 +52,7 @@
 (match_test "INTVAL (op) + 1 != 0")))
  
  (define_predicate "const_0_operand"

-  (and (match_code "const_int,const_wide_int,const_double,const_vector")
+  (and (match_code "const_int,const_wide_int,const_vector")
 (match_test "op == CONST0_RTX (GET_MODE (op))")))
  
  (define_predicate "reg_or_0_operand"

diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
index c830cd8f4ad1..d2f2d9e0276f 100644
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -1774,6 +1774,12 @@ riscv_rtx_costs (rtx x, machine_mode mode, int 
outer_code, int opno ATTRIBUTE_UN
  case SYMBOL_REF:
  case LABEL_REF:
  case CONST_DOUBLE:
+  /* With TARGET_SUPPORTS_WIDE_INT const int can't be in CONST_DOUBLE
+ rtl object. Weird recheck due to switch-case fall through above.  */
+  if (GET_CODE (x) == CONST_DOUBLE)
+gcc_assert (GET_MODE (x) != VOIDmode);
+  /* Fall through.  */
+
  case CONST:
if ((cost = riscv_const_insns (x)) > 0)
{
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index ff6729aedac2..91cfc82b4aa4 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -997,4 +997,6 @@ extern void riscv_remove_unneeded_save_restore_calls (void);
  
  #define HARD_REGNO_RENAME_OK(FROM, TO) riscv_hard_regno_rename_ok (FROM, TO)
  
+#define TARGET_SUPPORTS_WIDE_INT 1

+
  #endif /* ! GCC_RISCV_H */




Re: [x86 PATCH] Optimize double word negation of zero extended values.

2022-05-23 Thread Uros Bizjak via Gcc-patches
On Mon, May 23, 2022 at 9:40 PM Roger Sayle  wrote:
>
>
> It's not uncommon for GCC to convert between a (zero or one) Boolean
> value and a (zero or all ones) mask value, possibly of a wider type,
> using negation.
>
> Currently on x86_64, the following simple test case:
> __int128 foo(unsigned long x) { return -(__int128)x; }
>
> compiles with -O2 to:
>
> movq%rdi, %rax
> xorl%edx, %edx
> negq%rax
> adcq$0, %rdx
> negq%rdx
> ret
>
> with this patch, which adds an additional peephole2 to i386.md,
> we instead generate the improved:
>
> movq%rdi, %rax
> negq%rax
> sbbq%rdx, %rdx
> ret
>
> [and likewise for the (DImode) long long version using -m32.]
> A peephole2 is appropriate as the double word negation and the
> operation providing the xor are typically only split after combine.
>
> In fact, the new peephole2 sequence:
> ;; Convert:
> ;;   xorl %edx, %edx
> ;;   negl %eax
> ;;   adcl $0, %edx
> ;;   negl %edx
> ;; to:
> ;;   negl %eax
> ;;   sbbl %edx, %edx// *x86_movcc_0_m1
>
> is nearly identical to (and placed immediately after) the existing:
> ;; Convert:
> ;;   mov %esi, %edx
> ;;   negl %eax
> ;;   adcl $0, %edx
> ;;   negl %edx
> ;; to:
> ;;   xorl %edx, %edx
> ;;   negl %eax
> ;;   sbbl %esi, %edx
>
>
> One potential objection/concern is that "sbb? %reg,%reg" may possibly be
> incorrectly perceived as a false register dependency on older hardware,
> much like "xor? %reg,%reg" may be perceived as a false dependency on
> really old hardware.  This doesn't currently appear to be a concern
> for the i386 backend's *x86_movcc_0_m1 as shown by the following
> test code:
>
> int bar(unsigned int x, unsigned int y) {
>   return x > y ? -1 : 0;
> }
>
> which currently generates a "naked" sbb:
> cmp esi, edi
> sbb eax, eax
> ret
>
> If anyone does potentially encounter a stall, it would be easy to add
> a splitter or peephole2 controlled by a tuning flag to insert an additional
> xor to break the false dependency chain (when not optimizing for size),
> but I don't believe this is required on recent microarchitectures.
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}, with
> no new failures.  Ok for mainline?
>
>
> 2022-05-23  Roger Sayle  
>
> gcc/ChangeLog
> * config/i386/i386.md (peephole2): Convert xor;neg;adc;neg,
> i.e. a double word negation of a zero extended operand, to
> neg;sbb.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/neg-zext-1.c: New test case for ia32.
> * gcc.target/i386/neg-zext-2.c: New test case for int128.

[1] suggests that sbb reg,reg breaks dependency chain only for AMD
targets (bulldozer/zen/bobcat/jaguar) and not for Intel. However, we
use "naked" sbb extensively and nobody complained about it, so I guess
the patch is OK.

[1] https://reviews.llvm.org/D116804

Thanks,
Uros.

>
> Thanks in advance,
> Roger
> --
>


Re: [PATCH] c++: set TYPE_CANONICAL for most templated types

2022-05-23 Thread Patrick Palka via Gcc-patches
On 5/18/22, Jason Merrill wrote:
> On 5/16/22 15:58, Patrick Palka wrote:
> > When processing a class template specialization, lookup_template_class
> > uses structural equality for the specialized type whenever one of its
> > template arguments uses structural equality.  This the sensible thing to
> > do in a vacuum, but given that we already effectively deduplicate class
> > specializations via the spec_hasher, it seems to me we can safely assume
> > that each class specialization is unique and therefore canonical,
> > regardless of the structure of the template arguments.
> 
> Makes sense.
> 
> > To that end this patch makes us use the canonical type machinery for all
> > type specializations except for the case where a PARM_DECL appears in
> > the template arguments (added in r12-3766-g72394d38d929c7).
> > 
> > Additionally, this patch makes us use the canonical type machinery for
> > TEMPLATE_TEMPLATE_PARMs and BOUND_TEMPLATE_TEMPLATE_PARMs, by extending
> > canonical_type_parameter appropriately.  A comment in tsubst says it's
> > unsafe to set TYPE_CANONICAL for a lowered TEMPLATE_TEMPLATE_PARM, but
> > I'm not sure I understand it.
> 
> I think that comment from r120341 became obsolete when r129844 (later that
> year) started to substitute the template parms of ttps.

Ah, I see.  I'll make note of this in the v2 commit message.

> 
> > Note that r10-7817-ga6f400239d792d
> > recently changed process_template_parm to clear TYPE_CANONICAL for
> > TEMPLATE_TEMPLATE_PARM consistent with the tsubst comment; this patch
> > changes both functions to set instead of clear TYPE_CANONICAL for ttps.
> > 
> > This change improves compile time of heavily templated code by around 10%
> > for me (with a release compiler).  For instance, compile time for the
> > libstdc++ test std/ranges/adaptors/all.cc drops from 1.45s to 1.25s, and
> > for the range-v3 test test/view/zip.cpp it goes from 5.38s to 4.88s.
> > The total number of calls to structural_comptypes for the latter test
drops from 8.5M to 1.5M.  Memory use is unchanged (unsurprisingly).
> 
> Nice!
> 
> > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> > trunk?  Also tested on cmcstl2 and range-v3 and various boost libraries.
> > Will also do more testing overnight...
> 
> One comment below.
> 
> > gcc/cp/ChangeLog:
> > 
> > * pt.cc (any_template_arguments_need_structural_equality_p):
> > Remove.
> > (struct ctp_hasher): Define.
> > (ctp_table): Define.
> > (canonical_type_parameter): Use it.
> > (process_template_parm): Set TYPE_CANONICAL for
> > TEMPLATE_TEMPLATE_PARM too.
> > (lookup_template_class_1): Don't call a_t_a_n_s_e_p.  Inline
> > the PARM_DECL special case from that subroutine into here.
> > (tsubst) : Remove special
> > TYPE_CANONICAL handling specific to ttps, and perform the
> > remaining handling later.
> > (find_parm_usage_r): Remove.
> > * tree.cc (bind_template_template_parm): Set TYPE_CANONICAL
> > when safe to do so.
> > * typeck.cc (structural_comptypes) [check_alias]: Increment
> > processing_template_decl before using
> > dependent_alias_template_spec_p.
> > ---
> >   gcc/cp/pt.cc | 166 ---
> >   gcc/cp/tree.cc   |  16 -
> >   gcc/cp/typeck.cc |   2 +
> >   3 files changed, 73 insertions(+), 111 deletions(-)
> > 
> > diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
> > index fa05e9134df..76562877355 100644
> > --- a/gcc/cp/pt.cc
> > +++ b/gcc/cp/pt.cc
> > @@ -203,7 +203,6 @@ static tree copy_default_args_to_explicit_spec_1 (tree,
> > tree);
> >   static void copy_default_args_to_explicit_spec (tree);
> >   static bool invalid_nontype_parm_type_p (tree, tsubst_flags_t);
> >   static bool dependent_template_arg_p (tree);
> > -static bool any_template_arguments_need_structural_equality_p (tree);
> >   static bool dependent_type_p_r (tree);
> >   static tree tsubst_copy   (tree, tree, tsubst_flags_t, tree);
> >   static tree tsubst_decl (tree, tree, tsubst_flags_t);
> > @@ -4526,6 +4525,27 @@ build_template_parm_index (int index,
> > return t;
> >   }
> >   +struct ctp_hasher : ggc_ptr_hash
> > +{
> > +  static hashval_t hash (tree t)
> > +  {
> > +tree_code code = TREE_CODE (t);
> > +hashval_t val = iterative_hash_object (code, 0);
> > +val = iterative_hash_object (TEMPLATE_TYPE_LEVEL (t), val);
> > +val = iterative_hash_object (TEMPLATE_TYPE_IDX (t), val);
> > +if (TREE_CODE (t) == BOUND_TEMPLATE_TEMPLATE_PARM)
> > +  val = iterative_hash_template_arg (TYPE_TI_ARGS (t), val);
> > +return val;
> > +  }
> > +
> > +  static bool equal (tree t, tree u)
> > +  {
> > +return comptypes (t, u, COMPARE_STRUCTURAL);
> > +  }
> > +};
> > +
> > +static GTY (()) hash_table *ctp_table;
> > +
> >   /* Find the canonical type parameter for the given template type
> >  parameter.  Returns the canonical type parameter, which may be TYPE
> >  if no such parameter exist

Re: [PATCH] x86: Avoid uninitialized variable in PR target/104441 test

2022-05-23 Thread Uros Bizjak via Gcc-patches
On Mon, May 23, 2022 at 7:44 PM H.J. Lu  wrote:
>
> PR target/104441
> * gcc.target/i386/pr104441-1a.c (load8bit_4x4_avx2): Initialize
> src23.

LGTM.

Thanks,
Uros.

> ---
>  gcc/testsuite/gcc.target/i386/pr104441-1a.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1a.c 
> b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> index 83734f710bd..0931029f2bb 100644
> --- a/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> +++ b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
> @@ -8,7 +8,7 @@ __attribute__((always_inline, target("avx2")))
>  static __m256i
>  load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
>  {
> -  __m128i src01, src23;
> +  __m128i src01, src23 = _mm_setzero_si128();
>src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
>src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
>return _mm256_setr_m128i(src01, src23);
> --
> 2.36.1
>


[x86 PATCH] Optimize double word negation of zero extended values.

2022-05-23 Thread Roger Sayle

It's not uncommon for GCC to convert between a (zero or one) Boolean
value and a (zero or all ones) mask value, possibly of a wider type,
using negation.

Currently on x86_64, the following simple test case:
__int128 foo(unsigned long x) { return -(__int128)x; }

compiles with -O2 to:

movq%rdi, %rax
xorl%edx, %edx
negq%rax
adcq$0, %rdx
negq%rdx
ret

with this patch, which adds an additional peephole2 to i386.md,
we instead generate the improved:

movq%rdi, %rax
negq%rax
sbbq%rdx, %rdx
ret

[and likewise for the (DImode) long long version using -m32.]
A peephole2 is appropriate as the double word negation and the
operation providing the xor are typically only split after combine.

In fact, the new peephole2 sequence:
;; Convert:
;;   xorl %edx, %edx
;;   negl %eax
;;   adcl $0, %edx
;;   negl %edx
;; to:
;;   negl %eax
;;   sbbl %edx, %edx// *x86_movcc_0_m1

is nearly identical to (and placed immediately after) the existing:
;; Convert:
;;   mov %esi, %edx
;;   negl %eax
;;   adcl $0, %edx
;;   negl %edx
;; to:
;;   xorl %edx, %edx
;;   negl %eax
;;   sbbl %esi, %edx


One potential objection/concern is that "sbb? %reg,%reg" may possibly be
incorrectly perceived as a false register dependency on older hardware,
much like "xor? %reg,%reg" may be perceived as a false dependency on
really old hardware.  This doesn't currently appear to be a concern
for the i386 backend's *x86_movcc_0_m1 as shown by the following
test code:

int bar(unsigned int x, unsigned int y) {
  return x > y ? -1 : 0;
}

which currently generates a "naked" sbb:
cmp esi, edi
sbb eax, eax
ret

If anyone does potentially encounter a stall, it would be easy to add
a splitter or peephole2 controlled by a tuning flag to insert an additional
xor to break the false dependency chain (when not optimizing for size),
but I don't believe this is required on recent microarchitectures.


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}, with
no new failures.  Ok for mainline?


2022-05-23  Roger Sayle  

gcc/ChangeLog
* config/i386/i386.md (peephole2): Convert xor;neg;adc;neg,
i.e. a double word negation of a zero extended operand, to
neg;sbb.

gcc/testsuite/ChangeLog
* gcc.target/i386/neg-zext-1.c: New test case for ia32.
* gcc.target/i386/neg-zext-2.c: New test case for int128.


Thanks in advance,
Roger
--

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 792bae1..692f9b6 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -11039,6 +11039,46 @@
  (clobber (reg:CC FLAGS_REG))])]
   "ix86_expand_clear (operands[0]);")
 
+;; Convert:
+;;   xorl %edx, %edx
+;;   negl %eax
+;;   adcl $0, %edx
+;;   negl %edx
+;; to:
+;;   negl %eax
+;;   sbbl %edx, %edx   // *x86_movcc_0_m1
+
+(define_peephole2
+  [(parallel
+[(set (match_operand:SWI48 0 "general_reg_operand") (const_int 0))
+ (clobber (reg:CC FLAGS_REG))])
+   (parallel
+[(set (reg:CCC FLAGS_REG)
+ (ne:CCC (match_operand:SWI48 1 "general_reg_operand") (const_int 0)))
+ (set (match_dup 1) (neg:SWI48 (match_dup 1)))])
+   (parallel
+[(set (match_dup 0)
+ (plus:SWI48 (plus:SWI48
+   (ltu:SWI48 (reg:CC FLAGS_REG) (const_int 0))
+   (match_dup 0))
+ (const_int 0)))
+ (clobber (reg:CC FLAGS_REG))])
+   (parallel
+[(set (match_dup 0)
+ (neg:SWI48 (match_dup 0)))
+ (clobber (reg:CC FLAGS_REG))])]
+  "REGNO (operands[0]) != REGNO (operands[1])"
+  [(parallel
+[(set (reg:CCC FLAGS_REG)
+ (ne:CCC (match_dup 1) (const_int 0)))
+ (set (match_dup 1) (neg:SWI48 (match_dup 1)))])
+   (parallel
+[(set (match_dup 0)
+ (if_then_else:SWI48 (ltu:SWI48 (reg:CC FLAGS_REG) (const_int 0))
+ (const_int -1)
+ (const_int 0)))
+ (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "*neg_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=m")
(neg:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")))
diff --git a/gcc/testsuite/gcc.target/i386/neg-zext-1.c 
b/gcc/testsuite/gcc.target/i386/neg-zext-1.c
new file mode 100644
index 000..ec91fb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/neg-zext-1.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2" } */
+
+long long foo(unsigned int x) { return -(long long)x; }
+
+/* { dg-final { scan-assembler "sbb" } } */
+/* { dg-final { scan-assembler-not "adc" } } */
diff --git a/gcc/testsuite/gcc.target/i386/neg-zext-2.c 
b/gcc/testsuite/gcc.target/i386/neg-zext-2.c
new file mode 100644
index 000..a6ed077
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/neg-zext-2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target int128 } } */
+/* { d

Re: [PATCH] [PR/target 105666] RISC-V: Inhibit FP <--> int register moves via tune param

2022-05-23 Thread Philipp Tomsich
Good catch!

On Mon, 23 May 2022 at 20:12, Vineet Gupta  wrote:

> Under extreme register pressure, compiler can use FP <--> int
> moves as a cheap alternate to spilling to memory.
> This was seen with SPEC2017 FP benchmark 507.cactu:
> ML_BSSN_Advect.cc:ML_BSSN_Advect_Body()
>
> |   fmv.d.x fa5,s9  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
> | .LVL325:
> |   ld  s9,184(sp)  # _12469, %sfp
> | ...
> | .LVL339:
> |   fmv.x.d s4,fa5  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
> |
>
> The FMV instructions could be costlier (than stack spill) on certain
> micro-architectures, thus this needs to be a per-cpu tunable
> (default being to inhibit on all existing RV cpus).
>
> Testsuite run with new test reports 10 failures without the fix
> corresponding to the build variations of pr105666.c
>
> |   === gcc Summary ===
> |
> | # of expected passes  123318   (+10)
> | # of unexpected failures  34   (-10)
> | # of unexpected successes 4
> | # of expected failures780
> | # of unresolved testcases 4
> | # of unsupported tests2796
>
> gcc/Changelog:
>
> * config/riscv/riscv.cc: (struct riscv_tune_param): Add
>   fmv_cost.
> (rocket_tune_info): Add default fmv_cost 8.
> (sifive_7_tune_info): Ditto.
> (thead_c906_tune_info): Ditto.
> (optimize_size_tune_info): Ditto.
> (riscv_register_move_cost): Use fmv_cost for int<->fp moves.
>
> gcc/testsuite/Changelog:
>
> * gcc.target/riscv/pr105666.c: New test.
>
> Signed-off-by: Vineet Gupta 
> ---
>  gcc/config/riscv/riscv.cc |  9 
>  gcc/testsuite/gcc.target/riscv/pr105666.c | 55 +++
>  2 files changed, 64 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/pr105666.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index ee756aab6940..f3ac0d8865f0 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -220,6 +220,7 @@ struct riscv_tune_param
>unsigned short issue_rate;
>unsigned short branch_cost;
>unsigned short memory_cost;
> +  unsigned short fmv_cost;
>bool slow_unaligned_access;
>  };
>
> @@ -285,6 +286,7 @@ static const struct riscv_tune_param rocket_tune_info
> = {
>1,   /* issue_rate */
>3,   /* branch_cost */
>5,   /* memory_cost */
> +  8,   /* fmv_cost */
>true,/*
> slow_unaligned_access */
>  };
>
> @@ -298,6 +300,7 @@ static const struct riscv_tune_param
> sifive_7_tune_info = {
>2,   /* issue_rate */
>4,   /* branch_cost */
>3,   /* memory_cost */
> +  8,   /* fmv_cost */
>true,/*
> slow_unaligned_access */
>  };
>
> @@ -311,6 +314,7 @@ static const struct riscv_tune_param
> thead_c906_tune_info = {
>1,/* issue_rate */
>3,/* branch_cost */
>5,/* memory_cost */
> +  8,   /* fmv_cost */
>false,/* slow_unaligned_access */
>  };
>
> @@ -324,6 +328,7 @@ static const struct riscv_tune_param
> optimize_size_tune_info = {
>1,   /* issue_rate */
>1,   /* branch_cost */
>2,   /* memory_cost */
> +  8,   /* fmv_cost */
>false,   /* slow_unaligned_access */
>  };
>
> @@ -4737,6 +4742,10 @@ static int
>  riscv_register_move_cost (machine_mode mode,
>   reg_class_t from, reg_class_t to)
>  {
> +  if ((from == FP_REGS && to == GR_REGS) ||
> +  (from == GR_REGS && to == FP_REGS))
> +return tune_param->fmv_cost;
> +
>return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2;
>  }
>
> diff --git a/gcc/testsuite/gcc.target/riscv/pr105666.c
> b/gcc/testsuite/gcc.target/riscv/pr105666.c
> new file mode 100644
> index ..904f3bc0763f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/pr105666.c
> @@ -0,0 +1,55 @@
> +/* Shamelessly plugged off
> gcc/testsuite/gcc.c-torture/execute/pr28982a.c.
> +
> +   The idea is to induce high register pressure for both int/fp registers
> +   so that they spill. By default FMV instructions would be used to stash
> +   int reg to a fp reg (and vice-versa) but that could be costlier than
> +   spilling to stack.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64g -ffast-math" } */
> +
> +#define NITER 4
> +#define NVARS 20
> +#define MULTI(X) \
> +  X( 0), X( 1), X( 2), X( 3), X( 4), X( 5), X( 6), X( 7), 

[PATCH 01/10] Add 'final' and 'override' to opt_pass vfunc impls

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/ChangeLog:
* adjust-alignment.cc: Add "final" and "override" to opt_pass
vfunc implementations, removing redundant "virtual" as
appropriate.
* asan.cc: Likewise.
* auto-inc-dec.cc: Likewise.
* auto-profile.cc: Likewise.
* bb-reorder.cc: Likewise.
* cfgcleanup.cc: Likewise.
* cfgexpand.cc: Likewise.
* cfgrtl.cc: Likewise.
* cgraphbuild.cc: Likewise.
* combine-stack-adj.cc: Likewise.
* combine.cc: Likewise.
* compare-elim.cc: Likewise.
* config/i386/i386-features.cc: Likewise.
* coroutine-passes.cc: Likewise.
* cprop.cc: Likewise.
* cse.cc: Likewise.
* dce.cc: Likewise.
* df-core.cc: Likewise.
* dse.cc: Likewise.
* dwarf2cfi.cc: Likewise.
* early-remat.cc: Likewise.
* except.cc: Likewise.
* final.cc: Likewise.
* function.cc: Likewise.
* fwprop.cc: Likewise.
* gcse.cc: Likewise.
* gimple-harden-conditionals.cc: Likewise.
* gimple-if-to-switch.cc: Likewise.
* gimple-isel.cc: Likewise.
* gimple-laddress.cc: Likewise.
* gimple-loop-interchange.cc: Likewise.
* gimple-loop-jam.cc: Likewise.
* gimple-loop-versioning.cc: Likewise.
* gimple-low.cc: Likewise.
* gimple-ssa-backprop.cc: Likewise.
* gimple-ssa-evrp.cc: Likewise.
* gimple-ssa-isolate-paths.cc: Likewise.
* gimple-ssa-nonnull-compare.cc: Likewise.
* gimple-ssa-split-paths.cc: Likewise.
* gimple-ssa-store-merging.cc: Likewise.
* gimple-ssa-strength-reduction.cc: Likewise.
* gimple-ssa-warn-access.cc: Likewise.
* gimple-ssa-warn-alloca.cc: Likewise.
* gimple-ssa-warn-restrict.cc: Likewise.
* gimple-warn-recursion.cc: Likewise.
* graphite.cc: Likewise.
* ifcvt.cc: Likewise.
* init-regs.cc: Likewise.
* ipa-comdats.cc: Likewise.
* ipa-cp.cc: Likewise.
* ipa-devirt.cc: Likewise.
* ipa-fnsummary.cc: Likewise.
* ipa-free-lang-data.cc: Likewise.
* ipa-icf.cc: Likewise.
* ipa-inline.cc: Likewise.
* ipa-modref.cc: Likewise.
* ipa-profile.cc: Likewise.
* ipa-pure-const.cc: Likewise.
* ipa-reference.cc: Likewise.
* ipa-split.cc: Likewise.
* ipa-sra.cc: Likewise.
* ipa-visibility.cc: Likewise.
* ipa.cc: Likewise.
* ira.cc: Likewise.
* jump.cc: Likewise.
* loop-init.cc: Likewise.
* lower-subreg.cc: Likewise.
* mode-switching.cc: Likewise.
* modulo-sched.cc: Likewise.
* multiple_target.cc: Likewise.
* omp-expand.cc: Likewise.
* omp-low.cc: Likewise.
* omp-oacc-kernels-decompose.cc: Likewise.
* omp-oacc-neuter-broadcast.cc: Likewise.
* omp-offload.cc: Likewise.
* omp-simd-clone.cc: Likewise.
* passes.cc: Likewise.
* postreload-gcse.cc: Likewise.
* postreload.cc: Likewise.
* predict.cc: Likewise.
* recog.cc: Likewise.
* ree.cc: Likewise.
* reg-stack.cc: Likewise.
* regcprop.cc: Likewise.
* reginfo.cc: Likewise.
* regrename.cc: Likewise.
* reorg.cc: Likewise.
* sancov.cc: Likewise.
* sanopt.cc: Likewise.
* sched-rgn.cc: Likewise.
* stack-ptr-mod.cc: Likewise.
* store-motion.cc: Likewise.
* tracer.cc: Likewise.
* trans-mem.cc: Likewise.
* tree-call-cdce.cc: Likewise.
* tree-cfg.cc: Likewise.
* tree-cfgcleanup.cc: Likewise.
* tree-complex.cc: Likewise.
* tree-eh.cc: Likewise.
* tree-emutls.cc: Likewise.
* tree-if-conv.cc: Likewise.
* tree-into-ssa.cc: Likewise.
* tree-loop-distribution.cc: Likewise.
* tree-nrv.cc: Likewise.
* tree-object-size.cc: Likewise.
* tree-parloops.cc: Likewise.
* tree-predcom.cc: Likewise.
* tree-profile.cc: Likewise.
* tree-sra.cc: Likewise.
* tree-ssa-ccp.cc: Likewise.
* tree-ssa-copy.cc: Likewise.
* tree-ssa-dce.cc: Likewise.
* tree-ssa-dom.cc: Likewise.
* tree-ssa-dse.cc: Likewise.
* tree-ssa-forwprop.cc: Likewise.
* tree-ssa-ifcombine.cc: Likewise.
* tree-ssa-loop-ch.cc: Likewise.
* tree-ssa-loop-im.cc: Likewise.
* tree-ssa-loop-ivcanon.cc: Likewise.
* tree-ssa-loop-prefetch.cc: Likewise.
* tree-ssa-loop-split.cc: Likewise.
* tree-ssa-loop-unswitch.cc: Likewise.
* tree-ssa-loop.cc: Likewise.
* tree-ssa-math-opts.cc: Likewise.
* tree-ssa-phiopt.cc: Likewise.
* tree-ssa-phiprop.cc: Likewise.
* tree-ssa-pre.cc: Likewise.
* tree-ssa-reassoc.cc: Likewise.
* tree-ssa-sccvn.cc: Likewise.
* tree-ssa-sink.cc: Likewise

[PATCH 10/10] Add 'final' and 'override' in various places

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/cp/ChangeLog:
* cxx-pretty-print.h: Add "final" and "override" to various vfunc
implementations, removing redundant "virtual" as appropriate.
* module.cc: Likewise.

gcc/ChangeLog:
* genmatch.cc: Add "final" and "override" to various vfunc
implementations, removing redundant "virtual" as appropriate.
* gensupport.cc: Likewise.
* gimple-range-cache.h: Likewise.
* ipa-icf-gimple.h: Likewise.
* ipa-icf.h: Likewise.
* read-md.h: Likewise.
* read-rtl-function.cc: Likewise.
* tree-ssa-loop-ch.cc: Likewise.
* tree-ssa-sccvn.cc: Likewise.

gcc/lto/ChangeLog:
* lto-dump.cc: Add "final" and "override" to various vfunc
implementations, removing redundant "virtual" as appropriate.

Signed-off-by: David Malcolm 
---
 gcc/cp/cxx-pretty-print.h | 38 +++---
 gcc/cp/module.cc  |  4 ++--
 gcc/genmatch.cc   | 22 +++---
 gcc/gensupport.cc |  2 +-
 gcc/gimple-range-cache.h  |  4 ++--
 gcc/ipa-icf-gimple.h  |  6 --
 gcc/ipa-icf.h | 36 
 gcc/lto/lto-dump.cc   |  8 
 gcc/read-md.h |  2 +-
 gcc/read-rtl-function.cc  |  6 +++---
 gcc/tree-ssa-loop-ch.cc   |  4 ++--
 gcc/tree-ssa-sccvn.cc |  4 ++--
 12 files changed, 71 insertions(+), 65 deletions(-)

diff --git a/gcc/cp/cxx-pretty-print.h b/gcc/cp/cxx-pretty-print.h
index 5080f70a8e4..593bd91d4f7 100644
--- a/gcc/cp/cxx-pretty-print.h
+++ b/gcc/cp/cxx-pretty-print.h
@@ -36,25 +36,25 @@ public:
 
   pretty_printer *clone () const override;
 
-  void constant (tree);
-  void id_expression (tree);
-  void primary_expression (tree);
-  void postfix_expression (tree);
-  void unary_expression (tree);
-  void multiplicative_expression (tree);
-  void conditional_expression (tree);
-  void assignment_expression (tree);
-  void expression (tree);
-  void type_id (tree);
-  void statement (tree);
-  void declaration (tree);
-  void declaration_specifiers (tree);
-  void simple_type_specifier (tree);
-  void function_specifier (tree);
-  void declarator (tree);
-  void direct_declarator (tree);
-  void abstract_declarator (tree);
-  void direct_abstract_declarator (tree);
+  void constant (tree) final override;
+  void id_expression (tree) final override;
+  void primary_expression (tree) final override;
+  void postfix_expression (tree) final override;
+  void unary_expression (tree) final override;
+  void multiplicative_expression (tree) final override;
+  void conditional_expression (tree) final override;
+  void assignment_expression (tree) final override;
+  void expression (tree) final override;
+  void type_id (tree) final override;
+  void statement (tree) final override;
+  void declaration (tree) final override;
+  void declaration_specifiers (tree) final override;
+  void simple_type_specifier (tree) final override;
+  void function_specifier (tree) final override;
+  void declarator (tree) final override;
+  void direct_declarator (tree) final override;
+  void abstract_declarator (tree) final override;
+  void direct_abstract_declarator (tree) final override;
 
   /* This is the enclosing scope of the entity being pretty-printed.  */
   tree enclosing_scope;
diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc
index d1dc73724d1..e93151c98c2 100644
--- a/gcc/cp/module.cc
+++ b/gcc/cp/module.cc
@@ -1483,10 +1483,10 @@ private:
 
 protected:
   using allocator::grow;
-  virtual char *grow (char *, unsigned needed);
+  char *grow (char *, unsigned needed) final override;
 #if MAPPED_WRITING
   using allocator::shrink;
-  virtual void shrink (char *);
+  void shrink (char *) final override;
 #endif
 
 public:
diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
index 2b84b849330..a0b22c50ae3 100644
--- a/gcc/genmatch.cc
+++ b/gcc/genmatch.cc
@@ -723,9 +723,9 @@ public:
   bool force_leaf;
   /* If non-zero, the group for optional handling.  */
   unsigned char opt_grp;
-  virtual void gen_transform (FILE *f, int, const char *, bool, int,
- const char *, capture_info *,
- dt_operand ** = 0, int = 0);
+  void gen_transform (FILE *f, int, const char *, bool, int,
+ const char *, capture_info *,
+ dt_operand ** = 0, int = 0) override;
 };
 
 /* An operator that is represented by native C code.  This is always
@@ -757,9 +757,9 @@ public:
   unsigned nr_stmts;
   /* The identifier replacement vector.  */
   vec ids;
-  virtual void gen_transform (FILE *f, int, const char *, bool, int,
- const char *, capture_info *,
- dt_operand ** = 0, int = 0);
+  void gen_transform (FILE *f, int, const char *, bool, int,
+ const char *, capture_info *,
+ dt_operand ** = 0, int = 0) final override;
 };
 
 /* A wrapper around another operand that capt

[PATCH 05/10] d: add 'final' and 'override' to gcc/d/*.cc 'visit' impls

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/d/ChangeLog:
* decl.cc: Add "final" and "override" to all "visit" vfunc decls
as appropriate.
* expr.cc: Likewise.
* toir.cc: Likewise.
* typeinfo.cc: Likewise.
* types.cc: Likewise.

Signed-off-by: David Malcolm 
---
 gcc/d/decl.cc | 36 +-
 gcc/d/expr.cc |  2 +-
 gcc/d/toir.cc | 64 +++
 gcc/d/typeinfo.cc | 34 -
 gcc/d/types.cc| 30 +++---
 5 files changed, 83 insertions(+), 83 deletions(-)

diff --git a/gcc/d/decl.cc b/gcc/d/decl.cc
index f5c21078aad..5d850065bf0 100644
--- a/gcc/d/decl.cc
+++ b/gcc/d/decl.cc
@@ -149,13 +149,13 @@ public:
 
   /* This should be overridden by each declaration class.  */
 
-  void visit (Dsymbol *)
+  void visit (Dsymbol *) final override
   {
   }
 
   /* Compile a D module, and all members of it.  */
 
-  void visit (Module *d)
+  void visit (Module *d) final override
   {
 if (d->semanticRun >= PASS::obj)
   return;
@@ -166,7 +166,7 @@ public:
 
   /* Write the imported symbol to debug.  */
 
-  void visit (Import *d)
+  void visit (Import *d) final override
   {
 if (d->semanticRun >= PASS::obj)
   return;
@@ -218,7 +218,7 @@ public:
 
   /* Expand any local variables found in tuples.  */
 
-  void visit (TupleDeclaration *d)
+  void visit (TupleDeclaration *d) final override
   {
 for (size_t i = 0; i < d->objects->length; i++)
   {
@@ -234,7 +234,7 @@ public:
 
   /* Walk over all declarations in the attribute scope.  */
 
-  void visit (AttribDeclaration *d)
+  void visit (AttribDeclaration *d) final override
   {
 Dsymbols *ds = d->include (NULL);
 
@@ -248,7 +248,7 @@ public:
   /* Pragmas are a way to pass special information to the compiler and to add
  vendor specific extensions to D.  */
 
-  void visit (PragmaDeclaration *d)
+  void visit (PragmaDeclaration *d) final override
   {
 if (d->ident == Identifier::idPool ("lib")
|| d->ident == Identifier::idPool ("startaddress"))
@@ -266,7 +266,7 @@ public:
   /* Conditional compilation is the process of selecting which code to compile
  and which code to not compile.  Look for version conditions that may  */
 
-  void visit (ConditionalDeclaration *d)
+  void visit (ConditionalDeclaration *d) final override
   {
 bool old_condition = this->in_version_unittest_;
 
@@ -284,7 +284,7 @@ public:
 
   /* Walk over all members in the namespace scope.  */
 
-  void visit (Nspace *d)
+  void visit (Nspace *d) final override
   {
 if (isError (d) || !d->members)
   return;
@@ -298,7 +298,7 @@ public:
  voldemort type, then it's members must be compiled before the parent
  function finishes.  */
 
-  void visit (TemplateDeclaration *d)
+  void visit (TemplateDeclaration *d) final override
   {
 /* Type cannot be directly named outside of the scope it's declared in, so
the only way it can be escaped is if the function has auto return.  */
@@ -329,7 +329,7 @@ public:
 
   /* Walk over all members in the instantiated template.  */
 
-  void visit (TemplateInstance *d)
+  void visit (TemplateInstance *d) final override
   {
 if (isError (d)|| !d->members)
   return;
@@ -343,7 +343,7 @@ public:
 
   /* Walk over all members in the mixin template scope.  */
 
-  void visit (TemplateMixin *d)
+  void visit (TemplateMixin *d) final override
   {
 if (isError (d)|| !d->members)
   return;
@@ -355,7 +355,7 @@ public:
   /* Write out compiler generated TypeInfo, initializer and functions for the
  given struct declaration, walking over all static members.  */
 
-  void visit (StructDeclaration *d)
+  void visit (StructDeclaration *d) final override
   {
 if (d->semanticRun >= PASS::obj)
   return;
@@ -470,7 +470,7 @@ public:
   /* Write out compiler generated TypeInfo, initializer and vtables for the
  given class declaration, walking over all static members.  */
 
-  void visit (ClassDeclaration *d)
+  void visit (ClassDeclaration *d) final override
   {
 if (d->semanticRun >= PASS::obj)
   return;
@@ -544,7 +544,7 @@ public:
   /* Write out compiler generated TypeInfo and vtables for the given interface
  declaration, walking over all static members.  */
 
-  void visit (InterfaceDeclaration *d)
+  void visit (InterfaceDeclaration *d) final override
   {
 if (d->semanticRun >= PASS::obj)
   return;
@@ -587,7 +587,7 @@ public:
   /* Write out compiler generated TypeInfo and initializer for the given
  enum declaration.  */
 
-  void visit (EnumDeclaration *d)
+  void visit (EnumDeclaration *d) final override
   {
 if (d->semanticRun >= PASS::obj)
   return;
@@ -626,7 +626,7 @@ public:
   /* Finish up a variable declaration and push it into the current scope.
  This can either be a static, local or manifest constant.  */
 
-  void visit (VarDeclaration *d)
+  void visit (VarDeclaration *d) final override
   {
 if (

[PATCH 09/10] tree-vect-slp-patterns.cc: add 'final' and 'override' to vect_pattern::build impls

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/ChangeLog:
* tree-vect-slp-patterns.cc: Add "final" and "override" to
vect_pattern::build impls as appropriate.

Signed-off-by: David Malcolm 
---
 gcc/tree-vect-slp-patterns.cc | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/tree-vect-slp-patterns.cc b/gcc/tree-vect-slp-patterns.cc
index a6b0d106d5f..e6a6db8beba 100644
--- a/gcc/tree-vect-slp-patterns.cc
+++ b/gcc/tree-vect-slp-patterns.cc
@@ -492,7 +492,7 @@ class complex_pattern : public vect_pattern
 }
 
   public:
-void build (vec_info *);
+void build (vec_info *) override;
 
 static internal_fn
 matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
@@ -595,7 +595,7 @@ class complex_add_pattern : public complex_pattern
 }
 
   public:
-void build (vec_info *);
+void build (vec_info *) final override;
 static internal_fn
 matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
 slp_compat_nodes_map_t *, slp_tree *, vec *);
@@ -977,7 +977,7 @@ class complex_mul_pattern : public complex_pattern
 }
 
   public:
-void build (vec_info *);
+void build (vec_info *) final override;
 static internal_fn
 matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
 slp_compat_nodes_map_t *, slp_tree *, vec *);
@@ -1204,7 +1204,7 @@ class complex_fms_pattern : public complex_pattern
 }
 
   public:
-void build (vec_info *);
+void build (vec_info *) final override;
 static internal_fn
 matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
 slp_compat_nodes_map_t *, slp_tree *, vec *);
@@ -1380,7 +1380,7 @@ class complex_operations_pattern : public complex_pattern
 }
 
   public:
-void build (vec_info *);
+void build (vec_info *) final override;
 static internal_fn
 matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
 slp_compat_nodes_map_t *, slp_tree *, vec *);
@@ -1446,7 +1446,7 @@ class addsub_pattern : public vect_pattern
 addsub_pattern (slp_tree *node, internal_fn ifn)
: vect_pattern (node, NULL, ifn) {};
 
-void build (vec_info *);
+void build (vec_info *) final override;
 
 static vect_pattern*
 recognize (slp_tree_to_load_perm_map_t *, slp_compat_nodes_map_t *,
-- 
2.26.3



[PATCH 08/10] i386: add 'final' and 'override' to scalar_chain vfunc impls

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/ChangeLog:
* config/i386/i386-features.h: Add "final" and "override" to
scalar_chain vfunc implementations as appropriate.

Signed-off-by: David Malcolm 
---
 gcc/config/i386/i386-features.h | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h
index 5c307607ae5..f46a6d95b74 100644
--- a/gcc/config/i386/i386-features.h
+++ b/gcc/config/i386/i386-features.h
@@ -169,18 +169,18 @@ class general_scalar_chain : public scalar_chain
  public:
   general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_);
   ~general_scalar_chain ();
-  int compute_convert_gain ();
+  int compute_convert_gain () final override;
  private:
   hash_map defs_map;
   bitmap insns_conv;
   unsigned n_sse_to_integer;
   unsigned n_integer_to_sse;
-  void mark_dual_mode_def (df_ref def);
-  void convert_insn (rtx_insn *insn);
+  void mark_dual_mode_def (df_ref def) final override;
+  void convert_insn (rtx_insn *insn) final override;
   void convert_op (rtx *op, rtx_insn *insn);
   void convert_reg (rtx_insn *insn, rtx dst, rtx src);
   void make_vector_copies (rtx_insn *, rtx);
-  void convert_registers ();
+  void convert_registers () final override;
   int vector_const_cost (rtx exp);
 };
 
@@ -190,14 +190,14 @@ class timode_scalar_chain : public scalar_chain
   timode_scalar_chain () : scalar_chain (TImode, V1TImode) {}
 
   /* Convert from TImode to V1TImode is always faster.  */
-  int compute_convert_gain () { return 1; }
+  int compute_convert_gain () final override { return 1; }
 
  private:
-  void mark_dual_mode_def (df_ref def);
+  void mark_dual_mode_def (df_ref def) final override;
   void fix_debug_reg_uses (rtx reg);
-  void convert_insn (rtx_insn *insn);
+  void convert_insn (rtx_insn *insn) final override;
   /* We don't convert registers to difference size.  */
-  void convert_registers () {}
+  void convert_registers () final override {}
 };
 
 } // anon namespace
-- 
2.26.3



[PATCH 06/10] ipa: add 'final' and 'override' to call_summary_base vfunc impls

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/ChangeLog:
* ipa-cp.cc: Add "final" and "override" to call_summary_base vfunc
implementations, removing redundant "virtual" as appropriate.
* ipa-fnsummary.h: Likewise.
* ipa-modref.cc: Likewise.
* ipa-param-manipulation.cc: Likewise.
* ipa-profile.cc: Likewise.
* ipa-prop.h: Likewise.
* ipa-pure-const.cc: Likewise.
* ipa-reference.cc: Likewise.
* ipa-sra.cc: Likewise.
* symbol-summary.h: Likewise.
* symtab-thunks.cc: Likewise.

Signed-off-by: David Malcolm 
---
 gcc/ipa-cp.cc |  6 +++---
 gcc/ipa-fnsummary.h   | 21 ++--
 gcc/ipa-modref.cc | 36 +--
 gcc/ipa-param-manipulation.cc |  8 
 gcc/ipa-profile.cc|  6 +++---
 gcc/ipa-prop.h| 26 -
 gcc/ipa-pure-const.cc |  8 
 gcc/ipa-reference.cc  | 10 +-
 gcc/ipa-sra.cc| 14 +++---
 gcc/symbol-summary.h  |  8 
 gcc/symtab-thunks.cc  |  8 
 11 files changed, 76 insertions(+), 75 deletions(-)

diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc
index 38a21b17e39..bb2d611cbcd 100644
--- a/gcc/ipa-cp.cc
+++ b/gcc/ipa-cp.cc
@@ -4190,9 +4190,9 @@ public:
   m_initialize_when_cloning = true;
 }
 
-  virtual void duplicate (cgraph_edge *src_edge, cgraph_edge *dst_edge,
- edge_clone_summary *src_data,
- edge_clone_summary *dst_data);
+  void duplicate (cgraph_edge *src_edge, cgraph_edge *dst_edge,
+ edge_clone_summary *src_data,
+ edge_clone_summary *dst_data) final override;
 };
 
 /* Edge duplication hook.  */
diff --git a/gcc/ipa-fnsummary.h b/gcc/ipa-fnsummary.h
index e1f1d1b839c..941fea6de0d 100644
--- a/gcc/ipa-fnsummary.h
+++ b/gcc/ipa-fnsummary.h
@@ -236,14 +236,15 @@ public:
   /* Remove ipa_fn_summary for all callees of NODE.  */
   void remove_callees (cgraph_node *node);
 
-  virtual void insert (cgraph_node *, ipa_fn_summary *);
-  virtual void remove (cgraph_node *node, ipa_fn_summary *)
+  void insert (cgraph_node *, ipa_fn_summary *) final override;
+  void remove (cgraph_node *node, ipa_fn_summary *) final override
   {
 remove_callees (node);
   }
 
-  virtual void duplicate (cgraph_node *src, cgraph_node *dst,
- ipa_fn_summary *src_data, ipa_fn_summary *dst_data);
+  void duplicate (cgraph_node *src, cgraph_node *dst,
+ ipa_fn_summary *src_data, ipa_fn_summary *dst_data)
+final override;
 };
 
 extern GTY(()) fast_function_summary 
@@ -259,9 +260,9 @@ public:
 disable_insertion_hook ();
   }
 
-  virtual void duplicate (cgraph_node *, cgraph_node *,
- ipa_size_summary *src_data,
- ipa_size_summary *dst_data)
+  void duplicate (cgraph_node *, cgraph_node *,
+ ipa_size_summary *src_data,
+ ipa_size_summary *dst_data) final override
   {
 *dst_data = *src_data;
   }
@@ -311,9 +312,9 @@ public:
 fast_call_summary  (symtab) {}
 
   /* Hook that is called by summary when an edge is duplicated.  */
-  virtual void duplicate (cgraph_edge *src, cgraph_edge *dst,
- ipa_call_summary *src_data,
- ipa_call_summary *dst_data);
+  void duplicate (cgraph_edge *src, cgraph_edge *dst,
+ ipa_call_summary *src_data,
+ ipa_call_summary *dst_data) final override;
 };
 
 /* Estimated execution times, code sizes and other information about the
diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
index 7c1f974bc7a..0d9abacf0a6 100644
--- a/gcc/ipa-modref.cc
+++ b/gcc/ipa-modref.cc
@@ -119,10 +119,10 @@ public:
   fnspec_summaries_t (symbol_table *symtab)
   : call_summary  (symtab) {}
   /* Hook that is called by summary when an edge is duplicated.  */
-  virtual void duplicate (cgraph_edge *,
- cgraph_edge *,
- fnspec_summary *src,
- fnspec_summary *dst)
+  void duplicate (cgraph_edge *,
+ cgraph_edge *,
+ fnspec_summary *src,
+ fnspec_summary *dst) final override
   {
 dst->fnspec = xstrdup (src->fnspec);
   }
@@ -194,10 +194,10 @@ public:
   escape_summaries_t (symbol_table *symtab)
   : call_summary  (symtab) {}
   /* Hook that is called by summary when an edge is duplicated.  */
-  virtual void duplicate (cgraph_edge *,
- cgraph_edge *,
- escape_summary *src,
- escape_summary *dst)
+  void duplicate (cgraph_edge *,
+ cgraph_edge *,
+ escape_summary *src,
+ escape_summary *dst) final override
   {
 dst->esc = src->esc.copy ();
   }
@@ -217,11 +217,11 @@ class GTY((user)) modref_summaries
 public:
   modref_su

[PATCH 07/10] value-relation.h: add 'final' and 'override' to relation_oracle vfunc impls

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/ChangeLog:
* value-relation.h: Add "final" and "override" to relation_oracle
vfunc implementations as appropriate.

Signed-off-by: David Malcolm 
---
 gcc/value-relation.h | 38 +-
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/gcc/value-relation.h b/gcc/value-relation.h
index 19762d8ce2b..478729be0bf 100644
--- a/gcc/value-relation.h
+++ b/gcc/value-relation.h
@@ -130,14 +130,15 @@ public:
   equiv_oracle ();
   ~equiv_oracle ();
 
-  const_bitmap equiv_set (tree ssa, basic_block bb);
+  const_bitmap equiv_set (tree ssa, basic_block bb) final override;
   void register_relation (basic_block bb, relation_kind k, tree ssa1,
- tree ssa2);
+ tree ssa2) override;
 
-  relation_kind query_relation (basic_block, tree, tree);
-  relation_kind query_relation (basic_block, const_bitmap, const_bitmap);
-  void dump (FILE *f, basic_block bb) const;
-  void dump (FILE *f) const;
+  relation_kind query_relation (basic_block, tree, tree) override;
+  relation_kind query_relation (basic_block, const_bitmap, const_bitmap)
+override;
+  void dump (FILE *f, basic_block bb) const override;
+  void dump (FILE *f) const override;
 
 protected:
   bitmap_obstack m_bitmaps;
@@ -185,14 +186,16 @@ public:
   dom_oracle ();
   ~dom_oracle ();
 
-  void register_relation (basic_block bb, relation_kind k, tree op1, tree op2);
+  void register_relation (basic_block bb, relation_kind k, tree op1, tree op2)
+final override;
 
-  relation_kind query_relation (basic_block bb, tree ssa1, tree ssa2);
+  relation_kind query_relation (basic_block bb, tree ssa1, tree ssa2)
+final override;
   relation_kind query_relation (basic_block bb, const_bitmap b1,
-  const_bitmap b2);
+   const_bitmap b2) final override;
 
-  void dump (FILE *f, basic_block bb) const;
-  void dump (FILE *f) const;
+  void dump (FILE *f, basic_block bb) const final override;
+  void dump (FILE *f) const final override;
 private:
   bitmap m_tmp, m_tmp2;
   bitmap m_relation_set;  // Index by ssa-name. True if a relation exists
@@ -229,15 +232,16 @@ class path_oracle : public relation_oracle
 public:
   path_oracle (relation_oracle *oracle = NULL);
   ~path_oracle ();
-  const_bitmap equiv_set (tree, basic_block);
-  void register_relation (basic_block, relation_kind, tree, tree);
+  const_bitmap equiv_set (tree, basic_block) final override;
+  void register_relation (basic_block, relation_kind, tree, tree) final 
override;
   void killing_def (tree);
-  relation_kind query_relation (basic_block, tree, tree);
-  relation_kind query_relation (basic_block, const_bitmap, const_bitmap);
+  relation_kind query_relation (basic_block, tree, tree) final override;
+  relation_kind query_relation (basic_block, const_bitmap, const_bitmap)
+final override;
   void reset_path ();
   void set_root_oracle (relation_oracle *oracle) { m_root = oracle; }
-  void dump (FILE *, basic_block) const;
-  void dump (FILE *) const;
+  void dump (FILE *, basic_block) const final override;
+  void dump (FILE *) const final override;
 private:
   void register_equiv (basic_block bb, tree ssa1, tree ssa2);
   equiv_chain m_equiv;
-- 
2.26.3



[PATCH 00/10] Add 'final' and 'override' where missing

2022-05-23 Thread David Malcolm via Gcc-patches
With C++11 we can add "final" and "override" to the decls of vfuncs
in derived classes, which documents to both human and automated readers
of the code that a decl is intended to override a vfunc in a base class,
and can help catch mistakes where we intended to override a vfunc, but
messed up the prototypes.

The following patch kit adds "final" and "override" specifiers to the
decls of vfunc implementations throughout the source tree.

I added "final override" everywhere where this was possible, or just
"override" for the places where the overridden vfunc gets further
overridden.

I also removed "virtual" from such decls, since this isn't required
when overriding an existing vfunc, and the "final override" better
implies the intent of the code.

I temporarily hacked -Werror=suggest-override into the Makefile whilst
I was creating the patches, but I skipped the following:

(a) gcc/d/dmd/ ...since these sources are copied from an upstream
(b) gcc/go/gofrontend/ ...likewise
(c) gcc/range-op.cc: as I believe this code is under heavy development
(d) target-specific passes other than i386 (for ease of testing); I can
do these in a followup, if desired.

I didn't attempt to add -Wsuggest-override into our compile flags
"properly".

No functional changes intended.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.

I split them up into separate patches by topic for ease of review, and
for ease of writing the ChangeLog entries.

Worth an update to https://gcc.gnu.org/codingconventions.html ?

OK for trunk?
Dave

David Malcolm (10):
  Add 'final' and 'override' to opt_pass vfunc impls
  Add 'final' and 'override' on dom_walker vfunc impls
  expr.cc: use final/override on op_by_pieces_d vfuncs
  tree-switch-conversion.h: use final/override for cluster vfunc impls
  d: add 'final' and 'override' to gcc/d/*.cc 'visit' impls
  ipa: add 'final' and 'override' to call_summary_base vfunc impls
  value-relation.h: add 'final' and 'override' to relation_oracle vfunc
impls
  i386: add 'final' and 'override' to scalar_chain vfunc impls
  tree-vect-slp-patterns.cc: add 'final' and 'override' to
vect_pattern::build impls
  Add 'final' and 'override' in various places

 gcc/adjust-alignment.cc  |  2 +-
 gcc/asan.cc  | 19 ++---
 gcc/auto-inc-dec.cc  |  4 +-
 gcc/auto-profile.cc  |  8 ++--
 gcc/bb-reorder.cc| 12 +++---
 gcc/cfgcleanup.cc|  8 ++--
 gcc/cfgexpand.cc |  2 +-
 gcc/cfgrtl.cc|  6 +--
 gcc/cgraphbuild.cc   | 13 +++---
 gcc/combine-stack-adj.cc |  4 +-
 gcc/combine.cc   |  4 +-
 gcc/compare-elim.cc  |  6 +--
 gcc/config/i386/i386-features.cc | 20 -
 gcc/config/i386/i386-features.h  | 16 +++
 gcc/coroutine-passes.cc  |  8 ++--
 gcc/cp/cxx-pretty-print.h| 38 -
 gcc/cp/module.cc |  4 +-
 gcc/cprop.cc |  9 ++--
 gcc/cse.cc   | 18 +---
 gcc/d/decl.cc| 36 
 gcc/d/expr.cc|  2 +-
 gcc/d/toir.cc| 64 ++--
 gcc/d/typeinfo.cc| 34 +++
 gcc/d/types.cc   | 30 ++---
 gcc/dce.cc   |  8 ++--
 gcc/df-core.cc   | 10 ++---
 gcc/dse.cc   | 14 --
 gcc/dwarf2cfi.cc |  7 ++-
 gcc/early-remat.cc   |  4 +-
 gcc/except.cc|  6 +--
 gcc/expr.cc  | 14 +++---
 gcc/final.cc | 14 --
 gcc/function.cc  | 10 ++---
 gcc/fwprop.cc|  8 ++--
 gcc/gcse.cc  | 14 --
 gcc/genmatch.cc  | 22 +-
 gcc/gensupport.cc|  2 +-
 gcc/gimple-harden-conditionals.cc| 20 ++---
 gcc/gimple-if-to-switch.cc   |  4 +-
 gcc/gimple-isel.cc   |  4 +-
 gcc/gimple-laddress.cc   |  6 +--
 gcc/gimple-loop-interchange.cc   |  6 +--
 gcc/gimple-loop-jam.cc   |  4 +-
 gcc/gimple-loop-versioning.cc|  7 ++-
 gcc/gimple-low.cc|  5 ++-
 gcc/gimple-range-cache.h |  4 +-
 gcc/gimple-ssa-backprop.cc   |  6 +--
 gcc/gimple-ssa-evrp.cc   |  6 +--
 gcc/gimple-ssa-isolate-paths.cc  |  9 ++--
 gcc/gimple-ssa-nonnull-compare.cc|  4 +-
 gcc/gimple-ssa-split-paths.cc|  9 ++--
 gcc/gimple-ssa-store-merging.cc  | 10 ++---
 gcc/gimple-ssa-strength-reduction.cc |  6 +--
 gcc/gimple-ssa-warn-access.cc|  8 ++--
 gcc/gimple-ssa-warn-alloca.cc|  8 ++--
 gcc/gimple-ssa-warn-restrict.cc  |  4 +-
 gcc/gimple-warn-recursion.cc 

[PATCH 02/10] Add 'final' and 'override' on dom_walker vfunc impls

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/ChangeLog:
* compare-elim.cc: Add "final" and "override" to dom_walker vfunc
implementations, removing redundant "virtual" as appropriate.
* gimple-ssa-strength-reduction.cc: Likewise.
* ipa-prop.cc: Likewise.
* rtl-ssa/blocks.cc: Likewise.
* tree-into-ssa.cc: Likewise.
* tree-ssa-dom.cc: Likewise.
* tree-ssa-math-opts.cc: Likewise.
* tree-ssa-phiopt.cc: Likewise.
* tree-ssa-propagate.cc: Likewise.
* tree-ssa-sccvn.cc: Likewise.
* tree-ssa-strlen.cc: Likewise.
* tree-ssa-uncprop.cc: Likewise.

Signed-off-by: David Malcolm 
---
 gcc/compare-elim.cc  |  2 +-
 gcc/gimple-ssa-strength-reduction.cc |  2 +-
 gcc/ipa-prop.cc  |  4 ++--
 gcc/rtl-ssa/blocks.cc|  4 ++--
 gcc/tree-into-ssa.cc | 10 +-
 gcc/tree-ssa-dom.cc  |  4 ++--
 gcc/tree-ssa-math-opts.cc|  2 +-
 gcc/tree-ssa-phiopt.cc   |  4 ++--
 gcc/tree-ssa-propagate.cc|  4 ++--
 gcc/tree-ssa-sccvn.cc|  4 ++--
 gcc/tree-ssa-strlen.cc   |  4 ++--
 gcc/tree-ssa-uncprop.cc  |  4 ++--
 12 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/gcc/compare-elim.cc b/gcc/compare-elim.cc
index e869d9d3249..4a23202f8ff 100644
--- a/gcc/compare-elim.cc
+++ b/gcc/compare-elim.cc
@@ -283,7 +283,7 @@ public:
   find_comparison_dom_walker (cdi_direction direction)
 : dom_walker (direction) {}
 
-  virtual edge before_dom_children (basic_block);
+  edge before_dom_children (basic_block) final override;
 };
 
 /* Return true if conforming COMPARE with EH_NOTE is redundant with comparison
diff --git a/gcc/gimple-ssa-strength-reduction.cc 
b/gcc/gimple-ssa-strength-reduction.cc
index 2b559e96fc8..fb2bb9f4e74 100644
--- a/gcc/gimple-ssa-strength-reduction.cc
+++ b/gcc/gimple-ssa-strength-reduction.cc
@@ -1729,7 +1729,7 @@ class find_candidates_dom_walker : public dom_walker
 public:
   find_candidates_dom_walker (cdi_direction direction)
 : dom_walker (direction) {}
-  virtual edge before_dom_children (basic_block);
+  edge before_dom_children (basic_block) final override;
 };
 
 /* Find strength-reduction candidates in block BB.  */
diff --git a/gcc/ipa-prop.cc b/gcc/ipa-prop.cc
index c6c745f84a0..03f0ba2ec75 100644
--- a/gcc/ipa-prop.cc
+++ b/gcc/ipa-prop.cc
@@ -3004,7 +3004,7 @@ public:
   analysis_dom_walker (struct ipa_func_body_info *fbi)
 : dom_walker (CDI_DOMINATORS), m_fbi (fbi) {}
 
-  virtual edge before_dom_children (basic_block);
+  edge before_dom_children (basic_block) final override;
 
 private:
   struct ipa_func_body_info *m_fbi;
@@ -5653,7 +5653,7 @@ public:
 : dom_walker (CDI_DOMINATORS), m_fbi (fbi), m_descriptors (descs),
   m_aggval (av), m_something_changed (sc) {}
 
-  virtual edge before_dom_children (basic_block);
+  edge before_dom_children (basic_block) final override;
   bool cleanup_eh ()
 { return gimple_purge_all_dead_eh_edges (m_need_eh_cleanup); }
 
diff --git a/gcc/rtl-ssa/blocks.cc b/gcc/rtl-ssa/blocks.cc
index 959fad8f829..6b03dd03747 100644
--- a/gcc/rtl-ssa/blocks.cc
+++ b/gcc/rtl-ssa/blocks.cc
@@ -85,8 +85,8 @@ class function_info::bb_walker : public dom_walker
 {
 public:
   bb_walker (function_info *, build_info &);
-  virtual edge before_dom_children (basic_block);
-  virtual void after_dom_children (basic_block);
+  edge before_dom_children (basic_block) final override;
+  void after_dom_children (basic_block) final override;
 
 private:
   // Information about the function we're building.
diff --git a/gcc/tree-into-ssa.cc b/gcc/tree-into-ssa.cc
index 46df57ae0e1..9631d8c6556 100644
--- a/gcc/tree-into-ssa.cc
+++ b/gcc/tree-into-ssa.cc
@@ -1462,8 +1462,8 @@ public:
   rewrite_dom_walker (cdi_direction direction)
 : dom_walker (direction, ALL_BLOCKS, NULL) {}
 
-  virtual edge before_dom_children (basic_block);
-  virtual void after_dom_children (basic_block);
+  edge before_dom_children (basic_block) final override;
+  void after_dom_children (basic_block) final override;
 };
 
 /* SSA Rewriting Step 1.  Initialization, create a block local stack
@@ -2148,8 +2148,8 @@ public:
   rewrite_update_dom_walker (cdi_direction direction)
 : dom_walker (direction, ALL_BLOCKS, NULL) {}
 
-  virtual edge before_dom_children (basic_block);
-  virtual void after_dom_children (basic_block);
+  edge before_dom_children (basic_block) final override;
+  void after_dom_children (basic_block) final override;
 };
 
 /* Initialization of block data structures for the incremental SSA
@@ -2300,7 +2300,7 @@ public:
   mark_def_dom_walker (cdi_direction direction);
   ~mark_def_dom_walker ();
 
-  virtual edge before_dom_children (basic_block);
+  edge before_dom_children (basic_block) final override;
 
 private:
   /* Notice that this bitmap is indexed using variable UIDs, so it must be
diff --git a/gcc/tree-ssa-dom.cc b/gcc/tree-ssa-dom.cc
index 9

[PATCH 04/10] tree-switch-conversion.h: use final/override for cluster vfunc impls

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/ChangeLog:
* tree-switch-conversion.h: Add "final" and "override" to cluster
vfunc implementations as appropriate.

Signed-off-by: David Malcolm 
---
 gcc/tree-switch-conversion.h | 32 +---
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/gcc/tree-switch-conversion.h b/gcc/tree-switch-conversion.h
index 2b677d9f7e9..d22515eb296 100644
--- a/gcc/tree-switch-conversion.h
+++ b/gcc/tree-switch-conversion.h
@@ -130,19 +130,19 @@ public:
   {}
 
   cluster_type
-  get_type ()
+  get_type () final override
   {
 return SIMPLE_CASE;
   }
 
   tree
-  get_low ()
+  get_low () final override
   {
 return m_low;
   }
 
   tree
-  get_high ()
+  get_high () final override
   {
 return m_high;
   }
@@ -153,13 +153,13 @@ public:
   }
 
   void
-  debug ()
+  debug () final override
   {
 dump (stderr);
   }
 
   void
-  dump (FILE *f, bool details ATTRIBUTE_UNUSED = false)
+  dump (FILE *f, bool details ATTRIBUTE_UNUSED = false) final override
   {
 PRINT_CASE (f, get_low ());
 if (get_low () != get_high ())
@@ -170,12 +170,12 @@ public:
 fprintf (f, " ");
   }
 
-  void emit (tree, tree, tree, basic_block, location_t)
+  void emit (tree, tree, tree, basic_block, location_t) final override
   {
 gcc_unreachable ();
   }
 
-  bool is_single_value_p ()
+  bool is_single_value_p () final override
   {
 return tree_int_cst_equal (get_low (), get_high ());
   }
@@ -224,24 +224,24 @@ public:
   ~group_cluster ();
 
   tree
-  get_low ()
+  get_low () final override
   {
 return m_cases[0]->get_low ();
   }
 
   tree
-  get_high ()
+  get_high () final override
   {
 return m_cases[m_cases.length () - 1]->get_high ();
   }
 
   void
-  debug ()
+  debug () final override
   {
 dump (stderr);
   }
 
-  void dump (FILE *f, bool details = false);
+  void dump (FILE *f, bool details = false) final override;
 
   /* List of simple clusters handled by the group.  */
   vec m_cases;
@@ -261,13 +261,14 @@ public:
   {}
 
   cluster_type
-  get_type ()
+  get_type () final override
   {
 return JUMP_TABLE;
   }
 
   void emit (tree index_expr, tree index_type,
-tree default_label_expr, basic_block default_bb, location_t loc);
+tree default_label_expr, basic_block default_bb, location_t loc)
+final override;
 
   /* Find jump tables of given CLUSTERS, where all members of the vector
  are of type simple_cluster.  New clusters are returned.  */
@@ -366,7 +367,7 @@ public:
   {}
 
   cluster_type
-  get_type ()
+  get_type () final override
   {
 return BIT_TEST;
   }
@@ -388,7 +389,8 @@ public:
 There *MUST* be max_case_bit_tests or less unique case
 node targets.  */
   void emit (tree index_expr, tree index_type,
-tree default_label_expr, basic_block default_bb, location_t loc);
+tree default_label_expr, basic_block default_bb, location_t loc)
+ final override;
 
   /* Find bit tests of given CLUSTERS, where all members of the vector
  are of type simple_cluster.  New clusters are returned.  */
-- 
2.26.3



[PATCH 03/10] expr.cc: use final/override on op_by_pieces_d vfuncs

2022-05-23 Thread David Malcolm via Gcc-patches
gcc/ChangeLog:
* expr.cc: Add "final" and "override" to op_by_pieces_d vfunc
implementations as appropriate.

Signed-off-by: David Malcolm 
---
 gcc/expr.cc | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gcc/expr.cc b/gcc/expr.cc
index 7197996cec7..ce58728862a 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -1357,8 +1357,8 @@ op_by_pieces_d::run ()
 class move_by_pieces_d : public op_by_pieces_d
 {
   insn_gen_fn m_gen_fun;
-  void generate (rtx, rtx, machine_mode);
-  bool prepare_mode (machine_mode, unsigned int);
+  void generate (rtx, rtx, machine_mode) final override;
+  bool prepare_mode (machine_mode, unsigned int) final override;
 
  public:
   move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
@@ -1453,8 +1453,8 @@ move_by_pieces (rtx to, rtx from, unsigned HOST_WIDE_INT 
len,
 class store_by_pieces_d : public op_by_pieces_d
 {
   insn_gen_fn m_gen_fun;
-  void generate (rtx, rtx, machine_mode);
-  bool prepare_mode (machine_mode, unsigned int);
+  void generate (rtx, rtx, machine_mode) final override;
+  bool prepare_mode (machine_mode, unsigned int) final override;
 
  public:
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
@@ -1650,9 +1650,9 @@ class compare_by_pieces_d : public op_by_pieces_d
   rtx m_accumulator;
   int m_count, m_batch;
 
-  void generate (rtx, rtx, machine_mode);
-  bool prepare_mode (machine_mode, unsigned int);
-  void finish_mode (machine_mode);
+  void generate (rtx, rtx, machine_mode) final override;
+  bool prepare_mode (machine_mode, unsigned int) final override;
+  void finish_mode (machine_mode) final override;
  public:
   compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
   void *op1_cfn_data, HOST_WIDE_INT len, int align,
-- 
2.26.3



[committed] jit: use 'final' and 'override' where appropriate

2022-05-23 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-715-g58c9c7407a1a99.

gcc/jit/ChangeLog:
* jit-recording.h: Add "final" and "override" to all vfunc
implementations that were missing them, as appropriate.

Signed-off-by: David Malcolm 
---
 gcc/jit/jit-recording.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/jit/jit-recording.h b/gcc/jit/jit-recording.h
index 2dd8a957ca7..0dfb42f2676 100644
--- a/gcc/jit/jit-recording.h
+++ b/gcc/jit/jit-recording.h
@@ -721,14 +721,14 @@ public:
   /* Strip off the "const", giving the underlying type.  */
   type *unqualified () final override { return m_other_type; }
 
-  virtual bool is_same_type_as (type *other)
+  bool is_same_type_as (type *other) final override
   {
 if (!other->is_const ())
   return false;
 return m_other_type->is_same_type_as (other->is_const ());
   }
 
-  virtual type *is_const () { return m_other_type; }
+  type *is_const () final override { return m_other_type; }
 
   void replay_into (replayer *) final override;
 
@@ -744,7 +744,7 @@ public:
   memento_of_get_volatile (type *other_type)
   : decorated_type (other_type) {}
 
-  virtual bool is_same_type_as (type *other)
+  bool is_same_type_as (type *other) final override
   {
 if (!other->is_volatile ())
   return false;
@@ -754,7 +754,7 @@ public:
   /* Strip off the "volatile", giving the underlying type.  */
   type *unqualified () final override { return m_other_type; }
 
-  virtual type *is_volatile () { return m_other_type; }
+  type *is_volatile () final override { return m_other_type; }
 
   void replay_into (replayer *) final override;
 
@@ -1051,7 +1051,7 @@ public:
 
   void replay_into (replayer *r) final override;
 
-  virtual bool is_union () const final override { return true; }
+  bool is_union () const final override { return true; }
 
 private:
   string * make_debug_string () final override;
-- 
2.26.3



[committed] analyzer: use 'final' and 'override' where appropriate

2022-05-23 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-714-g2ac1459f044ee5.

gcc/analyzer/ChangeLog:
* call-info.cc: Add "final" and "override" to all vfunc
implementations that were missing them, as appropriate.
* engine.cc: Likewise.
* region-model.cc: Likewise.
* sm-malloc.cc: Likewise.
* supergraph.h: Likewise.
* svalue.cc: Likewise.
* varargs.cc: Likewise.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/call-info.cc|  2 +-
 gcc/analyzer/engine.cc   | 10 +-
 gcc/analyzer/region-model.cc |  2 +-
 gcc/analyzer/sm-malloc.cc|  6 --
 gcc/analyzer/supergraph.h|  3 ++-
 gcc/analyzer/svalue.cc   |  4 ++--
 gcc/analyzer/varargs.cc  |  2 +-
 7 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/gcc/analyzer/call-info.cc b/gcc/analyzer/call-info.cc
index 2d3fe0a790b..b3ff51e7460 100644
--- a/gcc/analyzer/call-info.cc
+++ b/gcc/analyzer/call-info.cc
@@ -96,7 +96,7 @@ call_info::add_events_to_path (checker_path *emission_path,
m_call_info (call_info)
 {}
 
-label_text get_desc (bool can_colorize) const
+label_text get_desc (bool can_colorize) const final override
 {
   return m_call_info->get_desc (can_colorize);
 }
diff --git a/gcc/analyzer/engine.cc b/gcc/analyzer/engine.cc
index c59374be81a..5ccfedf9a44 100644
--- a/gcc/analyzer/engine.cc
+++ b/gcc/analyzer/engine.cc
@@ -328,7 +328,7 @@ public:
   }
 
   state_machine::state_t get_state (const gimple *stmt ATTRIBUTE_UNUSED,
-   tree var)
+   tree var) final override
   {
 logger * const logger = get_logger ();
 LOG_FUNC (logger);
@@ -342,7 +342,7 @@ public:
 return current;
   }
   state_machine::state_t get_state (const gimple *stmt ATTRIBUTE_UNUSED,
-   const svalue *sval)
+   const svalue *sval) final override
   {
 logger * const logger = get_logger ();
 LOG_FUNC (logger);
@@ -355,7 +355,7 @@ public:
   void set_next_state (const gimple *stmt,
   tree var,
   state_machine::state_t to,
-  tree origin)
+  tree origin) final override
   {
 logger * const logger = get_logger ();
 LOG_FUNC (logger);
@@ -384,7 +384,7 @@ public:
   void set_next_state (const gimple *stmt,
   const svalue *sval,
   state_machine::state_t to,
-  tree origin)
+  tree origin) final override
   {
 logger * const logger = get_logger ();
 LOG_FUNC (logger);
@@ -1597,7 +1597,7 @@ public:
 return false;
   }
 
-  label_text describe_final_event (const evdesc::final_event &ev)
+  label_text describe_final_event (const evdesc::final_event &ev) final 
override
   {
 if (m_stack_pop_event)
   return ev.formatted_print
diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index 6f6a061cf75..6b49719d521 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -1640,7 +1640,7 @@ public:
 
   const char *get_kind () const final override { return "reason_attr_access"; }
 
-  void emit () const
+  void emit () const final override
   {
 inform (DECL_SOURCE_LOCATION (m_callee_fndecl),
"parameter %i of %qD marked with attribute %qs",
diff --git a/gcc/analyzer/sm-malloc.cc b/gcc/analyzer/sm-malloc.cc
index abdce6b0e7e..3c0f8902075 100644
--- a/gcc/analyzer/sm-malloc.cc
+++ b/gcc/analyzer/sm-malloc.cc
@@ -1015,7 +1015,8 @@ public:
 
   const char *get_kind () const final override { return "possible_null_arg"; }
 
-  bool subclass_equal_p (const pending_diagnostic &base_other) const
+  bool subclass_equal_p (const pending_diagnostic &base_other)
+const final override
   {
 const possible_null_arg &sub_other
   = (const possible_null_arg &)base_other;
@@ -1119,7 +1120,8 @@ public:
 
   const char *get_kind () const final override { return "null_arg"; }
 
-  bool subclass_equal_p (const pending_diagnostic &base_other) const
+  bool subclass_equal_p (const pending_diagnostic &base_other)
+const final override
   {
 const null_arg &sub_other
   = (const null_arg &)base_other;
diff --git a/gcc/analyzer/supergraph.h b/gcc/analyzer/supergraph.h
index cc53fcb7f24..42c6df57435 100644
--- a/gcc/analyzer/supergraph.h
+++ b/gcc/analyzer/supergraph.h
@@ -308,7 +308,8 @@ class superedge : public dedge
 
   void dump (pretty_printer *pp) const;
   void dump () const;
-  void dump_dot (graphviz_out *gv, const dump_args_t &args) const;
+  void dump_dot (graphviz_out *gv, const dump_args_t &args)
+const final override;
 
   virtual void dump_label_to_pp (pretty_printer *pp,
 bool user_facing) const = 0;
diff --git a/gcc/analyzer/svalue.cc b/gcc/analyzer/svalue.cc
index ed289c6fc31..2f9149412b9 100644
--- 

[PATCH v2] DSE: Use the constant store source if possible

2022-05-23 Thread H.J. Lu via Gcc-patches
On Mon, May 23, 2022 at 12:38:06PM +0200, Richard Biener wrote:
> On Sat, May 21, 2022 at 5:02 AM H.J. Lu via Gcc-patches
>  wrote:
> >
> > When recording store for RTL dead store elimination, check if the source
> > register is set only once to a constant.  If yes, record the constant
> > as the store source.  It eliminates unrolled zero stores after memset 0
> > in a loop where a vector register is used as the zero store source.
> >
> > gcc/
> >
> > PR rtl-optimization/105638
> > * dse.cc (record_store): Use the constant source if the source
> > register is set only once.
> >
> > gcc/testsuite/
> >
> > PR rtl-optimization/105638
> > * g++.target/i386/pr105638.C: New test.
> > ---
> >  gcc/dse.cc   | 19 ++
> >  gcc/testsuite/g++.target/i386/pr105638.C | 44 
> >  2 files changed, 63 insertions(+)
> >  create mode 100644 gcc/testsuite/g++.target/i386/pr105638.C
> >
> > diff --git a/gcc/dse.cc b/gcc/dse.cc
> > index 30c11cee034..0433dd3d846 100644
> > --- a/gcc/dse.cc
> > +++ b/gcc/dse.cc
> > @@ -1508,6 +1508,25 @@ record_store (rtx body, bb_info_t bb_info)
> >
> >   if (tem && CONSTANT_P (tem))
> > const_rhs = tem;
> > + else
> > +   {
> > + /* If RHS is set only once to a constant, set CONST_RHS
> > +to the constant.  */
> > + df_ref def = DF_REG_DEF_CHAIN (REGNO (rhs));
> > + if (def != nullptr
> > + && !DF_REF_IS_ARTIFICIAL (def)
> > + && !DF_REF_NEXT_REG (def))
> > +   {
> > + rtx_insn *def_insn = DF_REF_INSN (def);
> > + rtx def_body = PATTERN (def_insn);
> > + if (GET_CODE (def_body) == SET)
> > +   {
> > + rtx def_src = SET_SRC (def_body);
> > + if (CONSTANT_P (def_src))
> > +   const_rhs = def_src;
> 
> doesn't DSE have its own tracking of stored values?  Shouldn't we

It tracks stored values only within the basic block.  When RTL loop
invariant motion hoists a constant initialization out of the loop into
a separate basic block, the constant store value becomes unknown
within the original basic block.

> improve _that_ if it is not enough?  I also wonder if you need to

My patch extends DSE stored value tracking to include the constant which
is set only once in another basic block.

> verify the SET isn't partial?
> 

Here is the v2 patch to check that the constant is set by a non-partial
unconditional load.

OK for master?

Thanks.

H.J.
---
RTL DSE tracks redundant constant stores within a basic block.  When RTL
loop invariant motion hoists a constant initialization out of the loop
into a separate basic block, the constant store value becomes unknown
within the original basic block.  When recording store for RTL DSE, check
if the source register is set only once to a constant by a non-partial
unconditional load.  If yes, record the constant as the constant store
source.  It eliminates unrolled zero stores after memset 0 in a loop
where a vector register is used as the zero store source.

gcc/

PR rtl-optimization/105638
* dse.cc (record_store): Use the constant source if the source
register is set only once.

gcc/testsuite/

PR rtl-optimization/105638
* g++.target/i386/pr105638.C: New test.
---
 gcc/dse.cc   | 22 
 gcc/testsuite/g++.target/i386/pr105638.C | 44 
 2 files changed, 66 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr105638.C

diff --git a/gcc/dse.cc b/gcc/dse.cc
index 30c11cee034..af8e88dac32 100644
--- a/gcc/dse.cc
+++ b/gcc/dse.cc
@@ -1508,6 +1508,28 @@ record_store (rtx body, bb_info_t bb_info)
 
  if (tem && CONSTANT_P (tem))
const_rhs = tem;
+ else
+   {
+ /* If RHS is set only once to a constant, set CONST_RHS
+to the constant.  */
+ df_ref def = DF_REG_DEF_CHAIN (REGNO (rhs));
+ if (def != nullptr
+ && !DF_REF_IS_ARTIFICIAL (def)
+ && !(DF_REF_FLAGS (def)
+  & (DF_REF_PARTIAL | DF_REF_CONDITIONAL))
+ && !DF_REF_NEXT_REG (def))
+   {
+ rtx_insn *def_insn = DF_REF_INSN (def);
+ rtx def_body = PATTERN (def_insn);
+ if (GET_CODE (def_body) == SET)
+   {
+ rtx def_src = SET_SRC (def_body);
+ if (CONSTANT_P (def_src)
+ && GET_MODE (def_src) == GET_MODE (rhs))
+   const_rhs = def_src;
+   }
+   }
+   }
}
 }
 
diff --git a/gcc/testsuite/g++.target/i386/pr105638.C 
b/gcc/testsuite/g++.target/i386/pr105638.C
new file mode 100644
index 000..ff40a459de1
--

[r13-707 Regression] FAIL: gcc.target/i386/pr45685.c scan-assembler-times cmov 6 on Linux/x86_64

2022-05-23 Thread skpandey--- via Gcc-patches
On Linux/x86_64,

68e0063397ba820e71adc220b2da0581dce29ffa is the first bad commit
commit 68e0063397ba820e71adc220b2da0581dce29ffa
Author: Richard Biener 
Date:   Mon Apr 11 13:36:53 2022 +0200

Force the selection operand of a GIMPLE COND_EXPR to be a register

caused

FAIL: gcc.target/i386/pr45685.c scan-assembler-times cmov 6

with GCC configured with

../../gcc/configure 
--prefix=/local/skpandey/gccwork/toolwork/gcc-bisect-master/master/r13-707/usr 
--enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/pr45685.c --target_board='unix{-m64\ 
-march=cascadelake}'"

(Please do not reply to this email, for question about this report, contact me 
at skpgkp2 at gmail dot com)


[PATCH] [PR/target 105666] RISC-V: Inhibit FP <--> int register moves via tune param

2022-05-23 Thread Vineet Gupta
Under extreme register pressure, compiler can use FP <--> int
moves as a cheap alternate to spilling to memory.
This was seen with SPEC2017 FP benchmark 507.cactu:
ML_BSSN_Advect.cc:ML_BSSN_Advect_Body()

|   fmv.d.x fa5,s9  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
| .LVL325:
|   ld  s9,184(sp)  # _12469, %sfp
| ...
| .LVL339:
|   fmv.x.d s4,fa5  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
|

The FMV instructions could be costlier (than stack spill) on certain
micro-architectures, thus this needs to be a per-cpu tunable
(default being to inhibit on all existing RV cpus).

Testsuite run with new test reports 10 failures without the fix
corresponding to the build variations of pr105666.c

|   === gcc Summary ===
|
| # of expected passes  123318   (+10)
| # of unexpected failures  34   (-10)
| # of unexpected successes 4
| # of expected failures780
| # of unresolved testcases 4
| # of unsupported tests2796

gcc/Changelog:

* config/riscv/riscv.cc: (struct riscv_tune_param): Add
  fmv_cost.
(rocket_tune_info): Add default fmv_cost 8.
(sifive_7_tune_info): Ditto.
(thead_c906_tune_info): Ditto.
(optimize_size_tune_info): Ditto.
(riscv_register_move_cost): Use fmv_cost for int<->fp moves.

gcc/testsuite/Changelog:

* gcc.target/riscv/pr105666.c: New test.

Signed-off-by: Vineet Gupta 
---
 gcc/config/riscv/riscv.cc |  9 
 gcc/testsuite/gcc.target/riscv/pr105666.c | 55 +++
 2 files changed, 64 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr105666.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index ee756aab6940..f3ac0d8865f0 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -220,6 +220,7 @@ struct riscv_tune_param
   unsigned short issue_rate;
   unsigned short branch_cost;
   unsigned short memory_cost;
+  unsigned short fmv_cost;
   bool slow_unaligned_access;
 };
 
@@ -285,6 +286,7 @@ static const struct riscv_tune_param rocket_tune_info = {
   1,   /* issue_rate */
   3,   /* branch_cost */
   5,   /* memory_cost */
+  8,   /* fmv_cost */
   true,/* 
slow_unaligned_access */
 };
 
@@ -298,6 +300,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
   2,   /* issue_rate */
   4,   /* branch_cost */
   3,   /* memory_cost */
+  8,   /* fmv_cost */
   true,/* 
slow_unaligned_access */
 };
 
@@ -311,6 +314,7 @@ static const struct riscv_tune_param thead_c906_tune_info = 
{
   1,/* issue_rate */
   3,/* branch_cost */
   5,/* memory_cost */
+  8,   /* fmv_cost */
   false,/* slow_unaligned_access */
 };
 
@@ -324,6 +328,7 @@ static const struct riscv_tune_param 
optimize_size_tune_info = {
   1,   /* issue_rate */
   1,   /* branch_cost */
   2,   /* memory_cost */
+  8,   /* fmv_cost */
   false,   /* slow_unaligned_access */
 };
 
@@ -4737,6 +4742,10 @@ static int
 riscv_register_move_cost (machine_mode mode,
  reg_class_t from, reg_class_t to)
 {
+  if ((from == FP_REGS && to == GR_REGS) ||
+  (from == GR_REGS && to == FP_REGS))
+return tune_param->fmv_cost;
+
   return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2;
 }
 
diff --git a/gcc/testsuite/gcc.target/riscv/pr105666.c 
b/gcc/testsuite/gcc.target/riscv/pr105666.c
new file mode 100644
index ..904f3bc0763f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr105666.c
@@ -0,0 +1,55 @@
+/* Shamelessly plugged off gcc/testsuite/gcc.c-torture/execute/pr28982a.c.  
+
+   The idea is to induce high register pressure for both int/fp registers
+   so that they spill. By default FMV instructions would be used to stash
+   int reg to a fp reg (and vice-versa) but that could be costlier than
+   spilling to stack.  */
+
+/* { dg-do compile } */
+/* { dg-options "-march=rv64g -ffast-math" } */
+
+#define NITER 4
+#define NVARS 20
+#define MULTI(X) \
+  X( 0), X( 1), X( 2), X( 3), X( 4), X( 5), X( 6), X( 7), X( 8), X( 9), \
+  X(10), X(11), X(12), X(13), X(14), X(15), X(16), X(17), X(18), X(19)
+
+#define DECLAREI(INDEX) inc##INDEX = incs[INDEX]
+#define DECLAREF(INDEX) *ptr##INDEX = ptrs[INDEX], result##INDEX = 5
+#define LOOP(INDEX) result##INDEX += result##INDEX * (*ptr##INDEX), ptr##INDEX 
+= inc##IND

[PATCH] x86: Avoid uninitialized variable in PR target/104441 test

2022-05-23 Thread H.J. Lu via Gcc-patches
PR target/104441
* gcc.target/i386/pr104441-1a.c (load8bit_4x4_avx2): Initialize
src23.
---
 gcc/testsuite/gcc.target/i386/pr104441-1a.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr104441-1a.c 
b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
index 83734f710bd..0931029f2bb 100644
--- a/gcc/testsuite/gcc.target/i386/pr104441-1a.c
+++ b/gcc/testsuite/gcc.target/i386/pr104441-1a.c
@@ -8,7 +8,7 @@ __attribute__((always_inline, target("avx2")))
 static __m256i
 load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
 {
-  __m128i src01, src23;
+  __m128i src01, src23 = _mm_setzero_si128();
   src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
   src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
   return _mm256_setr_m128i(src01, src23);
-- 
2.36.1



[PATCH v3] x86: Document -mcet-switch

2022-05-23 Thread H.J. Lu via Gcc-patches
When -fcf-protection=branch is used, the compiler will generate jump
tables for switch statements where the indirect jump is prefixed with
the NOTRACK prefix, so it can jump to non-ENDBR targets.  Since the
indirect jump targets are generated by the compiler and stored in
read-only memory, this does not result in a direct loss of hardening.
But if the jump table index is attacker-controlled, the indirect jump
may not be constrained by CET.

Document -mcet-switch to generate jump tables for switch statements with
ENDBR and skip the NOTRACK prefix for indirect jump.  This option should
be used when the NOTRACK prefix is disabled.

PR target/104816
* config/i386/i386.opt: Remove Undocumented.
* doc/invoke.texi: Document -mcet-switch.
---
 gcc/config/i386/i386.opt |  2 +-
 gcc/doc/invoke.texi  | 14 +-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index a6b0e28f238..0dbaacb57ed 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1047,7 +1047,7 @@ Enable shadow stack built-in functions from Control-flow 
Enforcement
 Technology (CET).
 
 mcet-switch
-Target Undocumented Var(flag_cet_switch) Init(0)
+Target Var(flag_cet_switch) Init(0)
 Turn on CET instrumentation for switch statements that use a jump table and
 an indirect jump.
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index d8095e3128f..1f38e91b50b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1425,7 +1425,8 @@ See RS/6000 and PowerPC Options.
 -msse4a  -m3dnow  -m3dnowa  -mpopcnt  -mabm  -mbmi  -mtbm  -mfma4  -mxop @gol
 -madx  -mlzcnt  -mbmi2  -mfxsr  -mxsave  -mxsaveopt  -mrtm  -mhle  -mlwp @gol
 -mmwaitx  -mclzero  -mpku  -mthreads  -mgfni  -mvaes  -mwaitpkg @gol
--mshstk -mmanual-endbr -mforce-indirect-call  -mavx512vbmi2 -mavx512bf16 
-menqcmd @gol
+-mshstk -mmanual-endbr -mcet-switch -mforce-indirect-call @gol
+-mavx512vbmi2 -mavx512bf16 -menqcmd @gol
 -mvpclmulqdq  -mavx512bitalg  -mmovdiri  -mmovdir64b  -mavx512vpopcntdq @gol
 -mavx5124fmaps  -mavx512vnni  -mavx5124vnniw  -mprfchw  -mrdpid @gol
 -mrdseed  -msgx -mavx512vp2intersect -mserialize -mtsxldtrk@gol
@@ -32719,6 +32720,17 @@ function attribute. This is useful when used with the 
option
 @option{-fcf-protection=branch} to control ENDBR insertion at the
 function entry.
 
+@item -mcet-switch
+@opindex mcet-switch
+By default, CET instrumentation is turned off on switch statements that
+use a jump table and indirect branch track is disabled.  Since jump
+tables are stored in read-only memory, this does not result in a direct
+loss of hardening.  But if the jump table index is attacker-controlled,
+the indirect jump may not be constrained by CET.  This option turns on
+CET instrumentation to enable indirect branch track for switch statements
+with jump tables which leads to the jump targets reachable via any indirect
+jumps.
+
 @item -mcall-ms2sysv-xlogues
 @opindex mcall-ms2sysv-xlogues
 @opindex mno-call-ms2sysv-xlogues
-- 
2.36.1



Re: [2/2] PR96463 -- changes to type checking vec_perm_expr in middle end

2022-05-23 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 9 May 2022 at 21:21, Prathamesh Kulkarni
 wrote:
>
> On Mon, 9 May 2022 at 19:22, Richard Sandiford
>  wrote:
> >
> > Prathamesh Kulkarni  writes:
> > > On Tue, 3 May 2022 at 18:25, Richard Sandiford
> > >  wrote:
> > >>
> > >> Prathamesh Kulkarni  writes:
> > >> > On Tue, 4 Jan 2022 at 19:12, Richard Sandiford
> > >> >  wrote:
> > >> >>
> > >> >> Richard Biener  writes:
> > >> >> > On Tue, 4 Jan 2022, Richard Sandiford wrote:
> > >> >> >
> > >> >> >> Richard Biener  writes:
> > >> >> >> > On Fri, 17 Dec 2021, Richard Sandiford wrote:
> > >> >> >> >
> > >> >> >> >> Prathamesh Kulkarni  writes:
> > >> >> >> >> > Hi,
> > >> >> >> >> > The attached patch rearranges order of type-check for 
> > >> >> >> >> > vec_perm_expr
> > >> >> >> >> > and relaxes type checking for
> > >> >> >> >> > lhs = vec_perm_expr
> > >> >> >> >> >
> > >> >> >> >> > when:
> > >> >> >> >> > rhs1 == rhs2,
> > >> >> >> >> > lhs is variable length vector,
> > >> >> >> >> > rhs1 is fixed length vector,
> > >> >> >> >> > TREE_TYPE (lhs) == TREE_TYPE (rhs1)
> > >> >> >> >> >
> > >> >> >> >> > I am not sure tho if this check is correct ? My intent was to 
> > >> >> >> >> > capture
> > >> >> >> >> > case when vec_perm_expr is used to "extend" fixed length 
> > >> >> >> >> > vector to
> > >> >> >> >> > it's VLA equivalent.
> > >> >> >> >>
> > >> >> >> >> VLAness isn't really the issue.  We want the same thing to work 
> > >> >> >> >> for
> > >> >> >> >> -msve-vector-bits=256, -msve-vector-bits=512, etc., even though 
> > >> >> >> >> the
> > >> >> >> >> vectors are fixed-length in that case.
> > >> >> >> >>
> > >> >> >> >> The principle is that for:
> > >> >> >> >>
> > >> >> >> >>   A = VEC_PERM_EXPR ;
> > >> >> >> >>
> > >> >> >> >> the requirements are:
> > >> >> >> >>
> > >> >> >> >> - A, B, C and D must be vectors
> > >> >> >> >> - A, B and C must have the same element type
> > >> >> >> >> - D must have an integer element type
> > >> >> >> >> - A and D must have the same number of elements (NA)
> > >> >> >> >> - B and C must have the same number of elements (NB)
> > >> >> >> >>
> > >> >> >> >> The semantics are that we create a joined vector BC (all 
> > >> >> >> >> elements of B
> > >> >> >> >> followed by all elements of C) and that:
> > >> >> >> >>
> > >> >> >> >>   A[i] = BC[D[i] % (NB+NB)]
> > >> >> >> >>
> > >> >> >> >> for 0 ≤ i < NA.
> > >> >> >> >>
> > >> >> >> >> This operation makes sense even if NA != NB.
> > >> >> >> >
> > >> >> >> > But note that we don't currently expect NA != NB and the optab 
> > >> >> >> > just
> > >> >> >> > has a single mode.
> > >> >> >>
> > >> >> >> True, but we only need this for constant permutes.  They are 
> > >> >> >> already
> > >> >> >> special in that they allow the index elements to be wider than the 
> > >> >> >> data
> > >> >> >> elements.
> > >> >> >
> > >> >> > OK, then we should reflect this in the stmt verification and only 
> > >> >> > relax
> > >> >> > the constant permute vector case and also amend the
> > >> >> > TARGET_VECTORIZE_VEC_PERM_CONST accordingly.
> > >> >>
> > >> >> Sounds good.
> > >> >>
> > >> >> > For non-constant permutes the docs say the mode of vec_perm is
> > >> >> > the common mode of operands 1 and 2 whilst the mode of operand 0
> > >> >> > is unspecified - even unconstrained by the docs.  I'm not sure
> > >> >> > if vec_perm expansion is expected to eventually FAIL.  Updating the
> > >> >> > docs of vec_perm would be appreciated as well.
> > >> >>
> > >> >> Yeah, I guess de facto operand 0 has to be the same mode as operands
> > >> >> 1 and 2.  Maybe that was just an oversight, or maybe it seemed obvious
> > >> >> or self-explanatory at the time. :-)
> > >> >>
> > >> >> > As said I prefer to not mangle the existing stmt checking too much
> > >> >> > at this stage so minimal adjustment is prefered there.
> > >> >>
> > >> >> The PR is only an enhancement request rather than a bug, so I think 
> > >> >> the
> > >> >> patch would need to wait for GCC 13 whatever happens.
> > >> > Hi,
> > >> > In attached patch, the type checking is relaxed only if mask is 
> > >> > constant.
> > >> > Does this look OK ?
> > >> >
> > >> > Thanks,
> > >> > Prathamesh
> > >> >>
> > >> >> Thanks,
> > >> >> Richard
> > >> >
> > >> > diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
> > >> > index e321d929fd0..02b88f67855 100644
> > >> > --- a/gcc/tree-cfg.cc
> > >> > +++ b/gcc/tree-cfg.cc
> > >> > @@ -4307,6 +4307,24 @@ verify_gimple_assign_ternary (gassign *stmt)
> > >> >break;
> > >> >
> > >> >  case VEC_PERM_EXPR:
> > >> > +  /* If permute is constant, then we allow for lhs and rhs
> > >> > +  to have different vector types, provided:
> > >> > +  (1) lhs, rhs1, rhs2, and rhs3 have same element type.
> > >>
> > >> This isn't a requirement for rhs3.
> > >>
> > >> > +  (2) rhs3 vector has integer element type.
> > >> > +  (3) len(lhs) == len(rhs3) && len(rhs1) == len(rhs2).  */
> > >> > +
> > >> > +  if (TREE_CONSTANT (rhs3)
> > >> > +   && VECTO

Re: [PATCH] tree-optimization/105629 - spaceship recognition regression

2022-05-23 Thread Jakub Jelinek via Gcc-patches
On Mon, May 23, 2022 at 12:53:37PM +0200, Richard Biener wrote:
> With the extra GENERIC folding we now do to
> (unsigned int) __v._M_value & 1 != (unsigned int) __v._M_value
> we end up with a sign-extending conversion to unsigned int
> rather than the sign-conversion to unsigned char we expect.
> Relaxing that fixes the regression.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
> 
> Thanks,
> Richard.
> 
> 2022-05-23  Richard Biener  
> 
>   PR tree-optimization/105629
>   * tree-ssa-phiopt.cc (spaceship_replacement): Allow
>   a sign-extending conversion.
> ---
>  gcc/tree-ssa-phiopt.cc | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
> index 8c9c46d41f1..e61d9736937 100644
> --- a/gcc/tree-ssa-phiopt.cc
> +++ b/gcc/tree-ssa-phiopt.cc
> @@ -2217,7 +2217,7 @@ spaceship_replacement (basic_block cond_bb, basic_block 
> middle_bb,
>  
>if (!TYPE_UNSIGNED (ty2) || !INTEGRAL_TYPE_P (ty2))
>   return false;
> -  if (TYPE_PRECISION (ty1) != TYPE_PRECISION (ty2))
> +  if (TYPE_PRECISION (ty1) > TYPE_PRECISION (ty2))
>   return false;
>if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_use_lhs))
>   return false;

LGTM, thanks.

Jakub



[PATCH v3 4/5] xtensa: Add setmemsi insn pattern

2022-05-23 Thread Takayuki 'January June' Suwa via Gcc-patches

This patch introduces setmemsi insn pattern of two kinds, unrolled loop and
small loop, for fixed small length and constant initialization value.

gcc/ChangeLog:

* gcc/config/xtensa/xtensa-protos.h
(xtensa_expand_block_set_unrolled_loop,
xtensa_expand_block_set_small_loop): New prototypes.
* gcc/config/xtensa/xtensa.cc (xtensa_sizeof_MOVI,
xtensa_expand_block_set_unrolled_loop,
xtensa_expand_block_set_small_loop): New functions.
* gcc/config/xtensa/xtensa.md (setmemsi): New expansion pattern.
* gcc/config/xtensa/xtensa.opt (mlongcalls): Add target mask.
---
 gcc/config/xtensa/xtensa-protos.h |   2 +
 gcc/config/xtensa/xtensa.cc   | 211 ++
 gcc/config/xtensa/xtensa.md   |  16 +++
 gcc/config/xtensa/xtensa.opt  |   2 +-
 4 files changed, 230 insertions(+), 1 deletion(-)

diff --git a/gcc/config/xtensa/xtensa-protos.h 
b/gcc/config/xtensa/xtensa-protos.h

index 4bc42da2320..30e4b54394a 100644
--- a/gcc/config/xtensa/xtensa-protos.h
+++ b/gcc/config/xtensa/xtensa-protos.h
@@ -41,6 +41,8 @@ extern void xtensa_expand_conditional_branch (rtx *, 
machine_mode);

 extern int xtensa_expand_conditional_move (rtx *, int);
 extern int xtensa_expand_scc (rtx *, machine_mode);
 extern int xtensa_expand_block_move (rtx *);
+extern int xtensa_expand_block_set_unrolled_loop (rtx *);
+extern int xtensa_expand_block_set_small_loop (rtx *);
 extern void xtensa_split_operand_pair (rtx *, machine_mode);
 extern int xtensa_emit_move_sequence (rtx *, machine_mode);
 extern rtx xtensa_copy_incoming_a7 (rtx);
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index d2aabf38339..c7b54babc37 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -1373,6 +1373,217 @@ xtensa_expand_block_move (rtx *operands)
 }


+/* Try to expand a block set operation to a sequence of RTL move
+   instructions.  If not optimizing, or if the block size is not a
+   constant, or if the block is too large, or if the value to
+   initialize the block with is not a constant, the expansion
+   fails and GCC falls back to calling memset().
+
+   operands[0] is the destination
+   operands[1] is the length
+   operands[2] is the initialization value
+   operands[3] is the alignment */
+
+static int
+xtensa_sizeof_MOVI (HOST_WIDE_INT imm)
+{
+  return (TARGET_DENSITY && IN_RANGE (imm, -32, 95)) ? 2 : 3;
+}
+
+int
+xtensa_expand_block_set_unrolled_loop (rtx *operands)
+{
+  rtx dst_mem = operands[0];
+  HOST_WIDE_INT bytes, value, align;
+  int expand_len, funccall_len;
+  rtx x, reg;
+  int offset;
+
+  if (!CONST_INT_P (operands[1]) || !CONST_INT_P (operands[2]))
+return 0;
+
+  bytes = INTVAL (operands[1]);
+  if (bytes <= 0)
+return 0;
+  value = (int8_t)INTVAL (operands[2]);
+  align = INTVAL (operands[3]);
+  if (align > MOVE_MAX)
+align = MOVE_MAX;
+
+  /* Insn expansion: holding the init value.
+ Either MOV(.N) or L32R w/litpool.  */
+  if (align == 1)
+expand_len = xtensa_sizeof_MOVI (value);
+  else if (value == 0 || value == -1)
+expand_len = TARGET_DENSITY ? 2 : 3;
+  else
+expand_len = 3 + 4;
+  /* Insn expansion: a series of aligned memory stores.
+ Consist of S8I, S16I or S32I(.N).  */
+  expand_len += (bytes / align) * (TARGET_DENSITY
+  && align == 4 ? 2 : 3);
+  /* Insn expansion: the remainder, sub-aligned memory stores.
+ A combination of S8I and S16I as needed.  */
+  expand_len += ((bytes % align + 1) / 2) * 3;
+
+  /* Function call: preparing two arguments.  */
+  funccall_len = xtensa_sizeof_MOVI (value);
+  funccall_len += xtensa_sizeof_MOVI (bytes);
+  /* Function call: calling memset().  */
+  funccall_len += TARGET_LONGCALLS ? (3 + 4 + 3) : 3;
+
+  /* Apply expansion bonus (2x) if optimizing for speed.  */
+  if (optimize > 1 && !optimize_size)
+funccall_len *= 2;
+
+  /* Decide whether to expand or not, based on the sum of the length
+ of instructions.  */
+  if (expand_len > funccall_len)
+return 0;
+
+  x = XEXP (dst_mem, 0);
+  if (!REG_P (x))
+dst_mem = replace_equiv_address (dst_mem, force_reg (Pmode, x));
+  switch (align)
+{
+case 1:
+  break;
+case 2:
+  value = (int16_t)((uint8_t)value * 0x0101U);
+  break;
+case 4:
+  value = (int32_t)((uint8_t)value * 0x01010101U);
+  break;
+default:
+  gcc_unreachable ();
+}
+  reg = force_reg (SImode, GEN_INT (value));
+
+  offset = 0;
+  do
+{
+  int unit_size = MIN (bytes, align);
+  machine_mode unit_mode = (unit_size >= 4 ? SImode :
+  (unit_size >= 2 ? HImode :
+QImode));
+  unit_size = GET_MODE_SIZE (unit_mode);
+
+  emit_move_insn (adjust_address (dst_mem, unit_mode, offset),
+ unit_mode == SImode ? reg
+ : convert_to_mode (unit_mode, reg, true));
+
+  offset += unit

Back porting to GCC11/GCC12: Re: [patch][gcc13][i386][pr101891]Adjust -fzero-call-used-regs to always use XOR

2022-05-23 Thread Qing Zhao via Gcc-patches
Hi,

I have added the patch to GCC11 and GCC12 in my local area and bootstrapped and 
regress tested on both x86 and aarch64, no any issues.

Can I committed them to both GCC11 and GCC12 branches?

Thanks.




> On May 10, 2022, at 8:38 AM, Qing Zhao via Gcc-patches 
>  wrote:
>
>
>
>> On May 10, 2022, at 1:12 AM, Richard Biener  wrote:
>>
>> On Mon, 9 May 2022, Uros Bizjak wrote:
>>
>>> On Mon, May 9, 2022 at 5:44 PM Qing Zhao  wrote:

 Another question:

 I think that this patch might need to be back ported to Gcc12 and GCC11.

 What's your opinion on this?
>>>
>>> It is not a regression, so following general rules, the patch should
>>> not be backported. OTOH, the patch creates functionally equivalent
>>> code, better in some security aspects. The functionality is also
>>> hidden behind some non-default flag, so I think if release managers
>>> (CC'd) are OK with the backport, I'd give it a technical approval.
>>>
 If so, when can I backport it?
>>>
>>> Let's keep it in the mainline for a week or two, before backporting it
>>> to non-EoL branches.
>>
>> OK from my POV after a week or two on trunk.
>
> Sure, I will do the back porting after two weeks.
>
> thanks.
>
> Qing
>>
>> Richard.
>>
>>> Uros.
>>>

 thanks.

 Qing

> On May 7, 2022, at 4:06 AM, Uros Bizjak  wrote:
>
> On Fri, May 6, 2022 at 6:42 PM Qing Zhao  wrote:
>>
>>
>>
>>> On May 6, 2022, at 10:58 AM, Uros Bizjak  wrote:
>>>
>>> On Fri, May 6, 2022 at 4:29 PM Qing Zhao  wrote:

 Hi,

 As Kees requested in this PR: 
 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101891

 =

 Currently -fzero-call-used-regs will use a pattern of:

 XOR regA,regA
 MOV regA,regB
 MOV regA,regC
 ...
 RET

 However, this introduces both a register ordering dependency (e.g. the 
 CPU cannot clear regB without clearing regA first), and while greatly 
 reduces available ROP gadgets, it does technically leave a set of 
 "MOV" ROP gadgets at the end of functions (e.g. "MOV regA,regC; RET").

 Please switch to always using XOR:

 XOR regA,regA
 XOR regB,regB
 XOR regC,regC
 ...
 RET

 ===

 This patch switch all MOV to XOR on i386.

 Bootstrapped and regresstion tested on x86_64-linux-gnu.

 Okay for gcc13?

 Thanks.

 Qing

 ==
>>>
 gcc/ChangeLog:

 * config/i386/i386.cc (zero_all_mm_registers): Use SET to zero instead
 of MOV for zeroing scratch registers.
 (ix86_zero_call_used_regs): Likewise.

 gcc/testsuite/ChangeLog:

 * gcc.target/i386/zero-scratch-regs-1.c: Add -fno-stack-protector
 -fno-PIC.
 * gcc.target/i386/zero-scratch-regs-10.c: Adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-13.c: Add -msse.
 * gcc.target/i386/zero-scratch-regs-14.c: Adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-15.c: Add -fno-stack-protector
 -fno-PIC.
 * gcc.target/i386/zero-scratch-regs-16.c: Likewise.
 * gcc.target/i386/zero-scratch-regs-17.c: Likewise.
 * gcc.target/i386/zero-scratch-regs-18.c: Add -fno-stack-protector
 -fno-PIC, adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-19.c: Add -fno-stack-protector
 -fno-PIC.
 * gcc.target/i386/zero-scratch-regs-2.c: Adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-20.c: Add -msse.
 * gcc.target/i386/zero-scratch-regs-21.c: Add -fno-stack-protector
 -fno-PIC, Adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-22.c: Adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-23.c: Likewise.
 * gcc.target/i386/zero-scratch-regs-26.c: Likewise.
 * gcc.target/i386/zero-scratch-regs-27.c: Likewise.
 * gcc.target/i386/zero-scratch-regs-28.c: Likewise.
 * gcc.target/i386/zero-scratch-regs-3.c: Add -fno-stack-protector.
 * gcc.target/i386/zero-scratch-regs-31.c: Adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-4.c: Add -fno-stack-protector
 -fno-PIC.
 * gcc.target/i386/zero-scratch-regs-5.c: Adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-6.c: Add -fno-stack-protector.
 * gcc.target/i386/zero-scratch-regs-7.c: Likewise.
 * gcc.target/i386/zero-scratch-regs-8.c: Adjust mov to xor.
 * gcc.target/i386/zero-scratch-regs-9.c: Add -fno-stack-protector.
>>>
>>> Please use something like the attached (functionally equivalent) patch
>>> for the last hunk of your patch.
>>
>> Sure, I will update the code.
>>>
>>> Als

RE: [PATCH/RFC] PR tree-optimization/96912: Recognize VEC_COND_EXPR in match.pd

2022-05-23 Thread Roger Sayle


Hi Richard,

Currently for pr96912, we end up with:

W foo (W x, W y, V m)
{
  W t;
  vector(16)  _1;
  vector(16) signed char _2;
  W _7;
  vector(2) long long int _9;
  vector(2) long long int _10;

   [local count: 1073741824]:
  _1 = m_3(D) < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  _2 = VIEW_CONVERT_EXPR(_1);
  t_4 = VIEW_CONVERT_EXPR(_2);
  _9 = x_5(D) ^ y_6(D);
  _10 = t_4 & _9;
  _7 = x_5(D) ^ _10;
  return _7;
}

The mask mode is V16QI and the data mode is V2DI (please forgive my RTL 
terminology).
The assignment t_4 view-converts the mask into the "data type" for the bitwise
operations.  The use x86's pblendvb, the "vcond_expr" operation needs to be 
mask's
mode (V16QI) rather than the data's mode (V2DI).  Hence the unsigned_type_for 
of the truth_type_for [if they have different NUNITS].  Obviously, converting 
the
mask to V2DI and performing a V2DI blend, won't produce the same result.

The most useful clause of vector_mask_p is actually the VECTOR_BOOLEAN_ TYPE_P
test that catches all "mask types", such as those that result from vector 
comparisons.
Note that vector_mask_p is tested against the operand of a view_convert expr.
The remaining cases handle truth_*_expr like operations on those comparisons.
One follow-up patch is to additionally allow VIEW_CONVERT_EXPR if both source
and destination are of  VECTOR_TYPE_P and known_eq TYPE_VECTOR_SUBUNITS.
Likewise, a C cst_vector_mask_p could check each element rather than the catch
all "integer_zerop || integer_all_onesp".

I agree with gimple-isel replacing VEC_COND_EXPR when it's supported in 
hardware,
just like .FMA is inserted to replace the universal MULT_EXPR/PLUS_EXPR tree 
codes,
the question is whether vec_cond_expr (and vec_duplicate) can always be expanded
moderately efficiently by the middle-end.  For one thing, we have vector cost 
models
that should be able to  distinguish between efficient and inefficient 
implementations.
And a vec_duplicate expander for SPARC should be able to generate an optimal 
sequence of instructions, even if there's isn't native hardware (instruction) 
support.
For example, scalar multiplication by 0x0101010101010101 may be a good way to 
vec_duplicate QI mode to V8QI mode (via DI mode), at least with -Os.  As you've
mentioned, the VEC_PERM infrastructure should be useful here.

[p.s. it's unfortunate that some of my patches appear controversial.  By 
randomly
selecting Bugzilla (regression) PRs that have been open for a long time, I seem 
to be
selecting/enriching for bugs for which there's no simple solution, and that 
maintainers
have already thought about for a long time but without coming up a satisfactory 
solution].

Thanks again for your assistance/direction.
Cheers,
Roger
--

> -Original Message-
> From: Richard Biener 
> Sent: 23 May 2022 14:36
> To: Roger Sayle 
> Cc: GCC Patches 
> Subject: Re: [PATCH/RFC] PR tree-optimization/96912: Recognize
> VEC_COND_EXPR in match.pd
> 
> On Mon, May 23, 2022 at 3:06 PM Roger Sayle
>  wrote:
> >
> >
> > Hi Richard,
> > I was wondering what you think of the following patch as a solution to
> > PR tree-optimization/96912, i.e. the ability to recognize pblendvb
> > from regular code rather than as a target specific builtin?
> >
> > The obvious point of contention is that the current middle-end
> > philosophy around vector expressions is that the middle-end should
> > continually check for backend support, whereas I prefer the "old
> > school" view that trees are an abstraction and that RTL expansion is
> > the point where these abstract operations get converted/lowered into
> instructions supported by the target.
> 
> That's still true for trees aka GENERIC but GIMPLE is more like RTL here.
> Note before vector lowering GIMPLE is like GENERIC but after the "RTL
> expansion" of vectors is considered done.  I _think_ this was done to get the
> opportunity to optimize the lowered code (and maybe cheat by doing that
> lowering sub-optimally) given that user-written "generic"
> vector code tends to run into limitations.
> 
> > [The exceptions being built-in functions, IFN_* etc.] Should tree.texi
> > document which tree codes can't be used without checking the backend.
> 
> I suppose yes, but it's not really "which tree codes" but which types.  Even 
> for
> PLUS_EXPR you have to check for target support when vector types are
> involved.
> 
> Note when being too lazy before vector lowering you could end up transforming
> previously supported IL into unsupported and thus triggering vector lowering 
> to
> perform elementwise operations, severly slowing down code which is why you
> might find checks like if (target supports new code || target didn't support 
> old
> code)
> 
> > Bootstrapped and regression tested, but this obviously depends upon
> > RTL expansion being able to perform the inverse operation/lowering if
> required.
> 
> So the case in question "historically" was a task for RTL combine.  If we now
> bring that to GIMPLE we should i

Re: [PATCH/RFC] PR tree-optimization/96912: Recognize VEC_COND_EXPR in match.pd

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, May 23, 2022 at 3:06 PM Roger Sayle  wrote:
>
>
> Hi Richard,
> I was wondering what you think of the following patch as a solution to
> PR tree-optimization/96912, i.e. the ability to recognize pblendvb from
> regular code rather than as a target specific builtin?
>
> The obvious point of contention is that the current middle-end philosophy
> around vector expressions is that the middle-end should continually check
> for backend support, whereas I prefer the "old school" view that trees
> are an abstraction and that RTL expansion is the point where these abstract
> operations get converted/lowered into instructions supported by the target.

That's still true for trees aka GENERIC but GIMPLE is more like RTL here.
Note before vector lowering GIMPLE is like GENERIC but after the
"RTL expansion" of vectors is considered done.  I _think_ this was done
to get the opportunity to optimize the lowered code (and maybe cheat
by doing that lowering sub-optimally) given that user-written "generic"
vector code tends to run into limitations.

> [The exceptions being built-in functions, IFN_* etc.] Should tree.texi
> document
> which tree codes can't be used without checking the backend.

I suppose yes, but it's not really "which tree codes" but which
types.  Even for PLUS_EXPR you have to check for target support
when vector types are involved.

Note when being too lazy before vector lowering you could end up
transforming previously supported IL into unsupported and thus
triggering vector lowering to perform elementwise operations, severly
slowing down code which is why you might find checks like
if (target supports new code || target didn't support old code)

> Bootstrapped and regression tested, but this obviously depends upon RTL
> expansion being able to perform the inverse operation/lowering if required.

So the case in question "historically" was a task for RTL combine.  If
we now bring that to GIMPLE we should indeed verify if the target
can efficiently(!) do the operation we like to use.  In this particular
case it would be vec_cond_mask support for the created VEC_COND_EXPR.
We also have to avoid doing this after ISEL.

Note all original types are data types while you need a mask type for
the selector which in turn means you will almost never match
unless you hit the

+(match vector_mask_p
+ VECTOR_CST@0
+ (if (integer_zerop (@0) || integer_all_onesp (@0

case?

+(simplify
+ (bit_xor:c (bit_and:c (bit_xor:c @0 @1) (view_convert vector_mask_p@2)) @0)
+ (if (VECTOR_TYPE_P (type)
+  && VECTOR_TYPE_P (TREE_TYPE (@2)))
+  (with { tree masktype = truth_type_for (TREE_TYPE (@2));

I think you want to check the truth_type_for (type) instead, check that
you can V_C_E @2 to it by checking it's a vector mode and the same
as the truth_type mode.

+  tree vecttype = maybe_ne (TYPE_VECTOR_SUBPARTS (masktype),
+   TYPE_VECTOR_SUBPARTS (type))
+ ? unsigned_type_for (masktype)
+ : type; }
+   (view_convert (vec_cond:vecttype (view_convert:masktype @2)
+   (view_convert:vecttype @1)
+   (view_convert:vecttype @0))

and then have

(vec_cond (view_convert:masktype @2) @1 @0)

Richard.

>
> 2022-05-23  Roger Sayle  
>
> gcc/ChangeLog
> PR tree-optimization/96912
> * match.pd (vector_mask_p): New predicate to identify vectors
> where every element must be zero or all ones.
> (bit_xor (bit_and (bit_xor ...) ...) ...): Recognize a VEC_COND_EXPR
> expressed as logical vector operations.
>
> gcc/testsuite/ChangeLog
> PR tree-optimization/96912
> * gcc.target/i386/pr96912.c: New test case.
>
>
> Thoughts?  How would you solve this PR?  Are there convenience predicates
> for testing whether a target supports vec_cond_expr, vec_duplicate, etc?
>
> Cheers,
> Roger
> --
>


demangler: C++ modules support

2022-05-23 Thread Nathan Sidwell

This adds demangling support for C++ modules.  A new 'W' component
along with augmented behaviour of 'S' components.

I guess I should go actually fix the ABI document itself now ...

include/
* demangle.h (enum demangle_component_type): Add module components.
libiberty/
* cp-demangle.c (d_make_comp): Adjust.
(d_name, d_prefix): Adjust subst handling. Add module handling.
(d_maybe_module_name): New.
	(d_unqualified_name): Add incoming module parm. Handle it.  Adjust all 
callers.

(d_special_name): Add 'GI' support.
(d_count_template_scopes): Adjust.
(d_print_comp_inner): Print module.
* testsuite/demangle-expected: New test cases

--
Nathan SidwellFrom b7feb71d45e4cd894d7706c21a21a3871070d098 Mon Sep 17 00:00:00 2001
From: Nathan Sidwell 
Date: Tue, 8 Mar 2022 12:54:03 -0800
Subject: [PATCH] demangler: C++ modules support

This adds demangling support for C++ modules.  A new 'W' component
along with augmented behaviour of 'S' components.

	include/
	* demangle.h (enum demangle_component_type): Add module components.
	libiberty/
	* cp-demangle.c (d_make_comp): Adjust.
	(d_name, d_prefix): Adjust subst handling. Add module handling.
	(d_maybe_module_name): New.
	(d_unqualified_name): Add incoming module parm. Handle it.  Adjust all callers.
	(d_special_name): Add 'GI' support.
	(d_count_template_scopes): Adjust.
	(d_print_comp_inner): Print module.
	* testsuite/demangle-expected: New test cases
---
 include/demangle.h|   7 +-
 libiberty/cp-demangle.c   | 142 +-
 libiberty/testsuite/demangle-expected |  67 
 3 files changed, 188 insertions(+), 28 deletions(-)

diff --git a/include/demangle.h b/include/demangle.h
index 44a27374d4f..e2aa4a971ff 100644
--- a/include/demangle.h
+++ b/include/demangle.h
@@ -451,7 +451,12 @@ enum demangle_component_type
   DEMANGLE_COMPONENT_NOEXCEPT,
   DEMANGLE_COMPONENT_THROW_SPEC,
 
-  DEMANGLE_COMPONENT_STRUCTURED_BINDING
+  DEMANGLE_COMPONENT_STRUCTURED_BINDING,
+
+  DEMANGLE_COMPONENT_MODULE_NAME,
+  DEMANGLE_COMPONENT_MODULE_PARTITION,
+  DEMANGLE_COMPONENT_MODULE_ENTITY,
+  DEMANGLE_COMPONENT_MODULE_INIT,
 };
 
 /* Types which are only used internally.  */
diff --git a/libiberty/cp-demangle.c b/libiberty/cp-demangle.c
index cf451c5aff2..d06d80d1fee 100644
--- a/libiberty/cp-demangle.c
+++ b/libiberty/cp-demangle.c
@@ -429,10 +429,12 @@ static struct demangle_component *d_name (struct d_info *, int substable);
 
 static struct demangle_component *d_nested_name (struct d_info *);
 
+static int d_maybe_module_name (struct d_info *, struct demangle_component **);
+
 static struct demangle_component *d_prefix (struct d_info *, int);
 
 static struct demangle_component *d_unqualified_name (struct d_info *,
-		  struct demangle_component *scope);
+	struct demangle_component *scope, struct demangle_component *module);
 
 static struct demangle_component *d_source_name (struct d_info *);
 
@@ -984,6 +986,7 @@ d_make_comp (struct d_info *di, enum demangle_component_type type,
 case DEMANGLE_COMPONENT_COMPOUND_NAME:
 case DEMANGLE_COMPONENT_VECTOR_TYPE:
 case DEMANGLE_COMPONENT_CLONE:
+case DEMANGLE_COMPONENT_MODULE_ENTITY:
   if (left == NULL || right == NULL)
 	return NULL;
   break;
@@ -1022,6 +1025,7 @@ d_make_comp (struct d_info *di, enum demangle_component_type type,
 case DEMANGLE_COMPONENT_TRINARY_ARG2:
 case DEMANGLE_COMPONENT_TPARM_OBJ:
 case DEMANGLE_COMPONENT_STRUCTURED_BINDING:
+case DEMANGLE_COMPONENT_MODULE_INIT:
   if (left == NULL)
 	return NULL;
   break;
@@ -1030,6 +1034,8 @@ d_make_comp (struct d_info *di, enum demangle_component_type type,
 	 empty.  */
 case DEMANGLE_COMPONENT_ARRAY_TYPE:
 case DEMANGLE_COMPONENT_INITIALIZER_LIST:
+case DEMANGLE_COMPONENT_MODULE_NAME:
+case DEMANGLE_COMPONENT_MODULE_PARTITION:
   if (right == NULL)
 	return NULL;
   break;
@@ -1422,6 +1428,7 @@ d_name (struct d_info *di, int substable)
 {
   char peek = d_peek_char (di);
   struct demangle_component *dc = NULL;
+  struct demangle_component *module = NULL;
   int subst = 0;
 
   switch (peek)
@@ -1435,7 +1442,7 @@ d_name (struct d_info *di, int substable)
   break;
 
 case 'U':
-  dc = d_unqualified_name (di, NULL);
+  dc = d_unqualified_name (di, NULL, NULL);
   break;
 
 case 'S':
@@ -1446,12 +1453,21 @@ d_name (struct d_info *di, int substable)
 	dc = d_make_name (di, "std", 3);
 	di->expansion += 3;
 	  }
-	else
+
+	if (d_peek_char (di) == 'S')
 	  {
-	dc = d_substitution (di, 0);
-	if (!dc)
+	module = d_substitution (di, 0);
+	if (!module)
 	  return NULL;
-	subst = 1;
+	if (!(module->type == DEMANGLE_COMPONENT_MODULE_NAME
+		  || module->type == DEMANGLE_COMPONENT_MODULE_PARTITION))
+	  {
+		if (dc)
+		  return NULL;
+		subst = 1;
+		dc = module;
+		module = NULL;
+	  }
 	  }
 

[PATCH/RFC] PR tree-optimization/96912: Recognize VEC_COND_EXPR in match.pd

2022-05-23 Thread Roger Sayle

Hi Richard,
I was wondering what you think of the following patch as a solution to
PR tree-optimization/96912, i.e. the ability to recognize pblendvb from
regular code rather than as a target specific builtin?

The obvious point of contention is that the current middle-end philosophy
around vector expressions is that the middle-end should continually check
for backend support, whereas I prefer the "old school" view that trees
are an abstraction and that RTL expansion is the point where these abstract
operations get converted/lowered into instructions supported by the target.
[The exceptions being built-in functions, IFN_* etc.]  Should tree.texi
document which tree codes can't be used without checking the backend?


Bootstrapped and regression tested, but this obviously depends upon RTL
expansion being able to perform the inverse operation/lowering if required.


2022-05-23  Roger Sayle  

gcc/ChangeLog
PR tree-optimization/96912
* match.pd (vector_mask_p): New predicate to identify vectors
where every element must be zero or all ones.
(bit_xor (bit_and (bit_xor ...) ...) ...): Recognize a VEC_COND_EXPR
expressed as logical vector operations.

gcc/testsuite/ChangeLog
PR tree-optimization/96912
* gcc.target/i386/pr96912.c: New test case.


Thoughts?  How would you solve this PR?  Are there convenience predicates
for testing whether a target supports vec_cond_expr, vec_duplicate, etc?

Cheers,
Roger
--

diff --git a/gcc/match.pd b/gcc/match.pd
index c2fed9b..e365f28 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4221,6 +4221,35 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (integer_all_onesp (@1) && integer_zerop (@2))
 @0
 
+/* An integer vector where every element must be 0 or -1.  */
+(match vector_mask_p
+ @0
+ (if (VECTOR_BOOLEAN_TYPE_P (type
+(match vector_mask_p
+ VECTOR_CST@0
+ (if (integer_zerop (@0) || integer_all_onesp (@0
+(match vector_mask_p
+ (vec_cond @0 vector_mask_p@1 vector_mask_p@2))
+(match vector_mask_p
+ (bit_not vector_mask_p@0))
+(for op (bit_and bit_ior bit_xor)
+ (match vector_mask_p
+  (op vector_mask_p@0 vector_mask_p@1)))
+
+/* Recognize VEC_COND_EXPR.  */
+(simplify
+ (bit_xor:c (bit_and:c (bit_xor:c @0 @1) (view_convert vector_mask_p@2)) @0)
+ (if (VECTOR_TYPE_P (type)
+  && VECTOR_TYPE_P (TREE_TYPE (@2)))
+  (with { tree masktype = truth_type_for (TREE_TYPE (@2));
+  tree vecttype = maybe_ne (TYPE_VECTOR_SUBPARTS (masktype),
+   TYPE_VECTOR_SUBPARTS (type))
+ ? unsigned_type_for (masktype)
+ : type; }
+   (view_convert (vec_cond:vecttype (view_convert:masktype @2)
+   (view_convert:vecttype @1)
+   (view_convert:vecttype @0))
+
 /* A few simplifications of "a ? CST1 : CST2". */
 /* NOTE: Only do this on gimple as the if-chain-to-switch
optimization depends on the gimple to have if statements in it. */
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-options "-O2 -msse4" } */

typedef char V __attribute__((vector_size(16)));
typedef long long W __attribute__((vector_size(16)));

W
/* Bitwise select on 128-bit vectors: t is a mask derived from comparing
   each element of m with zero (-1 where negative, 0 otherwise), and the
   andn/and/or sequence picks y where the mask is set and x elsewhere.
   This exact idiom is what the pblend scan-assembler check below expects
   to be recognized as a single blend instruction.  */
foo (W x, W y, V m)
{
  W t = (m < 0);
  return (~t & x) | (t & y);
}

V
/* Same bitwise-select idiom as foo above, but operating entirely on the
   char-element vector type V: t is -1 per element where m is negative,
   0 otherwise, so the result takes y's bytes under the mask and x's
   bytes elsewhere.  Expected to match the pblend scan below.  */
bar (V x, V y, V m)
{
  V t = (m < 0);
  return (~t & x) | (t & y);
}

/* { dg-final { scan-assembler-times "pblend" 2 } } */


Re: Adjust affected targets for vec_perm_const hook

2022-05-23 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> Hi Richard,
> The attached patch addresses formatting nits for affected targets.
> Tested with make all-gcc stage1 (except for gcn).

OK, thanks.

> Sorry if this sounds like a naive question, but what target triplet
> should I use to build gcn port ?

I think it's amdgcn-amdhsa.

Richard

> Thanks,
> Prathamesh
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f4d2a800f39..e6a24a0f9e1 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24145,9 +24145,13 @@ aarch64_expand_vec_perm_const_1 (struct 
> expand_vec_perm_d *d)
>  /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
>  
>  static bool
> -aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
> -   rtx op1, const vec_perm_indices &sel)
> +aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> +   rtx target, rtx op0, rtx op1,
> +   const vec_perm_indices &sel)
>  {
> +  if (vmode != op_mode)
> +return false;
> +
>struct expand_vec_perm_d d;
>  
>/* Check whether the mask can be applied to a single vector.  */
> diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
> index 2afe0445ed5..70c2d50f0cc 100644
> --- a/gcc/config/arm/arm.cc
> +++ b/gcc/config/arm/arm.cc
> @@ -31813,9 +31813,13 @@ arm_expand_vec_perm_const_1 (struct 
> expand_vec_perm_d *d)
>  /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
>  
>  static bool
> -arm_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx 
> op1,
> +arm_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> +   rtx target, rtx op0, rtx op1,
> const vec_perm_indices &sel)
>  {
> +  if (vmode != op_mode)
> +return false;
> +
>struct expand_vec_perm_d d;
>int i, nelt, which;
>  
> diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
> index e2e9335ad75..4642d5d55bf 100644
> --- a/gcc/config/gcn/gcn.cc
> +++ b/gcc/config/gcn/gcn.cc
> @@ -4131,10 +4131,13 @@ gcn_make_vec_perm_address (unsigned int *perm)
> permutations.  */
>  
>  static bool
> -gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
> -   rtx src0, rtx src1,
> +gcn_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> +   rtx dst, rtx src0, rtx src1,
> const vec_perm_indices & sel)
>  {
> +  if (vmode != op_mode)
> +return false;
> +
>unsigned int nelt = GET_MODE_NUNITS (vmode);
>  
>gcc_assert (VECTOR_MODE_P (vmode));
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 806e1f5aaa3..adf68547119 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -22060,9 +22060,13 @@ canonicalize_perm (struct expand_vec_perm_d *d)
>  /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
>  
>  bool
> -ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
> -rtx op1, const vec_perm_indices &sel)
> +ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> +rtx target, rtx op0, rtx op1,
> +const vec_perm_indices &sel)
>  {
> +  if (vmode != op_mode)
> +return false;
> +
>struct expand_vec_perm_d d;
>unsigned char perm[MAX_VECT_LEN];
>unsigned int i, nelt, which;
> diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h
> index 9d320c29552..6c650196c9c 100644
> --- a/gcc/config/i386/i386-expand.h
> +++ b/gcc/config/i386/i386-expand.h
> @@ -48,8 +48,9 @@ rtx gen_push (rtx arg);
>  rtx gen_pop (rtx arg);
>  rtx ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
>machine_mode mode, int ignore);
> -bool ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
> - rtx op1, const vec_perm_indices &sel);
> +bool ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> + rtx target, rtx op0, rtx op1,
> + const vec_perm_indices &sel);
>  bool ix86_notrack_prefixed_insn_p (rtx_insn *);
>  machine_mode ix86_split_reduction (machine_mode mode);
>  void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0,
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 175ce013e5d..50112a8efee 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -15836,7 +15836,7 @@
> sel[7] = 15;
>   }
> vec_perm_indices indices (sel, 2, 8);
> -   bool ok = targetm.vectorize.vec_perm_const (V8SImode, target,
> +   bool ok = targetm.vectorize.vec_perm_const (V8SImode, V8SImode, 
> target,
> arg0, arg1, indices);
> gcc_assert (ok);

Re: [0/9] [middle-end] Add param to vec_perm_const hook to specify mode of input operand

2022-05-23 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Wed, 18 May 2022 at 17:27, Richard Sandiford
>  wrote:
>>
>> Prathamesh Kulkarni  writes:
>> > Hi,
>> > The attached patch adds another parameter machine_mode op_mode to 
>> > vec_perm_const
>> > hook to specify mode of input operands. The motivation for doing this
>> > is PR96463,
>> > where we create vec_perm_expr of the form:
>> > lhs = vec_perm_expr
>> > where lhs and rhs have different vector types but same element type
>> > (lhs is SVE and rhs is corresponding advsimd vector).
>> >
>> > It seems the following targets were affected: aarch64, i386, arm, ia64,
>> > mips, rs6000, s390, sparc, gcn.
>> >
>> > Bootstrapped+tested on x86_64-linux-gnu, aarch64-linux-gnu.
>> > For other targets, I did make all-gcc stage1, which seems to build OK.
>> >
>> > Thanks,
>> > Prathamesh
>> >
>> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
>> > index c5006afc00d..31ff6ef3f92 100644
>> > --- a/gcc/doc/tm.texi
>> > +++ b/gcc/doc/tm.texi
>> > @@ -6088,7 +6088,7 @@ for the given scalar type @var{type}.  
>> > @var{is_packed} is false if the scalar
>> >  access using @var{type} is known to be naturally aligned.
>> >  @end deftypefn
>> >
>> > -@deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST 
>> > (machine_mode @var{mode}, rtx @var{output}, rtx @var{in0}, rtx @var{in1}, 
>> > const vec_perm_indices @var{&sel})
>> > +@deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST 
>> > (machine_mode @var{mode}, machine_mode @var{op_mode}, rtx @var{output}, 
>> > rtx @var{in0}, rtx @var{in1}, const vec_perm_indices @var{&sel})
>> >  This hook is used to test whether the target can permute up to two
>> >  vectors of mode @var{mode} using the permutation vector @code{sel}, and
>> >  also to emit such a permutation.  In the former case @var{in0}, @var{in1}
>>
>> Like Andre says, the documentation should describe op_mode (and mode).
>>
>> > diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
>> > index 68dc679cc6a..aef9d4c5d28 100644
>> > --- a/gcc/optabs-query.cc
>> > +++ b/gcc/optabs-query.cc
>> > @@ -417,8 +417,8 @@ can_vec_perm_var_p (machine_mode mode)
>> > with here.  */
>> >
>> >  bool
>> > -can_vec_perm_const_p (machine_mode mode, const vec_perm_indices &sel,
>> > -   bool allow_variable_p)
>> > +can_vec_perm_const_p (machine_mode mode, machine_mode op_mode,
>> > +   const vec_perm_indices &sel, bool allow_variable_p)
>> >  {
>>
>> The function comment should describe the new parameter.
>>
>> >/* If the target doesn't implement a vector mode for the vector type,
>> >   then no operations are supported.  */
>> > @@ -448,7 +448,7 @@ can_vec_perm_const_p (machine_mode mode, const 
>> > vec_perm_indices &sel,
>> >
>> >if (targetm.vectorize.vec_perm_const != NULL)
>> >  {
>> > -  if (targetm.vectorize.vec_perm_const (mode, NULL_RTX, NULL_RTX,
>> > +  if (targetm.vectorize.vec_perm_const (mode, op_mode, NULL_RTX, 
>> > NULL_RTX,
>> >   NULL_RTX, sel))
>> >   return true;
>> >
>> > @@ -462,6 +462,13 @@ can_vec_perm_const_p (machine_mode mode, const 
>> > vec_perm_indices &sel,
>> >return false;
>> >  }
>> >
>> > +bool
>> > +can_vec_perm_const_p (machine_mode mode, const vec_perm_indices &sel,
>> > +   bool allow_variable_p)
>> > +{
>> > +  return can_vec_perm_const_p (mode, mode, sel, allow_variable_p);
>> > +}
>> > +
>>
>> I can understand why you went for this, but now that we've opened
>> the door to mismatched modes, I think it would be better if all callers
>> specified the input mode explicitly.
>>
>> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc
>> > index 3d8fa3abdfe..55f10c41789 100644
>> > --- a/gcc/optabs.cc
>> > +++ b/gcc/optabs.cc
>> > @@ -6250,7 +6250,9 @@ expand_vec_perm_const (machine_mode mode, rtx v0, 
>> > rtx v1,
>> >if (single_arg_p)
>> >   v1 = v0;
>> >
>> > -  if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, 
>> > indices))
>> > +  gcc_checking_assert (GET_MODE (v0) == GET_MODE (v1));
>> > +  machine_mode op_mode = GET_MODE (v0);
>> > +  if (targetm.vectorize.vec_perm_const (mode, op_mode, target, v0, 
>> > v1, indices))
>> >   return target;
>> >  }
>> >
>>
>> (FWIW, I agree the assert is worth having.)
> Hi,
> I updated the patch with doc and adjusted callers to explicitly pass op_mode.
> Bootstrapped + tested on x86_64-linux-gnu and aarch64-linux-gnu.
> Does it look OK to commit ?
>
> […]
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index c5006afc00d..f53068c5c53 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6088,13 +6088,13 @@ for the given scalar type @var{type}.  
> @var{is_packed} is false if the scalar
>  access using @var{type} is known to be naturally aligned.
>  @end deftypefn
>  
> -@deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST (machine_mode 
> @var{mode}, rtx @var{output}, rtx @var{in0}, rtx @var{in1}, const 
> vec_perm_indices @var{&sel})

Re: [PATCH] PR tree-optimization/105668: Provide RTL expansion for VEC_COND_EXPR.

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, May 23, 2022 at 8:44 AM Roger Sayle  wrote:
>
>
> This resolves PR tree-optimization/105668, a P1 ice-on-valid regression
> triggered by my recent patch to add a vec_cmpeqv1tiv1ti define_expand
> to the i386 backend.  The existence of this optab currently leads GCC
> to incorrectly assume the existence of a corresponding vcond_mask for
> V1TImode.
>
> I believe the best solution (of the three possible fixes) is to allow
> gimple_expand_vec_cond_expr to fail (return NULL) when a suitable optab
> to generate a IFN_VCOND_MASK isn't available, but instead allow RTL
> expansion to provide a default implementation using vector mode logic
> operations.  On x86_64, the equivalent of a pblend can be generated in
> three instructions using pand, pandn and pxor.  In fact, this fallback
> implementation is already used in ix86_expand_sse_movcc when the -march
> doesn't provide a suitable instruction.  This patch provides that
> functionality to all targets in the middle-end, allowing the vectorizer(s)
> to safely assume support for VEC_COND_EXPR (when the target has suitable
> vector logic instructions).
>
> I should point out (for the record) that the new expand_vec_cond_expr
> function in expr.cc is very different from the function of the same name
> removed by Martin Liska in June 2020.
> https://gcc.gnu.org/pipermail/gcc-patches/2020-June/547097.html
> https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff;h=502d63b6d6141597bb18fd23c8
> 7736a1b384cf8f
> That function simply expanded the vcond_mask optab and failed if it
> wasn't available, which is currently the task of the gimple-isel pass.
> The implementation here is a traditional RTL expander, synthesizing the
> desired vector conditional move using bit-wise XOR and AND instructions
> of the mask vector.
>
> At some point in the future, gimple-isel could be enhanced to consider
> alternative vector modes, as a V1TI blend/vec_cond_expr may be implemented
> using V2DI, V4SI, V8HI or V16QI.  Alas, I couldn't figure out how to
> conveniently iterate over the desired modes, so this enhancement is left
> for a follow-up patch.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32} with
> no new failures.  Ok for mainline?

No, first of all the purpose of ISEL is to get rid of _all_ VEC_COND_EXPRs.
So iff then this fallback would have to reside in the ISEL pass, replacing the
GIMPLE with target supported GIMPLE.

But then it is the task of tree-vect-generic.cc to turn not target supported
GIMPLE into target supported GIMPLE - and the issue in the PR in question
is that at its point we basically have _1 < _2 ? _3 : _4 which _is_ supported
by the target but passes inbetween vector lowering and ISEL hide
_1 < _2 via a PHI node and so the GIMPLE is no longer target supported.

That would be the thing to fix - I'll note we put us into the corner of
needing to keep the SSA def of the VEC_COND_EXPR condition
"next" (as in SSA def) to the VEC_COND_EXPR, something that's
difficult to maintain, especially when so many passes run in between.

So the solution might be to somehow move the two closer together,
maybe as much as merging the VEC_COND_EXPR part into
vector lowering itself (with the disadvantage of more difficult
to deal with IL).  Or alternatively have vectors lowered earlier
for those produced by user code and have "final" lowering done
as part of RTL expansion (so in ISEL then).

Richard.

>
> 2022-05-23  Roger Sayle  
>
> gcc/ChangeLog
> PR tree-optimization/105668
> * expr.cc (expand_vec_cond_expr): New function to expand
> VEC_COND_EXPR using vector mode logical instructions.
> (expand_expr_real_2) : Call the above.
> * gimple-isel.cc (gimple_expand_vec_cond_expr): Instead of
> asserting, return NULL when the target's get_vcond_mask_icode
> returns CODE_FOR_nothing.
>
> gcc/testsuite/ChangeLog
> PR tree-optimization/105668
> * gcc.target/i386/pr105668.c: New test case.
>
> Roger
> --
>


Re: [x86 PING] Peephole pand;pxor into pandn

2022-05-23 Thread Uros Bizjak via Gcc-patches
On Mon, May 23, 2022 at 12:49 PM Roger Sayle  wrote:
>
>
> Hi Uros,
> Hopefully, if I explain even more of the context, you'll better understand why
> this harmless (and at worse seemingly redundant) peephole2 is actually 
> critical
> for addressing significant regressions in the compiler without introducing new
> testsuite failures.  I wouldn't ask (again), if I didn't feel it's important.
>
> Basically, I'm trying to unblock Hongtao's patch (for PR target/104610)
> which in your own review, explained is better handled by/during STV:
> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/594070.html
>
> Unfortunately, that patch of mine to STV (that I want to ping next) that 
> solves
> the P2 code quality regression PR target/70321, is itself blocked by another
> review of yours:
> https://gcc.gnu.org/pipermail/gcc-patches/2022-April/593200.html
> where this fix (alone) leads to a regression of the test case pr65105-5.c.
>
> This pending regression has nothing to do with TARGET_BMI's andn, but
> the idiom "if ((x & y) != y)" on ia32, where x and y are DImode, and 
> stv/reload
> has decided to place these values in SSE registers.
>
> After combine we have an *anddi3_doubleword and *cmpdi3_doubleword:
> (insn 22 21 23 4 (parallel [
> (set (reg:DI 97)
> (and:DI (reg/v:DI 92 [ p2 ])
> (reg:DI 88 [ _25 ])))
> (clobber (reg:CC 17 flags))
> ]) "pr65105-5.c":20:18 530 {*anddi3_doubleword}
>  (expr_list:REG_UNUSED (reg:CC 17 flags)
> (nil)))
> (insn 23 22 24 4 (set (reg:CCZ 17 flags)
> (compare:CCZ (reg/v:DI 92 [ p2 ])
> (reg:DI 97))) "pr65105-5.c":20:8 29 {*cmpdi_doubleword}
>  (expr_list:REG_DEAD (reg:DI 97)
> (nil)))

But originally, during combine we have (pr65105-5.c):

Trying 22 -> 23:
   22: {r97:DI=r92:DI&r88:DI;clobber flags:CC;}
  REG_UNUSED flags:CC
   23: {r98:DI=r92:DI^r97:DI;clobber flags:CC;}
  REG_DEAD r97:DI
  REG_UNUSED flags:CC
Successfully matched this instruction:
(parallel [
(set (reg:DI 98)
(and:DI (not:DI (reg:DI 88 [ _25 ]))
(reg/v:DI 92 [ p2 ])))
(clobber (reg:CC 17 flags))
])
allowing combination of insns 22 and 23
original costs 8 + 8 = 16
replacement cost 16
deferring deletion of insn with uid = 22.
modifying insn i323: {r98:DI=~r88:DI&r92:DI;clobber flags:CC;}
  REG_UNUSED flags:CC
deferring rescan insn with uid = 23.

so combine is creating:

(insn 23 22 24 4 (parallel [
(set (reg:DI 98)
(and:DI (not:DI (reg:DI 88 [ _25 ]))
(reg/v:DI 92 [ p2 ])))
(clobber (reg:CC 17 flags))
]) "pr65105-5.c":20:8 552 {*andndi3_doubleword}
 (expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))

why is this not the case anymore with your patch?

Uros.


Re: [PATCH] Use more ARRAY_SIZE.

2022-05-23 Thread Martin Liška
On 5/23/22 09:56, Iain Buclaw wrote:
> Excerpts from Martin Liška's message of Mai 11, 2022 10:17 am:
>> On 5/9/22 14:03, Richard Biener wrote:
>>> On Thu, May 5, 2022 at 4:30 PM Martin Liška  wrote:

 On 5/5/22 14:58, Iain Buclaw wrote:
> This D front-end change doesn't look right to me, besides the slight

 Hello.

 Sorry, I've re-read the patch and fixed some places where the macro usage
 was wrong.

 Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>>>
>>> The middle-end parts are OK.  I'd say in files where ARRAY_SIZE is already
>>> used it's OK to introduce more uses.  Otherwise I defer to the more specific
>>> maintainers if they like this or not.
>>
>> All right, CCing the following maintainers for other parts:
>>
>> - David for JIT and Analyzer
>> - Tobias for Fortran part
>> - Jason for C-family part
>>
> 
> Hi Martin,

Hello.

> 
> When running through contrib/config-list.mk, I noticed that this also
> broke the build for the following obsolete targets:

My periodic testers confirm that and I'm going to install the following patch.

Cheers,
Martin

> 
> tilegx-linux-gnu
> tilegxbe-linux-gnu
> tilepro-linux-gnu
> 
> ---
> gcc/config/tilepro/gen-mul-tables.cc: In function ‘void 
> find_sequences(ExpressionTree&, ExpressionTreeMap&)’:
> gcc/config/tilepro/gen-mul-tables.cc:465:26: error: ‘ARRAY_SIZE’ was not 
> declared in this scope
>   465 |   for (size_t f = 0; f < ARRAY_SIZE (ops); f++)
>   |  ^~
> gcc/config/tilepro/gen-mul-tables.cc: In function ‘void 
> create_insn_code_compression_table()’:
> gcc/config/tilepro/gen-mul-tables.cc:567:26: error: ‘ARRAY_SIZE’ was not 
> declared in this scope
>   567 |   for (size_t i = 0; i < ARRAY_SIZE (ops); i++)
>   |  ^~
> ---
From 63798f67dcc848dcd110ce222b97304565c9ea29 Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Mon, 23 May 2022 13:54:53 +0200
Subject: [PATCH] tilepro: fix missing ARRAY_SIZE macro

gcc/ChangeLog:

	* config/tilepro/gen-mul-tables.cc (ARRAY_SIZE): Add new macro.
---
 gcc/config/tilepro/gen-mul-tables.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/config/tilepro/gen-mul-tables.cc b/gcc/config/tilepro/gen-mul-tables.cc
index 798766a723b..52183982f65 100644
--- a/gcc/config/tilepro/gen-mul-tables.cc
+++ b/gcc/config/tilepro/gen-mul-tables.cc
@@ -90,6 +90,8 @@ typedef long long MUL_TYPE;
 #define MIN(x, y)  ((x) <= (y) ? (x) : (y))
 #define MAX(x, y)  ((x) >= (y) ? (x) : (y))
 
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+
 /* For this program a unary op is one which has only one nonconstant
operand.  So shift left by 5 is considered unary.  */
 typedef MUL_TYPE (*unary_op_func) (MUL_TYPE);
-- 
2.36.1



Re: [x86 PING] Peephole pand;pxor into pandn

2022-05-23 Thread Uros Bizjak via Gcc-patches
On Mon, May 23, 2022 at 12:49 PM Roger Sayle  wrote:
>
>
> Hi Uros,
> Hopefully, if I explain even more of the context, you'll better understand why
> this harmless (and at worse seemingly redundant) peephole2 is actually 
> critical
> for addressing significant regressions in the compiler without introducing new
> testsuite failures.  I wouldn't ask (again), if I didn't feel it's important.
>
> Basically, I'm trying to unblock Hongtao's patch (for PR target/104610)
> which in your own review, explained is better handled by/during STV:
> https://gcc.gnu.org/pipermail/gcc-patches/2022-May/594070.html
>
> Unfortunately, that patch of mine to STV (that I want to ping next) that 
> solves
> the P2 code quality regression PR target/70321, is itself blocked by another
> review of yours:
> https://gcc.gnu.org/pipermail/gcc-patches/2022-April/593200.html
> where this fix (alone) leads to a regression of the test case pr65105-5.c.

Is it possible to start with a STV patch? If there are only a few
introduced regressions, we can afford them in this stage of
development, and fix regressions later with follow-up patches. This
way, it is much easier for me to see the effect of the patch, and its
benefit can be weighted appropriately. I was indeed under the
impression that we try to peephole a combination that appears once in
a blue moon, but if the situation appears regularly, this is a
completely different matter.

> This pending regression has nothing to do with TARGET_BMI's andn, but
> the idiom "if ((x & y) != y)" on ia32, where x and y are DImode, and 
> stv/reload
> has decided to place these values in SSE registers.
>
> After combine we have an *anddi3_doubleword and *cmpdi3_doubleword:
> (insn 22 21 23 4 (parallel [
> (set (reg:DI 97)
> (and:DI (reg/v:DI 92 [ p2 ])
> (reg:DI 88 [ _25 ])))
> (clobber (reg:CC 17 flags))
> ]) "pr65105-5.c":20:18 530 {*anddi3_doubleword}
>  (expr_list:REG_UNUSED (reg:CC 17 flags)
> (nil)))
> (insn 23 22 24 4 (set (reg:CCZ 17 flags)
> (compare:CCZ (reg/v:DI 92 [ p2 ])
> (reg:DI 97))) "pr65105-5.c":20:8 29 {*cmpdi_doubleword}
>  (expr_list:REG_DEAD (reg:DI 97)
> (nil)))

One possible approach is to introduce intermediate compound (but
non-existent) instruction that is created by combine pass, and is
later split to real instructions. But a real testcase is needed, so
the correct strategy is used.

> After STV we have:
> (insn 22 21 45 4 (set (subreg:V2DI (reg:DI 97) 0)
> (and:V2DI (subreg:V2DI (reg/v:DI 92 [ p2 ]) 0)
> (subreg:V2DI (reg:DI 88 [ _25 ]) 0))) "pr65105-5.c":20:18 6640 
> {*andv2di3}
>  (expr_list:REG_UNUSED (reg:CC 17 flags)
> (nil)))
> (insn 45 22 46 4 (set (reg:V2DI 103)
> (xor:V2DI (subreg:V2DI (reg/v:DI 92 [ p2 ]) 0)
> (subreg:V2DI (reg:DI 97) 0))) "pr65105-5.c":20:8 -1
>  (nil))
> (insn 46 45 23 4 (set (reg:V2DI 103)
> (vec_select:V2DI (vec_concat:V4DI (reg:V2DI 103)
> (reg:V2DI 103))
> (parallel [
> (const_int 0 [0])
> (const_int 2 [0x2])
> ]))) "pr65105-5.c":20:8 -1
>  (nil))
> (insn 23 46 24 4 (set (reg:CC 17 flags)
> (unspec:CC [
> (reg:V2DI 103) repeated x2
> ] UNSPEC_PTEST)) "pr65105-5.c":20:8 7425 {sse4_1_ptestv2di}
>  (expr_list:REG_DEAD (reg:DI 97)
> (nil)))
>
> where the XOR has been introduced to implement the equality,
> as P == Q is effectively implemented as (P ^ Q) == 0.  At this point,
> the only remaining pass that can optimize the pand followed by
> the pxor is peephole2.
>
> The requirement to optimize this is from gcc.target/i386/pr65105-5.c
> where the desired implementation is explicitly looking for pandn+ptest:
>
> /* { dg-do compile { target ia32 } } */
> /* { dg-options "-O2 -march=core-avx2 -mno-stackrealign" } */
> /* { dg-final { scan-assembler "pandn" } } */
> /* { dg-final { scan-assembler "pxor" } } */
> /* { dg-final { scan-assembler "ptest" } } */
>
>
> Confusingly, I've even more patches in the queue/backlog for this part
> of the compiler (it's an air traffic control problem, fallout from stage 4).
>
> And of course, very many thanks for the various andn related patches
> that have already been approved/committed to the backend, to avoid
> potential regressions related to code size (-Os and -Oz).  It's a long road
> with many steps.
>
> Might you reconsider?  Pretty  please?

No problem for me, but the testcase would really help.

Uros.


Re: [PATCH] ipa-visibility: Optimize TLS access [PR99619]

2022-05-23 Thread Alexander Monakov via Gcc-patches
On Mon, 16 May 2022, Alexander Monakov wrote:

> On Mon, 9 May 2022, Jan Hubicka wrote:
> 
> > > On second thought, it might be better to keep the assert, and place the 
> > > loop
> > > under 'if (optimize)'?
> > 
> > The problem is that at IPA level it does not make sense to check
> > optimize flag as it is function specific.  (shlib is OK to check it
> > anywhere since it is global.)
> > 
> > So I think we really want to run the code only at the WPA time
> > (symtab_state>=IPA_SSA) and we want to see what is optimization flag of
> > those function referring the variable since that is what decided codegen
> > we will produce.
> 
> Perhaps I misunderstood the issue. Are you saying that there might be no -O
> option on lto1 command line, because lto1 is supposed to take optimization
> level from function summaries, but during pass_ipa_whole_program_visibility
> there's no "current function" so 'optimize' is at its default value (zero)?
> 
> And the solution is to iterate over referring functions to see if at least
> one of them satisfies 'opt_for_fn (decl, optimize) > 0'?

Do you want to see a patch implementing the above solution?

Alexander


[PATCH] Remove forward_propagate_into_cond

2022-05-23 Thread Richard Biener via Gcc-patches
This is a first cleanup opportunity from the COND_EXPR gimplification
which allows us to remove now redundant forward_propagate_into_cond.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2022-05-23  Richard Biener  

* tree-ssa-forwprop.cc (forward_propagate_into_cond): Remove.
(pass_forwprop::execute): Do not propagate into COND_EXPR conditions.
---
 gcc/tree-ssa-forwprop.cc | 79 +---
 1 file changed, 2 insertions(+), 77 deletions(-)

diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index b582529c404..d698a483ff1 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -511,9 +511,7 @@ forward_propagate_into_comparison (gimple_stmt_iterator 
*gsi)
 /* Propagate from the ssa name definition statements of COND_EXPR
in GIMPLE_COND statement STMT into the conditional if that simplifies it.
Returns zero if no statement was changed, one if there were
-   changes and two if cfg_cleanup needs to run.
-
-   This must be kept in sync with forward_propagate_into_cond.  */
+   changes and two if cfg_cleanup needs to run.  */
 
 static int
 forward_propagate_into_gimple_cond (gcond *stmt)
@@ -573,70 +571,6 @@ forward_propagate_into_gimple_cond (gcond *stmt)
   return 0;
 }
 
-
-/* Propagate from the ssa name definition statements of COND_EXPR
-   in the rhs of statement STMT into the conditional if that simplifies it.
-   Returns true zero if the stmt was changed.  */
-
-static bool
-forward_propagate_into_cond (gimple_stmt_iterator *gsi_p)
-{
-  gimple *stmt = gsi_stmt (*gsi_p);
-  tree tmp = NULL_TREE;
-  tree cond = gimple_assign_rhs1 (stmt);
-  enum tree_code code = gimple_assign_rhs_code (stmt);
-
-  /* We can do tree combining on SSA_NAME and comparison expressions.  */
-  if (COMPARISON_CLASS_P (cond))
-tmp = forward_propagate_into_comparison_1 (stmt, TREE_CODE (cond),
-  TREE_TYPE (cond),
-  TREE_OPERAND (cond, 0),
-  TREE_OPERAND (cond, 1));
-  else if (TREE_CODE (cond) == SSA_NAME)
-{
-  enum tree_code def_code;
-  tree name = cond;
-  gimple *def_stmt = get_prop_source_stmt (name, true, NULL);
-  if (!def_stmt || !can_propagate_from (def_stmt))
-   return 0;
-
-  def_code = gimple_assign_rhs_code (def_stmt);
-  if (TREE_CODE_CLASS (def_code) == tcc_comparison)
-   tmp = fold_build2_loc (gimple_location (def_stmt),
-  def_code,
-  TREE_TYPE (cond),
-  gimple_assign_rhs1 (def_stmt),
-  gimple_assign_rhs2 (def_stmt));
-}
-
-  if (tmp
-  && is_gimple_val (tmp))
-{
-  if (dump_file)
-   {
- fprintf (dump_file, "  Replaced '");
- print_generic_expr (dump_file, cond);
- fprintf (dump_file, "' with '");
- print_generic_expr (dump_file, tmp);
- fprintf (dump_file, "'\n");
-   }
-
-  if ((code == VEC_COND_EXPR) ? integer_all_onesp (tmp)
- : integer_onep (tmp))
-   gimple_assign_set_rhs_from_tree (gsi_p, gimple_assign_rhs2 (stmt));
-  else if (integer_zerop (tmp))
-   gimple_assign_set_rhs_from_tree (gsi_p, gimple_assign_rhs3 (stmt));
-  else
-   gimple_assign_set_rhs1 (stmt, unshare_expr (tmp));
-  stmt = gsi_stmt (*gsi_p);
-  update_stmt (stmt);
-
-  return true;
-}
-
-  return 0;
-}
-
 /* We've just substituted an ADDR_EXPR into stmt.  Update all the
relevant data structures to match.  */
 
@@ -3720,16 +3654,7 @@ pass_forwprop::execute (function *fun)
tree rhs1 = gimple_assign_rhs1 (stmt);
enum tree_code code = gimple_assign_rhs_code (stmt);
 
-   if (code == COND_EXPR)
- {
-   /* In this case the entire COND_EXPR is in rhs1. */
-   if (forward_propagate_into_cond (&gsi))
- {
-   changed = true;
-   stmt = gsi_stmt (gsi);
- }
- }
-   else if (TREE_CODE_CLASS (code) == tcc_comparison)
+   if (TREE_CODE_CLASS (code) == tcc_comparison)
  {
int did_something;
did_something = forward_propagate_into_comparison 
(&gsi);
-- 
2.35.3


[PATCH] tree-optimization/105629 - spaceship recognition regression

2022-05-23 Thread Richard Biener via Gcc-patches
With the extra GENERIC folding we now do to
(unsigned int) __v._M_value & 1 != (unsigned int) __v._M_value
we end up with a sign-extending conversion to unsigned int
rather than the sign-conversion to unsigned char we expect.
Relaxing that fixes the regression.

Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

Thanks,
Richard.

2022-05-23  Richard Biener  

PR tree-optimization/105629
* tree-ssa-phiopt.cc (spaceship_replacement): Allow
a sign-extending conversion.
---
 gcc/tree-ssa-phiopt.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
index 8c9c46d41f1..e61d9736937 100644
--- a/gcc/tree-ssa-phiopt.cc
+++ b/gcc/tree-ssa-phiopt.cc
@@ -2217,7 +2217,7 @@ spaceship_replacement (basic_block cond_bb, basic_block 
middle_bb,
 
   if (!TYPE_UNSIGNED (ty2) || !INTEGRAL_TYPE_P (ty2))
return false;
-  if (TYPE_PRECISION (ty1) != TYPE_PRECISION (ty2))
+  if (TYPE_PRECISION (ty1) > TYPE_PRECISION (ty2))
return false;
   if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_use_lhs))
return false;
-- 
2.35.3


Re: [PATCH] Simplify vec_unpack of uniform_vector_p constructors in match.pd.

2022-05-23 Thread Richard Biener via Gcc-patches
On Sat, May 21, 2022 at 5:31 PM Roger Sayle  wrote:
>
>
> This patch simplifies vec_unpack_hi_expr/vec_unpack_lo_expr of a uniform
> constructor or vec_duplicate operand.  The motivation is from PR 105621
> where after optimization, we're left with:
>
>   vect_cst__21 = {c_8(D), c_8(D), c_8(D), c_8(D)};
>   vect_iftmp.7_4 = [vec_unpack_hi_expr] vect_cst__21;
>
> It turns out that there are no constant folding/simplification patterns
> in match.pd, but the above can be simplified further to the equivalent:
>
>   _20 = (long int) c_8(D);
>   vect_iftmp.7_4 = [vec_duplicate_expr] _20;
>
> which on x86-64 results in one less instruction, replacing pshufd $0
> then punpackhq, with punpcklqdq.  This transformation is also useful
> for helping CSE to spot that unpack_hi and unpack_lo are equivalent.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures.  Ok for mainline?

I think we need a way to query whether the target can do a VEC_DUPLICATE_EXPR.
Currently we only ever have them for VL vectors and expand via
expand_vector_broadcast which eventually simply gives up when there's no
vec_duplicate or vec_init optabs suitable.

IIRC with the VEC_PERM extension we should be able to handle
VEC_DUPLICATE via VEC_PERM?  (but we don't yet accept a scalar
input, just V1?)

I see most targets except sparc have picked up vec_duplicate, but still
we'd need to check the specific mode.  I think we can disregard
vec_init checking and only apply the transforms when vec_duplicate
is available.

Richard.

>
> 2022-05-21  Roger Sayle  
>
> gcc/ChangeLog
> * match.pd (simplify vec_unpack_hi): Simplify VEC_UNPACK_*_EXPR
> of uniform vector constructors and vec_duplicate.
>
> gcc/testsuite/ChangeLog
> * g++.dg/vect/pr105621.cc: New test case.
>
>
> Thanks in advance,
> Roger
> --
>


RE: [x86 PING] Peephole pand;pxor into pandn

2022-05-23 Thread Roger Sayle


Hi Uros,
Hopefully, if I explain even more of the context, you'll better understand why
this harmless (and at worst seemingly redundant) peephole2 is actually critical
for addressing significant regressions in the compiler without introducing new
testsuite failures.  I wouldn't ask (again) if I didn't feel it's important.

Basically, I'm trying to unblock Hongtao's patch (for PR target/104610)
which in your own review, explained is better handled by/during STV: 
https://gcc.gnu.org/pipermail/gcc-patches/2022-May/594070.html

Unfortunately, that patch of mine to STV (that I want to ping next) that solves
the P2 code quality regression PR target/70321, is itself blocked by another
review of yours:
https://gcc.gnu.org/pipermail/gcc-patches/2022-April/593200.html
where this fix (alone) leads to a regression of the test case pr65105-5.c.

This pending regression has nothing to do with TARGET_BMI's andn, but
the idiom "if ((x & y) != y)" on ia32, where x and y are DImode, and stv/reload
has decided to place these values in SSE registers.

After combine we have an *anddi3_doubleword and *cmpdi3_doubleword:
(insn 22 21 23 4 (parallel [
(set (reg:DI 97)
(and:DI (reg/v:DI 92 [ p2 ])
(reg:DI 88 [ _25 ])))
(clobber (reg:CC 17 flags))
]) "pr65105-5.c":20:18 530 {*anddi3_doubleword}
 (expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))
(insn 23 22 24 4 (set (reg:CCZ 17 flags)
(compare:CCZ (reg/v:DI 92 [ p2 ])
(reg:DI 97))) "pr65105-5.c":20:8 29 {*cmpdi_doubleword}
 (expr_list:REG_DEAD (reg:DI 97)
(nil)))

After STV we have:
(insn 22 21 45 4 (set (subreg:V2DI (reg:DI 97) 0)
(and:V2DI (subreg:V2DI (reg/v:DI 92 [ p2 ]) 0)
(subreg:V2DI (reg:DI 88 [ _25 ]) 0))) "pr65105-5.c":20:18 6640 
{*andv2di3}
 (expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))
(insn 45 22 46 4 (set (reg:V2DI 103)
(xor:V2DI (subreg:V2DI (reg/v:DI 92 [ p2 ]) 0)
(subreg:V2DI (reg:DI 97) 0))) "pr65105-5.c":20:8 -1
 (nil))
(insn 46 45 23 4 (set (reg:V2DI 103)
(vec_select:V2DI (vec_concat:V4DI (reg:V2DI 103)
(reg:V2DI 103))
(parallel [
(const_int 0 [0])
(const_int 2 [0x2])
]))) "pr65105-5.c":20:8 -1
 (nil))
(insn 23 46 24 4 (set (reg:CC 17 flags)
(unspec:CC [
(reg:V2DI 103) repeated x2
] UNSPEC_PTEST)) "pr65105-5.c":20:8 7425 {sse4_1_ptestv2di}
 (expr_list:REG_DEAD (reg:DI 97)
(nil)))

where the XOR has been introduce to implement the equality,
as P == Q is effectively implemented as (P ^ Q) == 0.  At this point, 
the only remaining pass that can optimize the pand followed by
the pxor is peephole2.

The requirement to optimize this is from gcc.target/i386/pr65105-5.c
where the desired implementation is explicitly looking for pandn+ptest:

/* { dg-do compile { target ia32 } } */
/* { dg-options "-O2 -march=core-avx2 -mno-stackrealign" } */
/* { dg-final { scan-assembler "pandn" } } */
/* { dg-final { scan-assembler "pxor" } } */
/* { dg-final { scan-assembler "ptest" } } */


Confusingly, I've even more patches in the queue/backlog for this part
of the compiler (it's an air traffic control problem, fallout from stage 4).

And of course, very many thanks for the various andn related patches
that have already been approved/committed to the backend, to avoid
potential regressions related to code size (-Os and -Oz).  It's a long road
with many steps.

Might you reconsider?  Pretty  please?
Roger
--

> -Original Message-
> From: Uros Bizjak 
> Sent: 23 May 2022 10:11
> To: Roger Sayle 
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [x86 PING] Peephole pand;pxor into pandn
> 
> On Mon, May 23, 2022 at 10:59 AM Roger Sayle
>  wrote:
> >
> >
> > Hi Uros,
> >
> > Thanks for the speedy review.  The point of this patch is that (with
> > pending changes to STV) the pand;pxor sequence isn't created until
> > after combine, and hence doesn't/won't get caught by any of the
> > current pre-reload/combine splitters.
> 
> IMO this happens due to inconsistencies between integer and vector set, where
> integer andn is absent without BMI. However, we don't re-run the combine after
> reload, and I don't think it is worth to reimplement it via peephole2 
> patterns.
> Please note that AVX allows many more combinations that are not caught by
> your patch, and considering that combine already does the transformation, I
> don't see a compelling reason for this very specialized peephole2.
> 
> Let's keep the patch shelved until a testcase shows the benefits of the patch.
> 
> Uros.
> 
> >
> >
> > > -Original Message-
> > > From: Uros Bizjak 
> > > Sent: 23 May 2022 09:51
> > > To: Roger Sayle 
> > > Cc: gcc-patches@gcc.gnu.org
> > > Subject: Re: [x86 PING] Peephole pand;pxor into pandn
> > >
> > > On Mon, May 23, 2022 at 10:44 AM Roger Sayle
> > >  wrote:
> > > >
> > > 

Re: [PATCH] DSE: Use the constant source if possible

2022-05-23 Thread Richard Biener via Gcc-patches
On Sat, May 21, 2022 at 5:02 AM H.J. Lu via Gcc-patches
 wrote:
>
> When recording store for RTL dead store elimination, check if the source
> register is set only once to a constant.  If yes, record the constant
> as the store source.  It eliminates unrolled zero stores after memset 0
> in a loop where a vector register is used as the zero store source.
>
> gcc/
>
> PR rtl-optimization/105638
> * dse.cc (record_store): Use the constant source if the source
> register is set only once.
>
> gcc/testsuite/
>
> PR rtl-optimization/105638
> * g++.target/i386/pr105638.C: New test.
> ---
>  gcc/dse.cc   | 19 ++
>  gcc/testsuite/g++.target/i386/pr105638.C | 44 
>  2 files changed, 63 insertions(+)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr105638.C
>
> diff --git a/gcc/dse.cc b/gcc/dse.cc
> index 30c11cee034..0433dd3d846 100644
> --- a/gcc/dse.cc
> +++ b/gcc/dse.cc
> @@ -1508,6 +1508,25 @@ record_store (rtx body, bb_info_t bb_info)
>
>   if (tem && CONSTANT_P (tem))
> const_rhs = tem;
> + else
> +   {
> + /* If RHS is set only once to a constant, set CONST_RHS
> +to the constant.  */
> + df_ref def = DF_REG_DEF_CHAIN (REGNO (rhs));
> + if (def != nullptr
> + && !DF_REF_IS_ARTIFICIAL (def)
> + && !DF_REF_NEXT_REG (def))
> +   {
> + rtx_insn *def_insn = DF_REF_INSN (def);
> + rtx def_body = PATTERN (def_insn);
> + if (GET_CODE (def_body) == SET)
> +   {
> + rtx def_src = SET_SRC (def_body);
> + if (CONSTANT_P (def_src))
> +   const_rhs = def_src;

doesn't DSE have its own tracking of stored values?  Shouldn't we
improve _that_ if it is not enough?  I also wonder if you need to
verify the SET isn't partial?

Richard.

> +   }
> +   }
> +   }
> }
>  }
>
> diff --git a/gcc/testsuite/g++.target/i386/pr105638.C 
> b/gcc/testsuite/g++.target/i386/pr105638.C
> new file mode 100644
> index 000..ff40a459de1
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr105638.C
> @@ -0,0 +1,44 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-std=gnu++20 -O2 -march=skylake" } */
> +/* { dg-final { scan-assembler-not "vpxor" } } */
> +
> +#include 
> +#include 
> +#include 
> +
> +class FastBoard {
> +public:
> +typedef std::pair movescore_t;
> +typedef std::tr1::array scoredlist_t;
> +
> +protected:
> +std::vector m_critical;
> +
> +int m_boardsize;
> +};
> +
> +class FastState {
> +public:
> +FastBoard board;
> +
> +int movenum;
> +protected:
> +FastBoard::scoredlist_t scoredmoves;
> +};
> +
> +class KoState : public FastState {
> +private:
> +std::vector ko_hash_history;
> +std::vector hash_history;
> +};
> +
> +class GameState : public KoState {
> +public:
> +void foo ();
> +private:
> +std::vector game_history;
> +};
> +
> +void GameState::foo() {
> +game_history.resize(movenum);
> +}
> --
> 2.36.1
>


Re: [EXTERNAL] Re: [PATCH] Guard against applying scale with 0 denominator

2022-05-23 Thread Richard Biener via Gcc-patches
On Sat, May 21, 2022 at 12:28 AM Eugene Rozenfeld
 wrote:
>
> Thank you for the feedback Richard. I attached a patch that saves/restores 
> counts if the epilog doesn't use a scalar loop.

OK.

Thanks,
Richard.

> Eugene
>
> -Original Message-
> From: Richard Biener 
> Sent: Thursday, May 12, 2022 12:34 AM
> To: Eugene Rozenfeld 
> Cc: Jan Hubicka ; gcc-patches@gcc.gnu.org
> Subject: Re: [EXTERNAL] Re: [PATCH] Guard against applying scale with 0 
> denominator
>
> On Thu, May 12, 2022 at 3:37 AM Eugene Rozenfeld 
>  wrote:
> >
> > In my case this is not exactly what the FIXME in the comment above
> > says. The count is 0 even before the initial scaling happens. I hit this 
> > case with some changes I'm working on to enable per-instruction 
> > discriminators for AutoFDO.
> >
> > I looked into saving/restoring the old counts but it would be awkward to 
> > do. The initial scaling happens here:
> >
> > if (skip_vector)
> > {
> >   split_edge (loop_preheader_edge (loop));
> >
> >   /* Due to the order in which we peel prolog and epilog, we first
> >  propagate probability to the whole loop.  The purpose is to
> >  avoid adjusting probabilities of both prolog and vector loops
> >  separately.  Note in this case, the probability of epilog loop
> >  needs to be scaled back later.  */
> >   basic_block bb_before_loop = loop_preheader_edge (loop)->src;
> >   if (prob_vector.initialized_p ())
> > {
> >   scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
> >   scale_loop_profile (loop, prob_vector, 0);
> > }
> > }
> >
> > The scaling happens before we do the cloning for the epilog so to
> > save/restore the counts we would need to maintain a mapping between the 
> > original basic blocks and the cloned basic blocks in the epilog.
>
> There is one already - after the epilogue is copied you can use 
> get_bb_original (epilouge_bb) to get at the block it was copied from.
> It could also be that we can rely on the basic-block order to be preserved 
> between the copies (I _think_ that would work ... but then assert () for this 
> using get_bb_original ()).  That means the simplest fix would be to have an 
> auto_vec and initialize that from the BB counts in loop body order (we 
> also have exactly two BBs in all peeled loops ...
>
> But note we only scaled the scalar if-converted loop but eventually used the 
> not if-converted copy for the epilogue (if not vectorizing the epilogue), so 
> indiscriminate scaling back looks wrong in some cases (I'd have to 
> double-check the details here).
>
> > I'd like to get my simple fix in since it makes things better even if
> > it doesn't address the issue mentioned In the FIXME.
>
> But don't you need to check that
> bbs[i]->count.apply_probability (prob_vector) is not zero instead of checking 
> that bbs[i].count is not zero?
>
> Richard.
>
> > -Original Message-
> > From: Richard Biener 
> > Sent: Monday, May 09, 2022 12:42 AM
> > To: Eugene Rozenfeld ; Jan Hubicka
> > 
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: [EXTERNAL] Re: [PATCH] Guard against applying scale with 0
> > denominator
> >
> > On Fri, May 6, 2022 at 10:32 PM Eugene Rozenfeld via Gcc-patches 
> >  wrote:
> > >
> > > Calling count.apply_scale with a 0 denominator causes an assert.
> > > This change guards against that.
> > >
> > > Tested on x86_64-pc-linux-gnu.
> > >
> > > gcc/ChangeLog:
> > > * tree-loop-vect-manip.cc (vect_do_peeling): Guard against 
> > > applying scale with 0 denominator.
> > > ---
> > >  gcc/tree-vect-loop-manip.cc | 9 +
> > >  1 file changed, 5 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/gcc/tree-vect-loop-manip.cc
> > > b/gcc/tree-vect-loop-manip.cc index 1d4337eb261..db54ae69e45 100644
> > > --- a/gcc/tree-vect-loop-manip.cc
> > > +++ b/gcc/tree-vect-loop-manip.cc
> > > @@ -2989,10 +2989,11 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree 
> > > niters, tree nitersm1,
> > >  get lost if we scale down to 0.  */
> > >   basic_block *bbs = get_loop_body (epilog);
> > >   for (unsigned int i = 0; i < epilog->num_nodes; i++)
> > > -   bbs[i]->count = bbs[i]->count.apply_scale
> > > -(bbs[i]->count,
> > > - bbs[i]->count.apply_probability
> > > -   (prob_vector));
> > > +   if (bbs[i]->count.nonzero_p ())
> > > + bbs[i]->count = bbs[i]->count.apply_scale
> > > +  (bbs[i]->count,
> > > +   bbs[i]->count.apply_probability
> > > + (prob_vector));
> >
> > So exactly what the FIXME in the comment above says happens.   It
> > might be better
> > to save/restore the old counts if the intent is to get them back.  I'm not 
> > exactly sure where the other scaling happens though.
> >
> > Richard.
> >
> >
> >
> > >   

Re: [x86 PATCH] PR tree-optimization/105668: Provide vcond_mask_v1tiv1ti pattern.

2022-05-23 Thread Richard Biener via Gcc-patches
On Mon, May 23, 2022 at 10:00 AM Uros Bizjak  wrote:
>
> On Mon, May 23, 2022 at 9:16 AM Roger Sayle  
> wrote:
> >
> >
> > This patch is an alternate/supplementary fix to PR tree-optimization/105668
> > that provides a vcond_mask_v1titi optab/define_expand to the i386 backend.
> > An undocumented feature/bug of GCC's vectorization is that any target that
> > provides a vec_cmpeq has to also provide a matching
> > vcond_mask.  This backend patch preserves the status quo,
> > rather than fixes the underlying problem.
>
> IIRC, I also hit this issue a while ago. I was under the impression it was
> fixed in the meantime, but it looks like I was wrong.

It's generally prefered to have vec_cmp* and vcond_mask over
vcond when the target will end up doing the compare and select
in different instructions (IIRC the x86 ISA has no combined
compare & select instructions).

So it might be interesting to see if we can remove the vcond{,u,eq}
expanders (and fill in missing vec_cmp and vcond_mask patterns).

Richard.

> > One aspect of this clean-up is that ix86_expand_sse_movcc provides
> > fallback implementations using pand/pandn/por that effectively make
> > V2DImode and V1TImode vcond_mask available on any TARGET_SSE2, not
> > just TARGET_SSE4_2.  This allows a simplification as V2DI mode can
> > be handled by using a VI_128 mode iterator instead of a VI124_128
> > mode iterator, and instead this define_expand is effectively renamed
> > to provide a V1TImode vcond_mask expander (as V1TI isn't in VI_128).
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32} with
> > no new failures.  The new test case is identical to the middle-end patch,
> > so if both patches are approved, this'll be committed only once.
> > Ok for mainline?
>
> OK.
>
> Thanks,
> Uros.
>
> >
> >
> > 2022-05-23  Roger Sayle  
> >
> > gcc/ChangeLog
> > PR tree-optimization/105668
> > * config/i386/i386-expand.cc (ix86_expand_sse_movcc): Support
> > V1TImode, just like V2DImode.
> > * config/i386/sse.md (vcond_mask_Msseintvecmodelower>):
> > Use VI_128 mode iterator instead of VI124_128 to include V2DI.
> > (vcond_mask_v2div2di): Delete.
> > (vcond_mask_v1tiv1ti): New define_expand.
> >
> > gcc/testsuite/ChangeLog
> > PR tree-optimization/105668
> > * gcc.target/i386/pr105668.c: New test case.
> >
> >
> > Roger
> > --
> >


[PATCH v3 0/3] RISC-V: Support z[f/d]inx extension

2022-05-23 Thread jiawei
From: Jia-Wei Chen 

The Zfinx extension[1] has already finished public review. Here is the 
implementation patch set that reuses the floating point patterns and bans
the use of FPRs when zfinx is used as a target.

Current work can be found at the following links; we will keep updating zhinx
and zhinxmin soon after zfh/zfhmin are implemented in gcc.
  https://github.com/pz9115/riscv-gcc/tree/zfinx-rebase
  https://github.com/pz9115/riscv-binutils-gdb/tree/zfinx-rebase

For test you can use qemu or spike that support zfinx extension, the
qemu will go upstream soon and spike is still in review:
  https://github.com/plctlab/plct-qemu/tree/plct-zfinx-dev
  https://github.com/plctlab/plct-spike/tree/plct-upstream-zfinx

Thanks for Tariq Kurd, Kito Cheng, Jim Willson, 
Jeremy Bennett helped us a lot with this work.

[1] https://github.com/riscv/riscv-zfinx/blob/main/zfinx-1.0.0-rc.pdf

Version log:

v2: As Kito Cheng's comment, add Changelog part in patches, update imply 
info in riscv-common.c, remove useless check and update annotation in 
riscv.c.

v3: Update with new isa-spec version 20191213, make zfinx imply zicsr as
default, fix the lack of fcsr use in zfinx.

jiawei (3):
  RISC-V: Minimal support of zfinx extension.
  RISC-V: Target support for zfinx extension.
  RISC-V: Limit regs use  for zfinx extension.

 gcc/common/config/riscv/riscv-common.cc |  9 
 gcc/config/riscv/arch-canonicalize  |  3 ++
 gcc/config/riscv/constraints.md |  4 +-
 gcc/config/riscv/riscv-builtins.cc  |  4 +-
 gcc/config/riscv/riscv-c.cc |  2 +-
 gcc/config/riscv/riscv-opts.h   |  6 +++
 gcc/config/riscv/riscv.cc   | 14 -
 gcc/config/riscv/riscv.md   | 72 -
 gcc/config/riscv/riscv.opt  |  3 ++
 9 files changed, 75 insertions(+), 42 deletions(-)

-- 
2.25.1



[PATCH v3 2/3] RISC-V: Target support for z[f/d]inx extension.

2022-05-23 Thread jiawei
From: Jia-Wei Chen 

Support 'TARGET_ZFINX' with float instruction pattern and builtin function.
Reuse 'TARGET_HARD_FLOAT' and 'TARGET_DOUBLE_FLOAT' patterns.

gcc/ChangeLog:

* config/riscv/riscv-builtins.cc (AVAIL): Add TARGET_ZFINX.
(riscv_atomic_assign_expand_fenv): Ditto.
* config/riscv/riscv-c.cc (riscv_cpu_cpp_builtins): Add TARGET_ZFINX.
* config/riscv/riscv.md (TARGET_HARD_FLOAT): Add TARGET_ZFINX.
(TARGET_HARD_FLOAT || TARGET_ZFINX): Add TARGET_ZFINX.
(TARGET_DOUBLE_FLOAT || TARGET_ZDINX): Add TARGET_ZDINX.

Co-Authored-By: Sinan Lin.
---
 gcc/config/riscv/riscv-builtins.cc |  4 +-
 gcc/config/riscv/riscv-c.cc|  2 +-
 gcc/config/riscv/riscv.md  | 76 +++---
 3 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/gcc/config/riscv/riscv-builtins.cc 
b/gcc/config/riscv/riscv-builtins.cc
index 0658f8d3047..21896d747f5 100644
--- a/gcc/config/riscv/riscv-builtins.cc
+++ b/gcc/config/riscv/riscv-builtins.cc
@@ -85,7 +85,7 @@ struct riscv_builtin_description {
   unsigned int (*avail) (void);
 };
 
-AVAIL (hard_float, TARGET_HARD_FLOAT)
+AVAIL (hard_float, TARGET_HARD_FLOAT || TARGET_ZFINX)
 
 /* Construct a riscv_builtin_description from the given arguments.
 
@@ -279,7 +279,7 @@ riscv_expand_builtin (tree exp, rtx target, rtx subtarget 
ATTRIBUTE_UNUSED,
 void
 riscv_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
 {
-  if (!TARGET_HARD_FLOAT)
+  if (!(TARGET_HARD_FLOAT || TARGET_ZFINX))
 return;
 
   tree frflags = GET_BUILTIN_DECL (CODE_FOR_riscv_frflags);
diff --git a/gcc/config/riscv/riscv-c.cc b/gcc/config/riscv/riscv-c.cc
index eb7ef09297e..a9c43a64fd4 100644
--- a/gcc/config/riscv/riscv-c.cc
+++ b/gcc/config/riscv/riscv-c.cc
@@ -58,7 +58,7 @@ riscv_cpu_cpp_builtins (cpp_reader *pfile)
   if (TARGET_HARD_FLOAT)
 builtin_define_with_int_value ("__riscv_flen", UNITS_PER_FP_REG * 8);
 
-  if (TARGET_HARD_FLOAT && TARGET_FDIV)
+  if ((TARGET_HARD_FLOAT || TARGET_ZFINX) && TARGET_FDIV)
 {
   builtin_define ("__riscv_fdiv");
   builtin_define ("__riscv_fsqrt");
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index d9b451be0b4..f81e315666e 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -300,8 +300,8 @@
 (define_mode_iterator ANYI [QI HI SI (DI "TARGET_64BIT")])
 
 ;; Iterator for hardware-supported floating-point modes.
-(define_mode_iterator ANYF [(SF "TARGET_HARD_FLOAT")
-   (DF "TARGET_DOUBLE_FLOAT")])
+(define_mode_iterator ANYF [(SF "TARGET_HARD_FLOAT || TARGET_ZFINX")
+   (DF "TARGET_DOUBLE_FLOAT || TARGET_ZDINX")])
 
 ;; Iterator for floating-point modes that can be loaded into X registers.
 (define_mode_iterator SOFTF [SF (DF "TARGET_64BIT")])
@@ -448,7 +448,7 @@
   [(set (match_operand:ANYF0 "register_operand" "=f")
(plus:ANYF (match_operand:ANYF 1 "register_operand" " f")
   (match_operand:ANYF 2 "register_operand" " f")))]
-  "TARGET_HARD_FLOAT"
+  "TARGET_HARD_FLOAT || TARGET_ZFINX"
   "fadd.\t%0,%1,%2"
   [(set_attr "type" "fadd")
(set_attr "mode" "")])
@@ -579,7 +579,7 @@
   [(set (match_operand:ANYF 0 "register_operand" "=f")
(minus:ANYF (match_operand:ANYF 1 "register_operand" " f")
(match_operand:ANYF 2 "register_operand" " f")))]
-  "TARGET_HARD_FLOAT"
+  "TARGET_HARD_FLOAT || TARGET_ZFINX"
   "fsub.\t%0,%1,%2"
   [(set_attr "type" "fadd")
(set_attr "mode" "")])
@@ -749,7 +749,7 @@
   [(set (match_operand:ANYF   0 "register_operand" "=f")
(mult:ANYF (match_operand:ANYF1 "register_operand" " f")
  (match_operand:ANYF 2 "register_operand" " f")))]
-  "TARGET_HARD_FLOAT"
+  "TARGET_HARD_FLOAT || TARGET_ZFINX"
   "fmul.\t%0,%1,%2"
   [(set_attr "type" "fmul")
(set_attr "mode" "")])
@@ -1056,7 +1056,7 @@
   [(set (match_operand:ANYF   0 "register_operand" "=f")
(div:ANYF (match_operand:ANYF 1 "register_operand" " f")
  (match_operand:ANYF 2 "register_operand" " f")))]
-  "TARGET_HARD_FLOAT && TARGET_FDIV"
+  "(TARGET_HARD_FLOAT || TARGET_ZFINX) && TARGET_FDIV"
   "fdiv.\t%0,%1,%2"
   [(set_attr "type" "fdiv")
(set_attr "mode" "")])
@@ -1071,7 +1071,7 @@
 (define_insn "sqrt2"
   [(set (match_operand:ANYF0 "register_operand" "=f")
(sqrt:ANYF (match_operand:ANYF 1 "register_operand" " f")))]
-  "TARGET_HARD_FLOAT && TARGET_FDIV"
+  "(TARGET_HARD_FLOAT || TARGET_ZFINX) && TARGET_FDIV"
 {
 return "fsqrt.\t%0,%1";
 }
@@ -1086,7 +1086,7 @@
(fma:ANYF (match_operand:ANYF 1 "register_operand" " f")
  (match_operand:ANYF 2 "register_operand" " f")
  (match_operand:ANYF 3 "register_operand" " f")))]
-  "TARGET_HARD_FLOAT"
+  "TARGET_HARD_FLOAT || TARGET_ZFINX"
   "fmadd.\t%0,%1,%2,%3"
   [(set_attr "type" "fmadd")
(set_attr "mode" "")])
@@ -1097

[PATCH v3 3/3] RISC-V: Limit regs use for z[f/d]inx extension.

2022-05-23 Thread jiawei
From: Jia-Wei Chen 

Limit zfinx ABI support to 'ilp32', 'ilp32e' and 'lp64' only.

Use GPRs instead of FPRs when 'zfinx' is enabled. Only use even registers in RV32 when 
'zdinx' is enabled.

gcc/ChangeLog:

* config/riscv/constraints.md (TARGET_HARD_FLOAT ? FP_REGS :
 ((TARGET_ZFINX || TARGET_ZDINX) ? GR_REGS : NO_REGS)):
  Use gpr when zfinx or zdinx enable.
* config/riscv/riscv.c (riscv_hard_regno_mode_ok): Add TARGET_ZFINX.
(riscv_option_override): Ditto.
(riscv_abi): Add ABI limit for zfinx with ilp32/lp64.

Co-Authored-By: Sinan Lin.
---
 gcc/config/riscv/constraints.md |  4 ++--
 gcc/config/riscv/riscv.cc   | 14 +-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index bafa4188ccb..0b3d55fee19 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -21,8 +21,8 @@
 
 ;; Register constraints
 
-(define_register_constraint "f" "TARGET_HARD_FLOAT ? FP_REGS : NO_REGS"
-  "A floating-point register (if available).")
+(define_register_constraint "f" "TARGET_HARD_FLOAT ? FP_REGS : ((TARGET_ZFINX 
|| TARGET_ZDINX) ? GR_REGS : NO_REGS)"
+  "A floating-point register (if available, reuse GPR as FPR when use zfinx).")
 
 (define_register_constraint "j" "SIBCALL_REGS"
   "@internal")
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index ee756aab694..01deef54480 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -4789,6 +4789,13 @@ riscv_hard_regno_mode_ok (unsigned int regno, 
machine_mode mode)
!= call_used_or_fixed_reg_p (regno + i))
   return false;
 
+  /* Only use even registers in RV32 ZDINX */
+  if (!TARGET_64BIT && TARGET_ZDINX){
+if (GET_MODE_CLASS (mode) == MODE_FLOAT &&
+ GET_MODE_UNIT_SIZE (mode) == GET_MODE_SIZE (DFmode))
+return !(regno & 1);
+  }
+
   return true;
 }
 
@@ -4980,7 +4987,7 @@ riscv_option_override (void)
 error ("%<-mdiv%> requires %<-march%> to subsume the % extension");
 
   /* Likewise floating-point division and square root.  */
-  if (TARGET_HARD_FLOAT && (target_flags_explicit & MASK_FDIV) == 0)
+  if ((TARGET_HARD_FLOAT || TARGET_ZFINX) && (target_flags_explicit & 
MASK_FDIV) == 0)
 target_flags |= MASK_FDIV;
 
   /* Handle -mtune, use -mcpu if -mtune is not given, and use default -mtune
@@ -5026,6 +5033,11 @@ riscv_option_override (void)
   if (TARGET_RVE && riscv_abi != ABI_ILP32E)
 error ("rv32e requires ilp32e ABI");
 
+  // Zfinx require abi ilp32,ilp32e or lp64.
+  if (TARGET_ZFINX && riscv_abi != ABI_ILP32
+  && riscv_abi != ABI_LP64 && riscv_abi != ABI_ILP32E)
+error ("z*inx requires ABI ilp32, ilp32e or lp64");
+
   /* We do not yet support ILP32 on RV64.  */
   if (BITS_PER_WORD != POINTER_SIZE)
 error ("ABI requires %<-march=rv%d%>", POINTER_SIZE);
-- 
2.25.1



[PATCH v3 1/3] RISC-V: Minimal support of z[f/d]inx extension.

2022-05-23 Thread jiawei
From: Jia-Wei Chen 

Minimal support of zfinx extension, include 'zfinx' and 'zdinx'
corresponding to 'f' and 'd', the 'zdinx' will imply 'zfinx'
same as 'd' imply 'f'.

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc: Add z[f/d]inx extension info.
* config/riscv/arch-canonicalize: Add imply info.
* config/riscv/riscv-opts.h (MASK_ZFINX): New.
(MASK_ZDINX): Ditto.
(TARGET_ZFINX): Ditto.
(TARGET_ZDINX): Ditto.
* config/riscv/riscv.opt: New.
  
Co-Authored-By: Sinan Lin
---
 gcc/common/config/riscv/riscv-common.cc | 9 +
 gcc/config/riscv/arch-canonicalize  | 3 +++
 gcc/config/riscv/riscv-opts.h   | 6 ++
 gcc/config/riscv/riscv.opt  | 3 +++
 4 files changed, 21 insertions(+)

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index 1501242e296..124bccb23ce 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -50,6 +50,9 @@ static const riscv_implied_info_t riscv_implied_info[] =
   {"d", "f"},
   {"f", "zicsr"},
   {"d", "zicsr"},
+  {"zdinx", "zfinx"},
+  {"zfinx", "zicsr"},
+
   {"zk", "zkn"},
   {"zk", "zkr"},
   {"zk", "zkt"},
@@ -154,6 +157,9 @@ static const struct riscv_ext_version 
riscv_ext_version_table[] =
   {"zbc", ISA_SPEC_CLASS_NONE, 1, 0},
   {"zbs", ISA_SPEC_CLASS_NONE, 1, 0},
 
+  {"zfinx", ISA_SPEC_CLASS_NONE, 1, 0},
+  {"zdinx", ISA_SPEC_CLASS_NONE, 1, 0},
+
   {"zbkb",  ISA_SPEC_CLASS_NONE, 1, 0},
   {"zbkc",  ISA_SPEC_CLASS_NONE, 1, 0},
   {"zbkx",  ISA_SPEC_CLASS_NONE, 1, 0},
@@ -1099,6 +1105,9 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
   {"zbc",&gcc_options::x_riscv_zb_subext, MASK_ZBC},
   {"zbs",&gcc_options::x_riscv_zb_subext, MASK_ZBS},
 
+  {"zfinx",&gcc_options::x_riscv_zf_subext, MASK_ZFINX},
+  {"zdinx",&gcc_options::x_riscv_zf_subext, MASK_ZDINX},
+
   {"zbkb",   &gcc_options::x_riscv_zk_subext, MASK_ZBKB},
   {"zbkc",   &gcc_options::x_riscv_zk_subext, MASK_ZBKC},
   {"zbkx",   &gcc_options::x_riscv_zk_subext, MASK_ZBKX},
diff --git a/gcc/config/riscv/arch-canonicalize 
b/gcc/config/riscv/arch-canonicalize
index 41bab69193c..e4cfae40b8a 100755
--- a/gcc/config/riscv/arch-canonicalize
+++ b/gcc/config/riscv/arch-canonicalize
@@ -41,6 +41,9 @@ LONG_EXT_PREFIXES = ['z', 's', 'h', 'x']
 IMPLIED_EXT = {
   "d" : ["f", "zicsr"],
   "f" : ["zicsr"],
+  "zdinx" : ["zfinx", "zicsr"],
+  "zfinx" : ["zicsr"],
+
   "zk" : ["zkn", "zkr", "zkt"],
   "zkn" : ["zbkb", "zbkc", "zbkx", "zkne", "zknd", "zknh"],
   "zks" : ["zbkb", "zbkc", "zbkx", "zksed", "zksh"],
diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
index 15bb5e76854..4faf62616d3 100644
--- a/gcc/config/riscv/riscv-opts.h
+++ b/gcc/config/riscv/riscv-opts.h
@@ -83,6 +83,12 @@ enum stack_protector_guard {
 #define TARGET_ZBC((riscv_zb_subext & MASK_ZBC) != 0)
 #define TARGET_ZBS((riscv_zb_subext & MASK_ZBS) != 0)
 
+#define MASK_ZFINX  (1 << 0)
+#define MASK_ZDINX  (1 << 0)
+
+#define TARGET_ZFINX((riscv_zf_subext & MASK_ZFINX) != 0)
+#define TARGET_ZDINX((riscv_zf_subext & MASK_ZDINX) != 0)
+
 #define MASK_ZBKB (1 << 0)
 #define MASK_ZBKC (1 << 1)
 #define MASK_ZBKX (1 << 2)
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 84c8cf5a2de..18fd11e3a51 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -200,6 +200,9 @@ int riscv_zi_subext
 TargetVariable
 int riscv_zb_subext
 
+TargetVariable
+int riscv_zf_subext
+
 TargetVariable
 int riscv_zk_subext
 
-- 
2.25.1



[PATCH v3 0/3] RISC-V: Support z[f/d]inx extension

2022-05-23 Thread jiawei
From: Jia-Wei Chen 

Zfinx extension[1] had already finished public review. Here is the 
implementation patch set that reuse floating point pattern and ban
the use of fpr when use zfinx as a target.

Current work can be found in the following links; we will keep updating zhinx
and zhinxmin soon after zfh/zfhmin is implemented in gcc.
  https://github.com/pz9115/riscv-gcc/tree/zfinx-rebase
  https://github.com/pz9115/riscv-binutils-gdb/tree/zfinx-rebase

For test you can use qemu or spike that support zfinx extension, the
qemu will go upstream soon and spike is still in review:
  https://github.com/plctlab/plct-qemu/tree/plct-zfinx-dev
  https://github.com/plctlab/plct-spike/tree/plct-upstream-zfinx

Thanks for Tariq Kurd, Kito Cheng, Jim Willson, 
Jeremy Bennett helped us a lot with this work.

[1] https://github.com/riscv/riscv-zfinx/blob/main/zfinx-1.0.0-rc.pdf

Version log:

v2: As Kito Cheng's comment, add Changelog part in patches, update imply 
info in riscv-common.c, remove useless check and update annotation in 
riscv.c.

v3: Update with new isa-spec version 20191213, make zfinx imply zicsr as
default, fix the lack of fcsr use in zfinx.

jiawei (3):
  RISC-V: Minimal support of zfinx extension.
  RISC-V: Target support for zfinx extension.
  RISC-V: Limit regs use  for zfinx extension.

 gcc/common/config/riscv/riscv-common.cc |  9 
 gcc/config/riscv/arch-canonicalize  |  3 ++
 gcc/config/riscv/constraints.md |  4 +-
 gcc/config/riscv/riscv-builtins.cc  |  4 +-
 gcc/config/riscv/riscv-c.cc |  2 +-
 gcc/config/riscv/riscv-opts.h   |  6 +++
 gcc/config/riscv/riscv.cc   | 14 -
 gcc/config/riscv/riscv.md   | 72 -
 gcc/config/riscv/riscv.opt  |  3 ++
 9 files changed, 75 insertions(+), 42 deletions(-)

-- 
2.25.1



Re: [x86 PING] Peephole pand;pxor into pandn

2022-05-23 Thread Uros Bizjak via Gcc-patches
On Mon, May 23, 2022 at 10:59 AM Roger Sayle  wrote:
>
>
> Hi Uros,
>
> Thanks for the speedy review.  The point of this patch is that (with
> pending changes to STV) the pand;pxor sequence isn't created until
> after combine, and hence doesn't/won't get caught by any of the
> current pre-reload/combine splitters.

IMO this happens due to inconsistencies between integer and vector
set, where integer andn is absent without BMI. However, we don't
re-run the combine after reload, and I don't think it is worth to
reimplement it via peephole2 patterns. Please note that AVX allows
many more combinations that are not caught by your patch, and
considering that combine already does the transformation, I don't see
a compelling reason for this very specialized peephole2.

Let's keep the patch shelved until a testcase shows the benefits of the patch.

Uros.

>
>
> > -Original Message-
> > From: Uros Bizjak 
> > Sent: 23 May 2022 09:51
> > To: Roger Sayle 
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: Re: [x86 PING] Peephole pand;pxor into pandn
> >
> > On Mon, May 23, 2022 at 10:44 AM Roger Sayle
> >  wrote:
> > >
> > >
> > > This is a ping of a patch from April (a dependency of another stage1 
> > > patch):
> > > https://gcc.gnu.org/pipermail/gcc-patches/2022-April/593123.html
> > >
> > > This patch has been refreshed/retested against gcc 13 trunk on
> > > x86_64-pc-linux-gnu with make bootstrap and make -k check, both with
> > > and without --target_board=unix{-m32}, with no new failures.
> > > Ok for mainline?
> >
> > I think this should be handled in a pre-reload splitter (or perhaps combine
> > splitter). We have so many variants of SSE/AVX logic instructions that the
> > transform after reload barely makes sense (please see the number of regno
> > checks in the proposed patch).
> >
> > Uros.
> >
> > > 2022-05-23  Roger Sayle  
> > >
> > > gcc/ChangeLog
> > > * config/i386/sse.md (peephole2): Convert suitable pand followed
> > > by pxor into pandn, i.e. (X&Y)^X into X & ~Y.
> > >
> > > Many thanks in advance,
> > > Roger
> > > --
> > >
>


Re: [Patch] OpenMP: Handle descriptors in target's firstprivate [PR104949]

2022-05-23 Thread Tobias Burnus

Hi Jakub,

On 19.05.22 15:59, Jakub Jelinek wrote:

I guess ok like this for now, but handling the further deep copy cases
(allocatable members of derived types) wouldn't be very nice, I think
generally we need a target hook to handle the stuff that is target specific
and express it say in further clauses or their modified copies (perhaps some
flags on them, or new clause types) which will allow the pointer attachments
to be done.


I concur – although, the question is how to do it best – i.e. what is statically
known vs. only known at run time. The current patch requires some in-depth
knowledge both of the internal structure (array size) and also the handling of
what is passed to libgomp. But it can be done statically.

Thus, I think it is okay to handle this case of firstprivate differently from:

For the Fortran patch regarding deep-copying of derived types, it is different:
it is a complicated deeply nested structure and with polymorphic types or
recursive types – or array derived types with allocatable derived components.

In this case, omp-low.cc only calls a three lang hooks and defers most to the
language hooks. Namely: Has deep copying, how many (run-time determined) - do
a malloc - and last hook: fill the three arrays (data, sizes, kinds).

I think that patch can be extended to handle deep firstprivate as well. As the
FE lang hook code controls the data/sizes/kinds array handling, it can also
handle the firstprivate bits.

(I need at some point to cleanup the patch and submit it piecewise, starting
with some generic Fortran patches.)

(Cross ref: See omp-low.cc changes at
https://gcc.gnu.org/pipermail/gcc-patches/2022-April/593562.html)


But eventually it would be nice to have a target hook that emulates the
cross-device copy construction.  And we probably need also something to
emulate destruction...


Yes – we also need something for map as OpenMP 5.x (x=1 or 2, I forgot; to be
extended in 6.0) permits more with regards to dynamic types and calling virtual
functions. (Likewise, but not relevant to mapping: Also dereferencing function
pointers.)

The submitted patch was now committed as
https://gcc.gnu.org/r13-706-g49d1a2f91325fa8cc011149e27e5093a988b3a49.

Thanks for the comments!

Tobias

-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


RE: [x86 PING] Peephole pand;pxor into pandn

2022-05-23 Thread Roger Sayle


Hi Uros,

Thanks for the speedy review.  The point of this patch is that (with
pending changes to STV) the pand;pxor sequence isn't created until
after combine, and hence doesn't/won't get caught by any of the
current pre-reload/combine splitters.


> -Original Message-
> From: Uros Bizjak 
> Sent: 23 May 2022 09:51
> To: Roger Sayle 
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [x86 PING] Peephole pand;pxor into pandn
> 
> On Mon, May 23, 2022 at 10:44 AM Roger Sayle
>  wrote:
> >
> >
> > This is a ping of a patch from April (a dependency of another stage1 patch):
> > https://gcc.gnu.org/pipermail/gcc-patches/2022-April/593123.html
> >
> > This patch has been refreshed/retested against gcc 13 trunk on
> > x86_64-pc-linux-gnu with make bootstrap and make -k check, both with
> > and without --target_board=unix{-m32}, with no new failures.
> > Ok for mainline?
> 
> I think this should be handled in a pre-reload splitter (or perhaps combine
> splitter). We have so many variants of SSE/AVX logic instructions that the
> transform after reload barely makes sense (please see the number of regno
> checks in the proposed patch).
> 
> Uros.
> 
> > 2022-05-23  Roger Sayle  
> >
> > gcc/ChangeLog
> > * config/i386/sse.md (peephole2): Convert suitable pand followed
> > by pxor into pandn, i.e. (X&Y)^X into X & ~Y.
> >
> > Many thanks in advance,
> > Roger
> > --
> >



Re: [x86 PING] Peephole pand;pxor into pandn

2022-05-23 Thread Uros Bizjak via Gcc-patches
On Mon, May 23, 2022 at 10:44 AM Roger Sayle  wrote:
>
>
> This is a ping of a patch from April (a dependency of another stage1 patch):
> https://gcc.gnu.org/pipermail/gcc-patches/2022-April/593123.html
>
> This patch has been refreshed/retested against gcc 13 trunk on
> x86_64-pc-linux-gnu with make bootstrap and make -k check,
> both with and without --target_board=unix{-m32}, with no new failures.
> Ok for mainline?

I think this should be handled in a pre-reload splitter (or perhaps
combine splitter). We have so many variants of SSE/AVX logic
instructions that the transform after reload barely makes sense
(please see the number of regno checks in the proposed patch).

Uros.

> 2022-05-23  Roger Sayle  
>
> gcc/ChangeLog
> * config/i386/sse.md (peephole2): Convert suitable pand followed
> by pxor into pandn, i.e. (X&Y)^X into X & ~Y.
>
> Many thanks in advance,
> Roger
> --
>


Adjust affected targets for vec_perm_const hook

2022-05-23 Thread Prathamesh Kulkarni via Gcc-patches
Hi Richard,
The attached patch addresses formatting nits for affected targets.
Tested with make all-gcc stage1 (except for gcn).
Sorry if this sounds like a naive question, but what target triplet
should I use to build gcn port ?

Thanks,
Prathamesh
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f4d2a800f39..e6a24a0f9e1 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24145,9 +24145,13 @@ aarch64_expand_vec_perm_const_1 (struct 
expand_vec_perm_d *d)
 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
 
 static bool
-aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
- rtx op1, const vec_perm_indices &sel)
+aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
+ rtx target, rtx op0, rtx op1,
+ const vec_perm_indices &sel)
 {
+  if (vmode != op_mode)
+return false;
+
   struct expand_vec_perm_d d;
 
   /* Check whether the mask can be applied to a single vector.  */
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 2afe0445ed5..70c2d50f0cc 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -31813,9 +31813,13 @@ arm_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
 
 static bool
-arm_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1,
+arm_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
+ rtx target, rtx op0, rtx op1,
  const vec_perm_indices &sel)
 {
+  if (vmode != op_mode)
+return false;
+
   struct expand_vec_perm_d d;
   int i, nelt, which;
 
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index e2e9335ad75..4642d5d55bf 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -4131,10 +4131,13 @@ gcn_make_vec_perm_address (unsigned int *perm)
permutations.  */
 
 static bool
-gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
- rtx src0, rtx src1,
+gcn_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
+ rtx dst, rtx src0, rtx src1,
  const vec_perm_indices & sel)
 {
+  if (vmode != op_mode)
+return false;
+
   unsigned int nelt = GET_MODE_NUNITS (vmode);
 
   gcc_assert (VECTOR_MODE_P (vmode));
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 806e1f5aaa3..adf68547119 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22060,9 +22060,13 @@ canonicalize_perm (struct expand_vec_perm_d *d)
 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
 
 bool
-ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
-  rtx op1, const vec_perm_indices &sel)
+ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
+  rtx target, rtx op0, rtx op1,
+  const vec_perm_indices &sel)
 {
+  if (vmode != op_mode)
+return false;
+
   struct expand_vec_perm_d d;
   unsigned char perm[MAX_VECT_LEN];
   unsigned int i, nelt, which;
diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h
index 9d320c29552..6c650196c9c 100644
--- a/gcc/config/i386/i386-expand.h
+++ b/gcc/config/i386/i386-expand.h
@@ -48,8 +48,9 @@ rtx gen_push (rtx arg);
 rtx gen_pop (rtx arg);
 rtx ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
 machine_mode mode, int ignore);
-bool ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
-   rtx op1, const vec_perm_indices &sel);
+bool ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
+   rtx target, rtx op0, rtx op1,
+   const vec_perm_indices &sel);
 bool ix86_notrack_prefixed_insn_p (rtx_insn *);
 machine_mode ix86_split_reduction (machine_mode mode);
 void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0,
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 175ce013e5d..50112a8efee 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -15836,7 +15836,7 @@
  sel[7] = 15;
}
  vec_perm_indices indices (sel, 2, 8);
- bool ok = targetm.vectorize.vec_perm_const (V8SImode, target,
+ bool ok = targetm.vectorize.vec_perm_const (V8SImode, V8SImode, 
target,
  arg0, arg1, indices);
  gcc_assert (ok);
  emit_move_insn (operands[0],
@@ -24569,7 +24569,7 @@
  sel[3] = 7;
}
  vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
- bool ok = targetm.vectorize.vec_perm_const (V4SImode, target,
 bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode, target,

Re: [0/9] [middle-end] Add param to vec_perm_const hook to specify mode of input operand

2022-05-23 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 18 May 2022 at 17:27, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi,
> > The attached patch adds another parameter machine_mode op_mode to 
> > vec_perm_const
> > hook to specify mode of input operands. The motivation for doing this
> > is PR96463,
> > where we create vec_perm_expr of the form:
> > lhs = vec_perm_expr
> > where lhs and rhs have different vector types but same element type
> > (lhs is SVE and rhs is corresponding advsimd vector).
> >
> > It seems the following targets were affected: aarch64, i386, arm, ia64,
> > mips, rs6000, s390, sparc, gcn.
> >
> > Bootstrapped+tested on x86_64-linux-gnu, aarch64-linux-gnu.
> > For other targets, I did make all-gcc stage1, which seems to build OK.
> >
> > Thanks,
> > Prathamesh
> >
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index c5006afc00d..31ff6ef3f92 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -6088,7 +6088,7 @@ for the given scalar type @var{type}.  
> > @var{is_packed} is false if the scalar
> >  access using @var{type} is known to be naturally aligned.
> >  @end deftypefn
> >
> > -@deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST 
> > (machine_mode @var{mode}, rtx @var{output}, rtx @var{in0}, rtx @var{in1}, 
> > const vec_perm_indices @var{&sel})
> > +@deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST 
> > (machine_mode @var{mode}, machine_mode @var{op_mode}, rtx @var{output}, rtx 
> > @var{in0}, rtx @var{in1}, const vec_perm_indices @var{&sel})
> >  This hook is used to test whether the target can permute up to two
> >  vectors of mode @var{mode} using the permutation vector @code{sel}, and
> >  also to emit such a permutation.  In the former case @var{in0}, @var{in1}
>
> Like Andre says, the documentation should describe op_mode (and mode).
>
> > diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> > index 68dc679cc6a..aef9d4c5d28 100644
> > --- a/gcc/optabs-query.cc
> > +++ b/gcc/optabs-query.cc
> > @@ -417,8 +417,8 @@ can_vec_perm_var_p (machine_mode mode)
> > with here.  */
> >
> >  bool
> > -can_vec_perm_const_p (machine_mode mode, const vec_perm_indices &sel,
> > -   bool allow_variable_p)
> > +can_vec_perm_const_p (machine_mode mode, machine_mode op_mode,
> > +   const vec_perm_indices &sel, bool allow_variable_p)
> >  {
>
> The function comment should describe the new parameter.
>
> >/* If the target doesn't implement a vector mode for the vector type,
> >   then no operations are supported.  */
> > @@ -448,7 +448,7 @@ can_vec_perm_const_p (machine_mode mode, const 
> > vec_perm_indices &sel,
> >
> >if (targetm.vectorize.vec_perm_const != NULL)
> >  {
> > -  if (targetm.vectorize.vec_perm_const (mode, NULL_RTX, NULL_RTX,
> > +  if (targetm.vectorize.vec_perm_const (mode, op_mode, NULL_RTX, 
> > NULL_RTX,
> >   NULL_RTX, sel))
> >   return true;
> >
> > @@ -462,6 +462,13 @@ can_vec_perm_const_p (machine_mode mode, const 
> > vec_perm_indices &sel,
> >return false;
> >  }
> >
> > +bool
> > +can_vec_perm_const_p (machine_mode mode, const vec_perm_indices &sel,
> > +   bool allow_variable_p)
> > +{
> > +  return can_vec_perm_const_p (mode, mode, sel, allow_variable_p);
> > +}
> > +
>
> I can understand why you went for this, but now that we've opened
> the door to mismatched modes, I think it would be better if all callers
> specified the input mode explicitly.
>
> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> > index 3d8fa3abdfe..55f10c41789 100644
> > --- a/gcc/optabs.cc
> > +++ b/gcc/optabs.cc
> > @@ -6250,7 +6250,9 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx 
> > v1,
> >if (single_arg_p)
> >   v1 = v0;
> >
> > -  if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, indices))
> > +  gcc_checking_assert (GET_MODE (v0) == GET_MODE (v1));
> > +  machine_mode op_mode = GET_MODE (v0);
> > +  if (targetm.vectorize.vec_perm_const (mode, op_mode, target, v0, v1, 
> > indices))
> >   return target;
> >  }
> >
>
> (FWIW, I agree the assert is worth having.)
Hi,
I updated the patch with doc and adjusted callers to explicitly pass op_mode.
Bootstrapped + tested on x86_64-linux-gnu and aarch64-linux-gnu.
Does it look OK to commit ?

Thanks,
Prathamesh

>
> Thanks,
> Richard
>
> > @@ -6264,7 +6266,7 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx 
> > v1,
> >v0_qi = gen_lowpart (qimode, v0);
> >v1_qi = gen_lowpart (qimode, v1);
> >if (targetm.vectorize.vec_perm_const != NULL
> > -   && targetm.vectorize.vec_perm_const (qimode, target_qi, v0_qi,
> > +   && targetm.vectorize.vec_perm_const (qimode, qimode, target_qi, 
> > v0_qi,
> >  v1_qi, qimode_indices))
> >   return gen_lowpart (mode, target_qi);
> >  }
> > diff --git a/gcc/target.def b/gcc/target.def
> > index d85adf36a39..2713c31dc3f 100644
> 

[x86 PING] Peephole pand;pxor into pandn

2022-05-23 Thread Roger Sayle

This is a ping of a patch from April (a dependency of another stage1 patch):
https://gcc.gnu.org/pipermail/gcc-patches/2022-April/593123.html

This patch has been refreshed/retested against gcc 13 trunk on
x86_64-pc-linux-gnu with make bootstrap and make -k check,
both with and without --target_board=unix{-m32}, with no new failures.
Ok for mainline?

2022-05-23  Roger Sayle  

gcc/ChangeLog
* config/i386/sse.md (peephole2): Convert suitable pand followed
by pxor into pandn, i.e. (X&Y)^X into X & ~Y.

Many thanks in advance,
Roger
--

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 191371b..4203fe0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17021,6 +17021,44 @@
(match_dup 2)))]
   "operands[3] = gen_reg_rtx (mode);")
 
+;; Combine pand;pxor into pandn.  (X&Y)^X -> X & ~Y.
+(define_peephole2
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (and:VMOVE (match_operand:VMOVE 1 "register_operand")
+  (match_operand:VMOVE 2 "register_operand")))
+   (set (match_operand:VMOVE 3 "register_operand")
+   (xor:VMOVE (match_operand:VMOVE 4 "register_operand")
+  (match_operand:VMOVE 5 "register_operand")))]
+  "TARGET_SSE
+   && REGNO (operands[1]) != REGNO (operands[2])
+   && REGNO (operands[4]) != REGNO (operands[5])
+   && (REGNO (operands[0]) == REGNO (operands[3])
+   || peep2_reg_dead_p (2, operands[0]))"
+  [(set (match_dup 3)
+   (and:VMOVE (not:VMOVE (match_dup 6)) (match_dup 7)))]
+{
+  if (REGNO (operands[0]) != REGNO (operands[1])
+  && ((REGNO (operands[4]) == REGNO (operands[0])
+  && REGNO (operands[5]) == REGNO (operands[1]))
+ || (REGNO (operands[4]) == REGNO (operands[1])
+ && REGNO (operands[5]) == REGNO (operands[0]))))
+{
+  operands[6] = operands[2];
+  operands[7] = operands[1];
+}
+  else if (REGNO (operands[0]) != REGNO (operands[2])
+  && ((REGNO (operands[4]) == REGNO (operands[0])
+   && REGNO (operands[5]) == REGNO (operands[2]))
+  || (REGNO (operands[4]) == REGNO (operands[2])
+  && REGNO (operands[5]) == REGNO (operands[0]))))
+{
+  operands[6] = operands[1];
+  operands[7] = operands[2];
+}
+  else
+FAIL;
+})
+
 (define_insn "*andnot3_mask"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
(vec_merge:VI48_AVX512VL


Re: [PATCH] Minor improvement to genpreds.cc

2022-05-23 Thread Richard Biener via Gcc-patches
On Sun, May 22, 2022 at 11:03 AM Roger Sayle  wrote:
>
>
> This simple patch implements Richard Biener's suggestion in comment #6
> of PR tree-optimization/52171 (from February 2013) that the insn-preds
> code generated by genpreds can avoid using strncmp when matching constant
> strings of length one.
>
> The effect of this patch is best explained by the diff of insn-preds.cc:
> <   if (!strncmp (str + 1, "g", 1))
> ---
> >   if (str[1] == 'g')
> 3104c3104
> <   if (!strncmp (str + 1, "m", 1))
> ---
> >   if (str[1] == 'm')
> 3106c3106
> <   if (!strncmp (str + 1, "c", 1))
> ---
> >   if (str[1] == 'c')
> ...
>
> The equivalent optimization is performed by GCC (but perhaps not by the
> host compiler), but generating simpler/smaller code may encourage further
> optimizations (such as use of a switch statement).
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures.  Ok for mainline?

OK.

Richard.

>
> 2022-05-22  Roger Sayle  
>
> gcc/ChangeLog
> * genpreds.cc (write_lookup_constraint_1): Avoid generating a call
> to strncmp for strings of length one.
>
> Roger
> --
>


Re: [x86 PATCH] PR tree-optimization/105668: Provide vcond_mask_v1tiv1ti pattern.

2022-05-23 Thread Uros Bizjak via Gcc-patches
On Mon, May 23, 2022 at 10:00 AM Uros Bizjak  wrote:
>
> On Mon, May 23, 2022 at 9:16 AM Roger Sayle  
> wrote:
> >
> >
> > This patch is an alternate/supplementary fix to PR tree-optimization/105668
> > that provides a vcond_mask_v1titi optab/define_expand to the i386 backend.
> > An undocumented feature/bug of GCC's vectorization is that any target that
> provides a vec_cmpeq<mode> has to also provide a matching
> vcond_mask_<mode><mode>.  This backend patch preserves the status quo,
> > rather than fixes the underlying problem.
>
> IIRC, I also hit this issue a while ago. I was under impression it was
> fixed in the meantime, but looks I was wrong.
>
> > One aspect of this clean-up is that ix86_expand_sse_movcc provides
> > fallback implementations using pand/pandn/por that effectively make
> > V2DImode and V1TImode vcond_mask available on any TARGET_SSE2, not
> > just TARGET_SSE4_2.  This allows a simplification as V2DI mode can
> > be handled by using a VI_128 mode iterator instead of a VI124_128
> > mode iterator, and instead this define_expand is effectively renamed
> > to provide a V1TImode vcond_mask expander (as V1TI isn't in VI_128).
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32} with
> > no new failures.  The new test case is identical to the middle-end patch,
> > so if both patches are approved, this'll be committed only once.
> > Ok for mainline?
>
> OK.
>
> Thanks,
> Uros.
>
> >
> >
> > 2022-05-23  Roger Sayle  
> >
> > gcc/ChangeLog
> > PR tree-optimization/105668
> > * config/i386/i386-expand.cc (ix86_expand_sse_movcc): Support
> > V1TImode, just like V2DImode.
> > * config/i386/sse.md (vcond_mask_<mode><sseintvecmodelower>):
> > Use VI_128 mode iterator instead of VI124_128 to include V2DI.
> > (vcond_mask_v2div2di): Delete.
> > (vcond_mask_v1tiv1ti): New define_expand.
> >
> > gcc/testsuite/ChangeLog
> > PR tree-optimization/105668
> > * gcc.target/i386/pr105668.c: New test case.


diff --git a/gcc/testsuite/gcc.target/i386/pr105668.c
b/gcc/testsuite/gcc.target/i386/pr105668.c
new file mode 100644
index 000..359c2b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105668.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */

Please use { target int128 } here.

+/* { dg-options "-O -ftracer -fno-tree-fre" } */
+
+typedef __int128 __attribute__((__vector_size__ (16))) V;

> >
> > Roger
> > --
> >


Re: [ping] Re: [RFA] gcc.misc-tests/outputs.exp: Use link test to check for -gsplit-dwarf support

2022-05-23 Thread Richard Sandiford via Gcc-patches
Joel Brobecker via Gcc-patches  writes:
> Hello,
>
> Gentle ping on this patch.
>
> Thank you!
>
> On Mon, Apr 25, 2022 at 09:04:51AM -0700, Joel Brobecker wrote:
>> Hello,
>> 
>> We have noticed that, when running the GCC testsuite on AArch64
>> RTEMS 6, we have about 150 tests failing due to a link failure.
>> When investigating, we found that all the tests were failing
>> due to the use of -gsplit-dwarf.
>> 
>> On this platform, using -gsplit-dwarf currently causes an error
>> during the link:
>> 
>> | /[...]/ld: a.out section `.unexpected_sections' will not fit
>> |in region `UNEXPECTED_SECTIONS'
>> | /[...]/ld: region `UNEXPECTED_SECTIONS' overflowed by 56 bytes
>> 
>> The error is a bit cryptic, but the source of the issue is that
>> the linker does not currently support the sections generated
>> by -gsplit-dwarf (.debug_gnu_pubnames, .debug_gnu_pubtypes).
>> This means that the -gsplit-dwarf feature itself really isn't
>> supported on this platform, at least for the moment.
>> 
>> This commit enhances the -gsplit-dwarf support check to be
>> a compile-and-link check, rather than just a compile check.
>> This allows it to properly detect that this feature isn't
>> supported on platforms such as AArch64 RTEMS where the compilation
>> works, but not the link.
>> 
>> Tested on aarch64-rtems, where a little over 150 tests are now
>> passing, instead of failing, as well as on x86_64-linux, where
>> the results are identical, and where the .log file was also manually
>> inspected to make sure that the use of the -gsplit-dwarf option
>> was preserved.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>> * gcc.misc-tests/outputs.exp: Make the -gsplit-dwarf test
>> a compile-and-link test rather than a compile-only test.

OK, thanks.

Richard

>> OK to push on master?
>> 
>> Thank you,
>> -- 
>> Joel
>> 
>> ---
>>  gcc/testsuite/gcc.misc-tests/outputs.exp | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>> 
>> diff --git a/gcc/testsuite/gcc.misc-tests/outputs.exp 
>> b/gcc/testsuite/gcc.misc-tests/outputs.exp
>> index bc1fbe4eb7f..afae735e92d 100644
>> --- a/gcc/testsuite/gcc.misc-tests/outputs.exp
>> +++ b/gcc/testsuite/gcc.misc-tests/outputs.exp
>> @@ -36,8 +36,8 @@ gcc_parallel_test_enable 0
>>  # having to deal with .dSYM directories, as long as -gsplit-dwarf is
>>  # not supported on platforms that use .dSYM directories.
>>  set gsplit_dwarf "-g -gsplit-dwarf"
>> -if ![check_no_compiler_messages gsplitdwarf object {
>> -void foo (void) { }
>> +if ![check_no_compiler_messages gsplitdwarf executable {
>> +int main (void) { return 0; }
>>  } "$gsplit_dwarf"] {
>>  set gsplit_dwarf ""
>>  }
>> -- 
>> 2.32.0
>> 


Re: [PATCH] testsuite: mallign: Handle word size of 1 byte

2022-05-23 Thread Richard Sandiford via Gcc-patches
Dimitar Dimitrov  writes:
> On Sun, May 08, 2022 at 10:31:04AM +0300, Dimitar Dimitrov wrote:
>> This patch fixes a spurious warning for pru-unknown-elf target:
>>   gcc/testsuite/gcc.dg/mallign.c:12:27: warning: ignoring return value of 
>> 'malloc' declared with attribute 'warn_unused_result' [-Wunused-result]
>> 
>> For 8-bit targets the resulting mask ignores all bits in the value
>> returned by malloc.  Fix by first checking the target word size.
>> 
>> Sanity checked that there are no new failures on x86_64-pc-linux-gnu.
>> 
>> Ok for trunk?
>
> Ping. Does this count as an obvious fix?
>
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>  * gcc.dg/mallign.c: Skip check if sizeof(word)==1.

OK, thanks.  Sorry for the slow review.

Richard

>> 
>> Signed-off-by: Dimitar Dimitrov 
>> ---
>>  gcc/testsuite/gcc.dg/mallign.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/gcc/testsuite/gcc.dg/mallign.c b/gcc/testsuite/gcc.dg/mallign.c
>> index 349cdaa343f..9a18a00c3b0 100644
>> --- a/gcc/testsuite/gcc.dg/mallign.c
>> +++ b/gcc/testsuite/gcc.dg/mallign.c
>> @@ -9,7 +9,7 @@ typedef int word __attribute__((mode(word)));
>>  
>>  int main()
>>  {
>> -if ((__UINTPTR_TYPE__)malloc (1) & (sizeof(word)-1))
>> +if ((sizeof(word)>1) && ((__UINTPTR_TYPE__)malloc (1) & 
>> (sizeof(word)-1)))
>>  abort ();
>>  return 0;
>>  }   
>>
>> -- 
>> 2.35.1
>> 


Re: [x86 PATCH] PR tree-optimization/105668: Provide vcond_mask_v1tiv1ti pattern.

2022-05-23 Thread Uros Bizjak via Gcc-patches
On Mon, May 23, 2022 at 9:16 AM Roger Sayle  wrote:
>
>
> This patch is an alternate/supplementary fix to PR tree-optimization/105668
> that provides a vcond_mask_v1titi optab/define_expand to the i386 backend.
> An undocumented feature/bug of GCC's vectorization is that any target that
> provides a vec_cmpeq<mode> has to also provide a matching
> vcond_mask_<mode><mode>.  This backend patch preserves the status quo,
> rather than fixes the underlying problem.

IIRC, I also hit this issue a while ago. I was under impression it was
fixed in the meantime, but looks I was wrong.

> One aspect of this clean-up is that ix86_expand_sse_movcc provides
> fallback implementations using pand/pandn/por that effectively make
> V2DImode and V1TImode vcond_mask available on any TARGET_SSE2, not
> just TARGET_SSE4_2.  This allows a simplification as V2DI mode can
> be handled by using a VI_128 mode iterator instead of a VI124_128
> mode iterator, and instead this define_expand is effectively renamed
> to provide a V1TImode vcond_mask expander (as V1TI isn't in VI_128).
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32} with
> no new failures.  The new test case is identical to the middle-end patch,
> so if both patches are approved, this'll be committed only once.
> Ok for mainline?

OK.

Thanks,
Uros.

>
>
> 2022-05-23  Roger Sayle  
>
> gcc/ChangeLog
> PR tree-optimization/105668
> * config/i386/i386-expand.cc (ix86_expand_sse_movcc): Support
> V1TImode, just like V2DImode.
> * config/i386/sse.md (vcond_mask_<mode><sseintvecmodelower>):
> Use VI_128 mode iterator instead of VI124_128 to include V2DI.
> (vcond_mask_v2div2di): Delete.
> (vcond_mask_v1tiv1ti): New define_expand.
>
> gcc/testsuite/ChangeLog
> PR tree-optimization/105668
> * gcc.target/i386/pr105668.c: New test case.
>
>
> Roger
> --
>


Re: [PATCH] Use more ARRAY_SIZE.

2022-05-23 Thread Iain Buclaw via Gcc-patches
Excerpts from Martin Liška's message of Mai 11, 2022 10:17 am:
> On 5/9/22 14:03, Richard Biener wrote:
>> On Thu, May 5, 2022 at 4:30 PM Martin Liška  wrote:
>>>
>>> On 5/5/22 14:58, Iain Buclaw wrote:
 This D front-end change doesn't look right to me, besides the slight
>>>
>>> Hello.
>>>
>>> Sorry, I've re-read the patch and fixed some places where the macro usage
>>> was wrong.
>>>
>>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>> 
>> The middle-end parts are OK.  I'd say in files where ARRAY_SIZE is already
>> used it's OK to introduce more uses.  Otherwise I defer to the more specific
>> maintainers if they like this or not.
> 
> All right, CCing the following maintainers for other parts:
> 
> - David for JIT and Analyzer
> - Tobias for Fortran part
> - Jason for C-family part
> 

Hi Martin,

When running through contrib/config-list.mk, I noticed that this also
broke the build for the following obsolete targets:

tilegx-linux-gnu
tilegxbe-linux-gnu
tilepro-linux-gnu

---
gcc/config/tilepro/gen-mul-tables.cc: In function ‘void 
find_sequences(ExpressionTree&, ExpressionTreeMap&)’:
gcc/config/tilepro/gen-mul-tables.cc:465:26: error: ‘ARRAY_SIZE’ was not 
declared in this scope
  465 |   for (size_t f = 0; f < ARRAY_SIZE (ops); f++)
  |  ^~
gcc/config/tilepro/gen-mul-tables.cc: In function ‘void 
create_insn_code_compression_table()’:
gcc/config/tilepro/gen-mul-tables.cc:567:26: error: ‘ARRAY_SIZE’ was not 
declared in this scope
  567 |   for (size_t i = 0; i < ARRAY_SIZE (ops); i++)
  |  ^~
---


[x86 PATCH] PR tree-optimization/105668: Provide vcond_mask_v1tiv1ti pattern.

2022-05-23 Thread Roger Sayle

This patch is an alternate/supplementary fix to PR tree-optimization/105668
that provides a vcond_mask_v1titi optab/define_expand to the i386 backend.
An undocumented feature/bug of GCC's vectorization is that any target that
provides a vec_cmpeq<mode> has to also provide a matching
vcond_mask_<mode><mode>.  This backend patch preserves the status quo,
rather than fixes the underlying problem.

One aspect of this clean-up is that ix86_expand_sse_movcc provides
fallback implementations using pand/pandn/por that effectively make
V2DImode and V1TImode vcond_mask available on any TARGET_SSE2, not
just TARGET_SSE4_2.  This allows a simplification as V2DI mode can
be handled by using a VI_128 mode iterator instead of a VI124_128
mode iterator, and instead this define_expand is effectively renamed
to provide a V1TImode vcond_mask expander (as V1TI isn't in VI_128).

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32} with
no new failures.  The new test case is identical to the middle-end patch,
so if both patches are approved, this'll be committed only once.
Ok for mainline?


2022-05-23  Roger Sayle  

gcc/ChangeLog
PR tree-optimization/105668
* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Support
V1TImode, just like V2DImode.
* config/i386/sse.md (vcond_mask_<mode><sseintvecmodelower>):
Use VI_128 mode iterator instead of VI124_128 to include V2DI.
(vcond_mask_v2div2di): Delete.
(vcond_mask_v1tiv1ti): New define_expand.

gcc/testsuite/ChangeLog
PR tree-optimization/105668
* gcc.target/i386/pr105668.c: New test case.


Roger
--

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1460bcc..e3bd661 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4026,6 +4026,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V8HFmode:
 case E_V4SImode:
 case E_V2DImode:
+case E_V1TImode:
   if (TARGET_SSE4_1)
{
  gen = gen_sse4_1_pblendvb;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 191371b..f261ff6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4579,10 +4579,10 @@
 })
 
(define_expand "vcond_mask_<mode><sseintvecmodelower>"
-  [(set (match_operand:VI124_128 0 "register_operand")
-   (vec_merge:VI124_128
- (match_operand:VI124_128 1 "vector_operand")
- (match_operand:VI124_128 2 "nonimm_or_0_operand")
+  [(set (match_operand:VI_128 0 "register_operand")
+   (vec_merge:VI_128
+ (match_operand:VI_128 1 "vector_operand")
+ (match_operand:VI_128 2 "nonimm_or_0_operand")
  (match_operand:<sseintvecmode> 3 "register_operand")))]
   "TARGET_SSE2"
 {
@@ -4591,13 +4591,13 @@
   DONE;
 })
 
-(define_expand "vcond_mask_v2div2di"
-  [(set (match_operand:V2DI 0 "register_operand")
-   (vec_merge:V2DI
- (match_operand:V2DI 1 "vector_operand")
- (match_operand:V2DI 2 "nonimm_or_0_operand")
- (match_operand:V2DI 3 "register_operand")))]
-  "TARGET_SSE4_2"
+(define_expand "vcond_mask_v1tiv1ti"
+  [(set (match_operand:V1TI 0 "register_operand")
+   (vec_merge:V1TI
+ (match_operand:V1TI 1 "vector_operand")
+ (match_operand:V1TI 2 "nonimm_or_0_operand")
+ (match_operand:V1TI 3 "register_operand")))]
+  "TARGET_SSE2"
 {
   ix86_expand_sse_movcc (operands[0], operands[3],
 operands[1], operands[2]);
diff --git a/gcc/testsuite/gcc.target/i386/pr105668.c 
b/gcc/testsuite/gcc.target/i386/pr105668.c
new file mode 100644
index 000..359c2b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105668.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O -ftracer -fno-tree-fre" } */
+
+typedef __int128 __attribute__((__vector_size__ (16))) V;
+
+int i;
+
+V
+foo (_Complex float f)
+{
+  (void) __builtin_atanhf (i);
+  V v = i != (V) { };
+  i ^= f && 8;
+  v %= 5;
+  return v;
+}