[RS6000] gcc/configure typo fix

2020-09-29 Thread Alan Modra via Gcc-patches
Committed as obvious.

* configure.ac (--with-long-double-format): Typo fix.
* configure: Regenerate.

diff --git a/gcc/configure.ac b/gcc/configure.ac
index f5612161dcd..1ad5bbc6935 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -6260,7 +6260,7 @@ supported if the default cpu is power7 or newer])
;;
   esac
   ;;
-  xpowerpc64*-*-linux*:*)
+  powerpc64*-*-linux*:*)
 AC_MSG_ERROR([--with-long-double-format argument should be ibm or ieee])
 with_long_double_format=""
 ;;

-- 
Alan Modra
Australia Development Lab, IBM


Re: [PATCH] rs6000: Use parameterized names for tablejump

2020-09-29 Thread Alan Modra via Gcc-patches
On Wed, Sep 30, 2020 at 12:04:25AM +, Segher Boessenkool wrote:
>   * config/rs6000/rs6000.md (tablejump): Simplify.
>   (tablejumpsi): Merge this ...
>   (tablejumpdi): ... and this ...
>   (@tablejump_normal): ... into this.
>   (tablejumpsi_nospec): Merge this ...
>   (tablejumpdi_nospec): ... and this ...
>   (@tablejump_nospec): ... into this.
>   (*tablejump_internal1): Delete, rename to ...
>   (@tablejump_insn_normal): ... this.
>   (*tablejump_internal1_nospec): Delete, rename to ...
>   (@tablejump_insn_nospec): ... this.

decQuad.o] Error 1
*** stack smashing detected ***: terminated
during RTL pass: expand

I'll commit this as obvious after my regstraps finish.

* config/rs6000/rs6000.md (@tablejump_normal): Don't use
non-existent operands[].
(@tablejump_nospec): Likewise.

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 24ad80993ad..779bfd11237 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -12716,21 +12716,22 @@
(use (match_operand:P 1))]
   "rs6000_speculate_indirect_jumps"
 {
+  rtx off;
   operands[0] = force_reg (SImode, operands[0]);
   if (mode == SImode)
-operands[4] = operands[0];
+off = operands[0];
   else
 {
-  operands[4] = gen_reg_rtx (Pmode);
+  off = gen_reg_rtx (Pmode);
   rtx src = gen_rtx_fmt_e (SIGN_EXTEND, Pmode, operands[0]);
-  emit_move_insn (operands[4], src);
+  emit_move_insn (off, src);
 }
 
-  operands[2] = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1]));
-  operands[3] = gen_reg_rtx (Pmode);
+  rtx lab = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1]));
+  rtx addr = gen_reg_rtx (Pmode);
 
-  emit_insn (gen_add3 (operands[3], operands[4], operands[2]));
-  emit_jump_insn (gen_tablejump_insn_normal (Pmode, operands[3], operands[1]));
+  emit_insn (gen_add3 (addr, off, lab));
+  emit_jump_insn (gen_tablejump_insn_normal (Pmode, addr, operands[1]));
   DONE;
 })
 
@@ -12740,21 +12741,22 @@
(use (match_operand:CC 2))]
   "!rs6000_speculate_indirect_jumps"
 {
+  rtx off;
   operands[0] = force_reg (SImode, operands[0]);
   if (mode == SImode)
-operands[4] = operands[0];
+off = operands[0];
   else
 {
-  operands[4] = gen_reg_rtx (Pmode);
+  off = gen_reg_rtx (Pmode);
   rtx src = gen_rtx_fmt_e (SIGN_EXTEND, Pmode, operands[0]);
-  emit_move_insn (operands[4], src);
+  emit_move_insn (off, src);
 }
 
-  operands[5] = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1]));
-  operands[3] = gen_reg_rtx (Pmode);
+  rtx lab = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1]));
+  rtx addr = gen_reg_rtx (Pmode);
 
-  emit_insn (gen_add3 (operands[3], operands[4], operands[5]));
-  emit_jump_insn (gen_tablejump_insn_nospec (Pmode, operands[3], operands[1],
+  emit_insn (gen_add3 (addr, off, lab));
+  emit_jump_insn (gen_tablejump_insn_nospec (Pmode, addr, operands[1],
 operands[2]));
   DONE;
 })


-- 
Alan Modra
Australia Development Lab, IBM


[PATCH][GCC 10] Fix build failure with zstd version 1.2.0 or older.

2020-09-29 Thread Jim Wilson
This is the gcc-10 branch version of the patch on mainline.

Extends the configure check for zstd.h to also verify the zstd version,
since gcc requires features that only exist in 1.3.0 and newer.  Without
this patch we get a build error for lto-compress.c when using an old zstd
version.

OK?

Jim

Backported from master:
2020-09-29  Jim Wilson  

gcc/
PR bootstrap/97183
* configure.ac (gcc_cv_header_zstd_h): Check ZSTD_VERSION_NUMBER.
* configure: Regenerated.
---
 gcc/configure| 11 ---
 gcc/configure.ac |  7 ++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/gcc/configure b/gcc/configure
index eb6061c1631..b4088d8fd1e 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -10024,9 +10024,14 @@ $as_echo_n "checking for zstd.h... " >&6; }
 if ${gcc_cv_header_zstd_h+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  # We require version 1.3.0 or later.  This is the first version that has
+# ZSTD_getFrameContentSize.
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include 
+#if ZSTD_VERSION_NUMBER < 10300
+#error "need zstd 1.3.0 or better"
+#endif
 int
 main ()
 {
@@ -19015,7 +19020,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 19018 "configure"
+#line 19023 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -19121,7 +19126,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 19124 "configure"
+#line 19129 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
diff --git a/gcc/configure.ac b/gcc/configure.ac
index 715fcba0482..070b9c6c497 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -1382,8 +1382,13 @@ LDFLAGS="$LDFLAGS $ZSTD_LDFLAGS"
 
 AC_MSG_CHECKING(for zstd.h)
 AC_CACHE_VAL(gcc_cv_header_zstd_h,
+# We require version 1.3.0 or later.  This is the first version that has
+# ZSTD_getFrameContentSize.
 [AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
-[[#include ]])],
+[[#include 
+#if ZSTD_VERSION_NUMBER < 10300
+#error "need zstd 1.3.0 or better"
+#endif]])],
   [gcc_cv_header_zstd_h=yes],
   [gcc_cv_header_zstd_h=no])])
 AC_MSG_RESULT($gcc_cv_header_zstd_h)
-- 
2.17.1



Re: [PATCH] Fix GCC 10+ build failure with zstd version 1.2.0 or older.

2020-09-29 Thread Jim Wilson
On Tue, Sep 29, 2020 at 1:20 AM Richard Biener
 wrote:
>
> On Tue, Sep 29, 2020 at 2:46 AM Jim Wilson  wrote:
> >
> > Extends the configure check for zstd.h to also verify the zstd version,
> > since gcc requires features that only exist in 1.3.0 and newer.  Without
> > this patch we get a build error for lto-compress.c when using an old zstd
> > version.
> >
> > Tested with builds using zstd 0.5.1, 1.2.0, 1.3.0, and 1.3.3, and checking
> > to see whether zstd was enabled for the build or not.
> >
> > OK?
>
> OK.

Thanks.  Committed to the mainline.  Presumably we want a back port to
the gcc-10 branch as the same problem is present there.  I will send a
patch for that.

Jim


Re: [committed][testsuite] Require non_strict_align in pr94600-{1,3}.c

2020-09-29 Thread Hans-Peter Nilsson
On Thu, 24 Sep 2020, Tom de Vries wrote:

> Hi,
>
> With the nvptx target, we run into:
> ...
> FAIL: gcc.dg/pr94600-1.c scan-rtl-dump-times final "\\(mem/v" 6
> FAIL: gcc.dg/pr94600-1.c scan-rtl-dump-times final "\\(set \\(mem/v" 6
> FAIL: gcc.dg/pr94600-3.c scan-rtl-dump-times final "\\(mem/v" 1
> FAIL: gcc.dg/pr94600-3.c scan-rtl-dump-times final "\\(set \\(mem/v" 1
> ...
> The scans attempt to check for volatile stores, but on nvptx we have memcpy
> instead.
>
> This is due to nvptx being a STRICT_ALIGNMENT target, which has the effect
> that the TYPE_MODE for the store target is set to BKLmode in
> compute_record_mode.
>
> Fix the FAILs by requiring effective target non_strict_align.

No, that's wrong.  There's more than that at play; it worked for
the strict-alignment targets where it was tested at the time.

The test is a valuable canary for this kind of bug.  You now
disabled it for strict-alignment targets.

Please revert and add your target specifier instead, if you
don't feel like investigating further.

brgds, H-P

> Tested on nvptx.
>
> Committed to trunk.
>
> Thanks,
> - Tom
>
> [testsuite] Require non_strict_align in pr94600-{1,3}.c
>
> gcc/testsuite/ChangeLog:
>
> 2020-09-24  Tom de Vries  
>
>   * gcc.dg/pr94600-1.c: Require effective target non_strict_align for
>   scan-rtl-dump-times.
>   * gcc.dg/pr94600-3.c: Same.
>
> ---
>  gcc/testsuite/gcc.dg/pr94600-1.c | 4 ++--
>  gcc/testsuite/gcc.dg/pr94600-3.c | 4 ++--
>  2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/pr94600-1.c 
> b/gcc/testsuite/gcc.dg/pr94600-1.c
> index b5913a0939c..38f939a98cb 100644
> --- a/gcc/testsuite/gcc.dg/pr94600-1.c
> +++ b/gcc/testsuite/gcc.dg/pr94600-1.c
> @@ -32,5 +32,5 @@ foo(void)
>  }
>
>  /* The only volatile accesses should be the obvious writes.  */
> -/* { dg-final { scan-rtl-dump-times {\(mem/v} 6 "final" } } */
> -/* { dg-final { scan-rtl-dump-times {\(set \(mem/v} 6 "final" } } */
> +/* { dg-final { scan-rtl-dump-times {\(mem/v} 6 "final" { target { 
> non_strict_align } } } } */
> +/* { dg-final { scan-rtl-dump-times {\(set \(mem/v} 6 "final" { target { 
> non_strict_align } } } } */
> diff --git a/gcc/testsuite/gcc.dg/pr94600-3.c 
> b/gcc/testsuite/gcc.dg/pr94600-3.c
> index 7537f6cb797..e8776fbdb28 100644
> --- a/gcc/testsuite/gcc.dg/pr94600-3.c
> +++ b/gcc/testsuite/gcc.dg/pr94600-3.c
> @@ -31,5 +31,5 @@ foo(void)
>  }
>
>  /* The loop isn't unrolled. */
> -/* { dg-final { scan-rtl-dump-times {\(mem/v} 1 "final" } } */
> -/* { dg-final { scan-rtl-dump-times {\(set \(mem/v} 1 "final" } } */
> +/* { dg-final { scan-rtl-dump-times {\(mem/v} 1 "final" { target { 
> non_strict_align } } } } */
> +/* { dg-final { scan-rtl-dump-times {\(set \(mem/v} 1 "final" { target { 
> non_strict_align } } } } */
>


RE: [PATCH PR96757] aarch64: ICE during GIMPLE pass: vect

2020-09-29 Thread duanbo (C)


> -Original Message-
> From: Richard Sandiford [mailto:richard.sandif...@arm.com]
> Sent: Thursday, September 24, 2020 7:56 PM
> To: duanbo (C) 
> Cc: GCC Patches 
> Subject: Re: [PATCH PR96757] aarch64: ICE during GIMPLE pass: vect
> 
> Hi,
> 
> "duanbo (C)"  writes:
> > Sorry for the late reply.
> 
> My time to apologise for the late reply.
> 
> > Thanks for your suggestions. I have modified accordingly.
> > Attached please find the v1 patch.
> 
> Thanks, the logic to choose which precision we pick looks good.
> But I think the build_mask_conversions should be deferred until after we've
> decided to make the transform.  So…
> 
> > @@ -4340,16 +4342,91 @@ vect_recog_mask_conversion_pattern
> (vec_info
> > *vinfo,
> >
> >  it is better for b1 and b2 to use the mask type associated
> >  with int elements rather bool (byte) elements.  */
> > - rhs1_type = integer_type_for_mask (TREE_OPERAND (rhs1, 0),
> vinfo);
> > - if (!rhs1_type)
> > -   rhs1_type = TREE_TYPE (TREE_OPERAND (rhs1, 0));
> > + rhs1_op0 = TREE_OPERAND (rhs1, 0);
> > + rhs1_op1 = TREE_OPERAND (rhs1, 1);
> > + if (!rhs1_op0 || !rhs1_op1)
> > +   return NULL;
> > + rhs1_op0_type = integer_type_for_mask (rhs1_op0, vinfo);
> > + rhs1_op1_type = integer_type_for_mask (rhs1_op1, vinfo);
> > +
> > + if (!rhs1_op0_type && !rhs1_op1_type)
> > +   {
> > + rhs1_type = TREE_TYPE (rhs1_op0);
> > + vectype2 = get_mask_type_for_scalar_type (vinfo, rhs1_type);
> 
> …here we should just be able to set rhs1_type, and leave vectype2 to the
> code below.
> 
> > +   }
> > + else if (!rhs1_op0_type && rhs1_op1_type)
> > +   {
> > + rhs1_type = TREE_TYPE (rhs1_op0);
> > + vectype2 = get_mask_type_for_scalar_type (vinfo, rhs1_type);
> > + if (!vectype2)
> > +   return NULL;
> > + rhs1_op1 = build_mask_conversion (vinfo, rhs1_op1,
> > +   vectype2, stmt_vinfo);
> > +   }
> > + else if (rhs1_op0_type && !rhs1_op1_type)
> > +   {
> > + rhs1_type = TREE_TYPE (rhs1_op1);
> > + vectype2 = get_mask_type_for_scalar_type (vinfo, rhs1_type);
> > + if (!vectype2)
> > +   return NULL;
> > + rhs1_op0 = build_mask_conversion (vinfo, rhs1_op0,
> > +   vectype2, stmt_vinfo);
> 
> Same for these two.
> 
> > +   }
> > + else if (TYPE_PRECISION (rhs1_op0_type)
> > +  != TYPE_PRECISION (rhs1_op1_type))
> > +   {
> > + int tmp1 = (int)TYPE_PRECISION (rhs1_op0_type)
> > +- (int)TYPE_PRECISION (TREE_TYPE (lhs));
> > + int tmp2 = (int)TYPE_PRECISION (rhs1_op1_type)
> > +- (int)TYPE_PRECISION (TREE_TYPE (lhs));
> > + if ((tmp1 > 0 && tmp2 > 0)||(tmp1 < 0 && tmp2 < 0))
> 
> Minor formatting nit, sorry, but: GCC style is to put a space after
> (int) and on either side of ||.
> 
> Might be good to use the same numbering as the operands: tmp0 and tmp1
> instead of tmp1 and tmp2.
> 
> > +   {
> > + if (abs (tmp1) > abs (tmp2))
> > +   {
> > + vectype2 = get_mask_type_for_scalar_type (vinfo,
> > +
>   rhs1_op1_type);
> > + if (!vectype2)
> > +   return NULL;
> > + rhs1_op0 = build_mask_conversion (vinfo, rhs1_op0,
> > +   vectype2,
> stmt_vinfo);
> > +   }
> > + else
> > +   {
> > + vectype2 = get_mask_type_for_scalar_type (vinfo,
> > +
>   rhs1_op0_type);
> > + if (!vectype2)
> > +   return NULL;
> > + rhs1_op1 = build_mask_conversion (vinfo, rhs1_op1,
> > +   vectype2,
> stmt_vinfo);
> > +   }
> > + rhs1_type = integer_type_for_mask (rhs1_op0, vinfo);
> 
> Here I think we can just go with rhs1_type = rhs1_op1_type if abs (tmp1) >
> abs (tmp2) (i.e. op1 is closer to the final type than op0) and rhs1_op0_type
> otherwise.
> 
> > +   }
> > + else
> > +   {
> > + rhs1_op0 = build_mask_conversion (vinfo, rhs1_op0,
> > +   vectype1, stmt_vinfo);
> > + rhs1_op1 = build_mask_conversion (vinfo, rhs1_op1,
> > +   vectype1, stmt_vinfo);
> > + rhs1_type = integer_type_for_mask (rhs1_op0, vinfo);
> > + if (!rhs1_type)
> > +   return NULL;
> > + vectype2 = get_mask_type_for_scalar_type (vinfo,
> rhs1_type);
> 
> and here I think rhs1_type can be:
> 
>   build_nonstandard_integer_type (TYPE_PRECISION (lhs_type), 1);
> 
> > +   }
> > +   }
> > + else
> > +   {
> > + rhs1_type = integer_type_for_mask (rhs1_op0, vinfo);
> > + 

[PATCH] rs6000: Use parameterized names for tablejump

2020-09-29 Thread Segher Boessenkool
We have too many tablejump patterns.  Using parameterized names
simplifies the code a bit.

Tested on powerpc64-linux {-m32,-m64}.  Committing.


Segher


2020-09-29  Segher Boessenkool  

* config/rs6000/rs6000.md (tablejump): Simplify.
(tablejumpsi): Merge this ...
(tablejumpdi): ... and this ...
(@tablejump_normal): ... into this.
(tablejumpsi_nospec): Merge this ...
(tablejumpdi_nospec): ... and this ...
(@tablejump_nospec): ... into this.
(*tablejump_internal1): Delete, rename to ...
(@tablejump_insn_normal): ... this.
(*tablejump_internal1_nospec): Delete, rename to ...
(@tablejump_insn_nospec): ... this.

---
 gcc/config/rs6000/rs6000.md | 105 ++--
 1 file changed, 43 insertions(+), 62 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 694ff70..24ad809 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -12697,12 +12697,7 @@ (define_expand "tablejump"
   ""
 {
   if (rs6000_speculate_indirect_jumps)
-{
-  if (TARGET_32BIT)
-   emit_jump_insn (gen_tablejumpsi (operands[0], operands[1]));
-  else
-   emit_jump_insn (gen_tablejumpdi (operands[0], operands[1]));
-}
+emit_jump_insn (gen_tablejump_normal (Pmode, operands[0], operands[1]));
   else
 {
   rtx ccreg = gen_reg_rtx (CCmode);
@@ -12716,69 +12711,55 @@ (define_expand "tablejump"
   DONE;
 })
 
-(define_expand "tablejumpsi"
-  [(set (match_dup 3)
-   (plus:SI (match_operand:SI 0)
-(match_dup 2)))
-   (parallel [(set (pc)
-  (match_dup 3))
- (use (label_ref (match_operand 1)))])]
-  "TARGET_32BIT && rs6000_speculate_indirect_jumps"
+(define_expand "@tablejump_normal"
+  [(use (match_operand:SI 0))
+   (use (match_operand:P 1))]
+  "rs6000_speculate_indirect_jumps"
 {
   operands[0] = force_reg (SImode, operands[0]);
-  operands[2] = force_reg (SImode, gen_rtx_LABEL_REF (SImode, operands[1]));
-  operands[3] = gen_reg_rtx (SImode);
+  if (mode == SImode)
+operands[4] = operands[0];
+  else
+{
+  operands[4] = gen_reg_rtx (Pmode);
+  rtx src = gen_rtx_fmt_e (SIGN_EXTEND, Pmode, operands[0]);
+  emit_move_insn (operands[4], src);
+}
+
+  operands[2] = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1]));
+  operands[3] = gen_reg_rtx (Pmode);
+
+  emit_insn (gen_add3 (operands[3], operands[4], operands[2]));
+  emit_jump_insn (gen_tablejump_insn_normal (Pmode, operands[3], operands[1]));
+  DONE;
 })
 
-(define_expand "tablejumpsi_nospec"
-  [(set (match_dup 4)
-   (plus:SI (match_operand:SI 0)
-(match_dup 3)))
-   (parallel [(set (pc)
-  (match_dup 4))
- (use (label_ref (match_operand 1)))
- (clobber (match_operand 2))])]
-  "TARGET_32BIT && !rs6000_speculate_indirect_jumps"
+(define_expand "@tablejump_nospec"
+  [(use (match_operand:SI 0))
+   (use (match_operand:P 1))
+   (use (match_operand:CC 2))]
+  "!rs6000_speculate_indirect_jumps"
 {
   operands[0] = force_reg (SImode, operands[0]);
-  operands[3] = force_reg (SImode, gen_rtx_LABEL_REF (SImode, operands[1]));
-  operands[4] = gen_reg_rtx (SImode);
+  if (mode == SImode)
+operands[4] = operands[0];
+  else
+{
+  operands[4] = gen_reg_rtx (Pmode);
+  rtx src = gen_rtx_fmt_e (SIGN_EXTEND, Pmode, operands[0]);
+  emit_move_insn (operands[4], src);
+}
+
+  operands[5] = force_reg (Pmode, gen_rtx_LABEL_REF (Pmode, operands[1]));
+  operands[3] = gen_reg_rtx (Pmode);
+
+  emit_insn (gen_add3 (operands[3], operands[4], operands[5]));
+  emit_jump_insn (gen_tablejump_insn_nospec (Pmode, operands[3], operands[1],
+operands[2]));
+  DONE;
 })
 
-(define_expand "tablejumpdi"
-  [(set (match_dup 4)
-(sign_extend:DI (match_operand:SI 0 "lwa_operand")))
-   (set (match_dup 3)
-   (plus:DI (match_dup 4)
-(match_dup 2)))
-   (parallel [(set (pc)
-  (match_dup 3))
- (use (label_ref (match_operand 1)))])]
-  "TARGET_64BIT && rs6000_speculate_indirect_jumps"
-{
-  operands[2] = force_reg (DImode, gen_rtx_LABEL_REF (DImode, operands[1]));
-  operands[3] = gen_reg_rtx (DImode);
-  operands[4] = gen_reg_rtx (DImode);
-})
-
-(define_expand "tablejumpdi_nospec"
-  [(set (match_dup 5)
-(sign_extend:DI (match_operand:SI 0 "lwa_operand")))
-   (set (match_dup 4)
-   (plus:DI (match_dup 5)
-(match_dup 3)))
-   (parallel [(set (pc)
-  (match_dup 4))
- (use (label_ref (match_operand 1)))
- (clobber (match_operand 2))])]
-  "TARGET_64BIT && !rs6000_speculate_indirect_jumps"
-{
-  operands[3] = force_reg (DImode, gen_rtx_LABEL_REF (DImode, operands[1]));
-  operands[4] = gen_reg_rtx (DImode);
-  operands[5] = gen_reg_rtx (DImode);
-})
-
-(define_insn "*tablejump_internal1"

Re: [PATCH v2] builtins: (not just) rs6000: Add builtins for fegetround, feclearexcept and feraiseexcept [PR94193]

2020-09-29 Thread Segher Boessenkool
Hi Raoni,

Some of this isn't an rs6000 patch, but the subject says it is, so it
might well not draw the attention it needs.

Adding some Cc:s.

On Fri, Sep 04, 2020 at 12:52:30PM -0300, Raoni Fassina Firmino wrote:
> There is one pending question raised by Segher, It is about adding
> documentation, I am not sure if it is needed and if so, where it
> should be. I will quote the relevant part of the conversation[2] from
> the v1 thread for context:
> 
>   > > > +OPTAB_D (fegetround_optab, "fegetround$a")
>   > > > +OPTAB_D (feclearexcept_optab, "feclearexcept$a")
>   > > > +OPTAB_D (feraiseexcept_optab, "feraiseexcept$a")
>   > >
>   > > Should those be documented somewhere?  (In gcc/doc/ somewhere).
>   >
>   > I am lost on that one. I took a look on the docs (I hope looking on the
>   > online docs was good enough) and I didn't find a place where i feel it
>   > sits well. On the PowerPC target specific sections (6.60.22 Basic
>   > PowerPC Built-in Functions), I didn't found it mentioning builtins that
>   > are optimizations for the standard library functions, but we do have
>   > many of these for Power.  Then, on the generic section (6.59 Other
>   > Built-in Functions Provided by GCC) it mentions C99 functions that have
>   > builtins but it seems like it mentions builtins that have target
>   > independent implementation, or at least it dos not say that some
>   > builtins may be implemented on only some targets.  And in this case
>   > there is no implementation (for now) for any other target that is not
>   > PowerPc.
>   >
>   > So, I don't know if or where this should be documented.

I don't see much about optabs in the docs either.  Add some text to
optabs.def itself then?

> +(define_expand "feclearexceptsi"
> +  [(use (match_operand:SI 1 "const_int_operand" "n"))
> +   (set (match_operand:SI 0 "gpc_reg_operand")
> + (const_int 0))]
> +  "TARGET_HARD_FLOAT"
> +{
> +  switch (INTVAL (operands[1]))
> +{
> +case 0x200:  /* FE_INEXACT */
> +case 0x400:  /* FE_DIVBYZERO */
> +case 0x800:  /* FE_UNDERFLOW */
> +case 0x1000: /* FE_OVERFLOW */

Please write 0x0200 etc. instead?

> +;; int fegraiseexcept(int excepts)

(typo)

> +/* { dg-do run } */
> +/* { dg-require-effective-target fenv_exceptions } */
> +/* { dg-options "-lm -fno-builtin" } */

That -fno-builtin looks very strange...  Comment what it is for?

> +#define FAIL(v, e) printf("ERROR, __builtin_fegetround() returned %d," \
> +  " not the expecected value %d\n", v, e);

(Typo, "expected")

The rs6000 part is okay for trunk (with those modifications), after the
generic parts is approved.  Thanks!


Segher


[pushed] correct/improve handling of null VLA arguments (PR 97188)

2020-09-29 Thread Martin Sebor via Gcc-patches

I have committed & pushed the fix in r11-3540.

On 9/24/20 6:15 PM, Martin Sebor wrote:

The machinery recently added to support -Warray-parameter and
-Wvla-parameter also results in enhanced detection of null
pointer arguments to VLA function parameters.  This enhancement
wasn't tested as comprehensively as it should have been and
so has some bugs.  The attached patch fixes one that leads
to an ICE.  It also restructures the function and improves
the warning issues in this case.

The fix is slightly bigger than what I would normally commit
without a review but since it's all in code I just wrote and in
my view low risk I will go ahead and push it in a few days unless
I hear requests for changes by then.

Martin




Re: [PATCH v3] c++: Implement -Wrange-loop-construct [PR94695]

2020-09-29 Thread Marek Polacek via Gcc-patches
On Mon, Sep 28, 2020 at 03:05:55PM -0400, Jason Merrill via Gcc-patches wrote:
> On 9/28/20 12:30 PM, Marek Polacek wrote:
> > On Sat, Sep 26, 2020 at 01:22:41AM -0400, Jason Merrill wrote:
> > > > +bool
> > > > +ref_conv_binds_directly_p (tree type, tree expr)
> > > > +{
> > > > +  gcc_assert (TYPE_REF_P (type));
> > > > +  conversion *conv = implicit_conversion (type, TREE_TYPE (expr), expr,
> > > > + /*c_cast_p=*/false,
> > > > + LOOKUP_IMPLICIT, tf_none);
> > > > +  return conv && !conv_binds_ref_to_prvalue (conv);
> > > 
> > > You probably want to free any allocated conversions, like in
> > > can_convert_arg.
> > 
> > I ought to free them, indeed.  Apologies for missing that.  Fixed:
> > 
> > Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?
> > 
> > -- >8 --
> > This new warning can be used to prevent expensive copies inside range-based
> > for-loops, for instance:
> > 
> >struct S { char arr[128]; };
> >void fn () {
> >  S arr[5];
> >  for (const auto x : arr) {  }
> >}
> > 
> > where auto deduces to S and then we copy the big S in every iteration.
> > Using "const auto " would not incur such a copy.  With this patch the
> > compiler will warn:
> > 
> > q.C:4:19: warning: loop variable 'x' creates a copy from type 'const S' 
> > [-Wrange-loop-construct]
> >  4 |   for (const auto x : arr) {  }
> >|   ^
> > q.C:4:19: note: use reference type 'const S&' to prevent copying
> >  4 |   for (const auto x : arr) {  }
> 
> It's unfortunate that we seem to suggest the unnecessary change from auto to
> S here.  Maybe just say "reference type" without printing the type?

Yeah, I wish there was a way to avoid it.  But I don't think we have
a TREE_TYPE bit that says that a type was deduced from auto/decltype(auto).

I'll just avoid printing the type...

> > +  auto_diagnostic_group d;
> > +  if (warning_at (DECL_SOURCE_LOCATION (decl), OPT_Wrange_loop_construct,
> 
> Why not use 'loc' here?

Fixed.

> OK however you want to resolve these comments.

Pushed, thanks.

Marek



[committed] analyzer: fix signal-handler registration location [PR95188]

2020-09-29 Thread David Malcolm via Gcc-patches
PR analyzer/95188 reports that diagnostics from
-Wanalyzer-unsafe-call-within-signal-handler use the wrong
source location when reporting the signal-handler registration
event in the diagnostic_path.  The diagnostics erroneously use the
location of the first stmt in the basic block containing the call
to "signal", rather than that of the call itself.

Fixed thusly.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to master as r11-3537-gd60d63a00bb50ba6896939705c589578177b404d.

gcc/analyzer/ChangeLog:
PR analyzer/95188
* engine.cc (stmt_requires_new_enode_p): Split enodes before
"signal" calls.

gcc/testsuite/ChangeLog:
PR analyzer/95188
* gcc.dg/analyzer/signal-registration-loc.c: New test.
---
 gcc/analyzer/engine.cc| 22 +-
 .../gcc.dg/analyzer/signal-registration-loc.c | 23 +++
 2 files changed, 39 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/signal-registration-loc.c

diff --git a/gcc/analyzer/engine.cc b/gcc/analyzer/engine.cc
index c15d1195a97..0e79254ad60 100644
--- a/gcc/analyzer/engine.cc
+++ b/gcc/analyzer/engine.cc
@@ -2677,13 +2677,23 @@ static bool
 stmt_requires_new_enode_p (const gimple *stmt,
   const gimple *prev_stmt)
 {
-  /* Stop consolidating at calls to
- "__analyzer_dump_exploded_nodes", so they always appear at the
- start of an exploded_node.  */
   if (const gcall *call = dyn_cast  (stmt))
-if (is_special_named_call_p (call, "__analyzer_dump_exploded_nodes",
-1))
-  return true;
+{
+  /* Stop consolidating at calls to
+"__analyzer_dump_exploded_nodes", so they always appear at the
+start of an exploded_node.  */
+  if (is_special_named_call_p (call, "__analyzer_dump_exploded_nodes",
+  1))
+   return true;
+
+  /* sm-signal.cc injects an additional custom eedge at "signal" calls
+from the registration enode to the handler enode, separate from the
+regular next state, which defeats the "detect state change" logic
+in process_node.  Work around this via special-casing, to ensure
+we split the enode immediately before any "signal" call.  */
+  if (is_special_named_call_p (call, "signal", 2))
+   return true;
+}
 
   /* If we had a PREV_STMT with an unknown location, and this stmt
  has a known location, then if a state change happens here, it
diff --git a/gcc/testsuite/gcc.dg/analyzer/signal-registration-loc.c 
b/gcc/testsuite/gcc.dg/analyzer/signal-registration-loc.c
new file mode 100644
index 000..4bac1269b1e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/analyzer/signal-registration-loc.c
@@ -0,0 +1,23 @@
+/* Ensure we use the correct location when reporting where the
+   signal handler was registered (PR analyzer/95188).  */
+
+/* { dg-require-effective-target signal } */
+
+#include 
+#include 
+
+int g;
+extern int foo (void);
+
+static void
+handler (int n)
+{
+  fprintf (stderr, "got here: %i\n", g); /* { dg-warning "call to 'fprintf' 
from within signal handler" } */
+}
+
+int main (int argc, char *argv[])
+{
+  g = foo (); /* { dg-bogus "registering" } */
+  signal (SIGSEGV, handler); /* { dg-message "registering 'handler' as signal 
handler" } */
+  return 0;
+}
-- 
2.26.2



Re: [PATCH 4/6] ipa: Multiple predicates for loop properties, with frequencies

2020-09-29 Thread Jan Hubicka
> This patch enhances the ability of IPA to reason under what conditions
> loops in a function have known iteration counts or strides because it
> replaces single predicates which currently hold conjunction of
> predicates for all loops with vectors capable of holding multiple
> predicates, each with a cumulative frequency of loops with the
> property.
> 
> This second property is then used by IPA-CP to much more aggressively
> boost its heuristic score for cloning opportunities which make
> iteration counts or strides of frequent loops compile time constant.
> 
> gcc/ChangeLog:
> 
> 2020-09-03  Martin Jambor  
> 
>   * ipa-fnsummary.h (ipa_freqcounting_predicate): New type.
>   (ipa_fn_summary): Change the type of loop_iterations and loop_strides
>   to vectors of ipa_freqcounting_predicate.
>   (ipa_fn_summary::ipa_fn_summary): Construct the new vectors.
>   (ipa_call_estimates): New fields loops_with_known_iterations and
>   loops_with_known_strides.
>   * ipa-cp.c (hint_time_bonus): Multiply param_ipa_cp_loop_hint_bonus
>   with the expected frequencies of loops with known iteration count or
>   stride.
>   * ipa-fnsummary.c (add_freqcounting_predicate): New function.
>   (ipa_fn_summary::~ipa_fn_summary): Release the new vectors instead of
>   just two predicates.
>   (remap_hint_predicate_after_duplication): Replace with function
>   remap_freqcounting_preds_after_dup.
>   (ipa_fn_summary_t::duplicate): Use it or duplicate new vectors.
>   (ipa_dump_fn_summary): Dump the new vectors.
>   (analyze_function_body): Compute the loop property vectors.
>   (ipa_call_context::estimate_size_and_time): Calculate also
>   loops_with_known_iterations and loops_with_known_strides.  Adjusted
>   dumping accordinly.
>   (remap_hint_predicate): Replace with function
>   remap_freqcounting_predicate.
>   (ipa_merge_fn_summary_after_inlining): Use it.
>   (inline_read_section): Stream loopcounting vectors instead of two
>   simple predicates.
>   (ipa_fn_summary_write): Likewise.
>   * params.opt (ipa-max-loop-predicates): New parameter.
>   * doc/invoke.texi (ipa-max-loop-predicates): Document new param.
> 
> diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
> index 6082f34d63f..94aa930 100644
> --- a/gcc/ipa-fnsummary.c
> +++ b/gcc/ipa-fnsummary.c
> @@ -310,6 +310,36 @@ set_hint_predicate (predicate **p, predicate 
> new_predicate)
>  }
>  }
>  
> +/* Find if NEW_PREDICATE is already in V and if so, increment its freq.
> +   Otherwise add a new item to the vector with this predicate and frerq equal
> +   to add_freq, unless the number of predicates would exceed 
> MAX_NUM_PREDICATES
> +   in which case the function does nothing.  */
> +
> +static void
> +add_freqcounting_predicate (vec **v,
> + const predicate _predicate, sreal add_freq,
> + unsigned max_num_predicates)
> +{
> +  if (new_predicate == false || new_predicate == true)
> +return;
> +  ipa_freqcounting_predicate *f;
> +  for (int i = 0; vec_safe_iterate (*v, i, ); i++)
> +if (new_predicate == f->predicate)
> +  {
> + f->freq += add_freq;
> + return;
> +  }
> +  if (vec_safe_length (*v) >= max_num_predicates)
> +/* Too many different predicates to account for.  */
> +return;
> +
> +  ipa_freqcounting_predicate fcp;
> +  fcp.predicate = NULL;
> +  set_hint_predicate (, new_predicate);
> +  fcp.freq = add_freq;
> +  vec_safe_push (*v, fcp);
> +  return;
> +}
>  
>  /* Compute what conditions may or may not hold given information about
> parameters.  RET_CLAUSE returns truths that may hold in a specialized 
> copy,
> @@ -710,13 +740,17 @@ ipa_call_summary::~ipa_call_summary ()
>  
>  ipa_fn_summary::~ipa_fn_summary ()
>  {
> -  if (loop_iterations)
> -edge_predicate_pool.remove (loop_iterations);
> -  if (loop_stride)
> -edge_predicate_pool.remove (loop_stride);
> +  unsigned len = vec_safe_length (loop_iterations);
> +  for (unsigned i = 0; i < len; i++)
> +edge_predicate_pool.remove ((*loop_iterations)[i].predicate);
> +  len = vec_safe_length (loop_strides);
> +  for (unsigned i = 0; i < len; i++)
> +edge_predicate_pool.remove ((*loop_strides)[i].predicate);

For edges predicates are pointers since most of them have no interesting
predicate and thus NULL is more compact.  I guess here it would make
snese to make predicates inline. Is there a problem with vectors not
liking non-pods?
>vec_free (conds);
>vec_free (size_time_table);
>vec_free (call_size_time_table);
> +  vec_free (loop_iterations);
> +  vec_free (loop_strides);

However auto_vecs should work in the brave new C++ world.

The patch looks reasonable to me.  Did you check how much memory it
consumes building bigger projects?  Also I am bit worried about our
ability to use it reasonably in the heuristics since it is quite
complicated value...

Honza


RE: [PATCH][GCC][AArch64] Add support for Cortex-X1

2020-09-29 Thread Przemyslaw Wirkus via Gcc-patches
> Ok. Please make sure aarch64-tune.md is properly regenerated when
> committing as Alex has been adding new CPUs in there recently too.

commit f836f3bc8f76ef3e3ad21762590302ad11abc9f8

> Thanks,
> Kyrill
> 
> >
> > kind regards,
> > Przemyslaw Wirkus
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-cores.def: Add Cortex-X1 Arm core.
> > * config/aarch64/aarch64-tune.md: Regenerate.
> > * doc/invoke.texi: Add -mtune=cortex-x1 docs.



RE: [PATCH][GCC][ARM] Add support for Cortex-X1

2020-09-29 Thread Przemyslaw Wirkus via Gcc-patches
> Ok, but please make sure this is properly rebased on top of Alex's patches
> that have recently gone in in this area.

commit 0eef5eea2b42d892df52b655e55458f27ac3fb81

> Thanks,
> Kyrill
> 
> 
> kind regards,
> Przemyslaw Wirkus
> 
> gcc/ChangeLog:
> 
>   * config/arm/arm-cpus.in: Add Cortex-X1 core.
>   * config/arm/arm-tables.opt: Regenerate.
>   * config/arm/arm-tune.md: Regenerate.
>   * doc/invoke.texi: Update docs.


[PATCH] avoid modifying type in place (PR 97206)

2020-09-29 Thread Martin Sebor via Gcc-patches

To render the bounds as well as the static specifier in array
and VLA function parameters the  new -Warray-parameter and
-Wvla-parameter warning builds a "synthetic" array type that
corresponds to the form of the parameter, sets its qualifiers
to match those of the pointer, and passes it to the pretty-
printer for formatting.  The in-place modification of the type
causes problems when the type is subsequently [re]used for one
that doesn't have the same qualifiers set.  The attached fix
replaces the in-place modification with a call to 
build_type_attribute_qual_variant.


Tested on x86_64-linux.

I will commit this patch later this week unless I hear concerns
or suggestions for changes.

Martin
PR c/97206 - ICE in composite_type on declarations of a similar array types

gcc/ChangeLog:

	PR c/97206
	* attribs.c (attr_access::array_as_string): Avoid modifying a shared
	type in place and use build_type_attribute_qual_variant instead.
gcc/testsuite/ChangeLog:

	PR c/97206
	* gcc.dg/Warray-parameter-7.c: New test.
	* gcc.dg/Warray-parameter-8.c: New test.
	* gcc.dg/Wvla-parameter-5.c: New test.

diff --git a/gcc/attribs.c b/gcc/attribs.c
index abc75368e6c..923e2e142bb 100644
--- a/gcc/attribs.c
+++ b/gcc/attribs.c
@@ -2256,15 +2256,14 @@ attr_access::array_as_string (tree type) const
 
   if (this->str)
 {
-  /* For array parameters (but not pointers) create an array type
-	 that corresponds to the form of the parameter including its
+  /* For array parameters (but not pointers) create a temporary array
+	 type that corresponds to the form of the parameter including its
 	 qualifiers even though they apply to the pointer, not the array
 	 type.  */
   const bool vla_p = minsize == HOST_WIDE_INT_M1U;
   tree eltype = TREE_TYPE (type);
-  tree artype;
-
   tree index_type = NULL_TREE;
+
   if (minsize == HOST_WIDE_INT_M1U)
 	{
 	  /* Determine if this is a VLA (an array whose most significant
@@ -2278,28 +2277,24 @@ attr_access::array_as_string (tree type) const
   else  if (minsize)
 	index_type = build_index_type (size_int (minsize - 1));
 
-  artype = build_array_type (eltype, index_type);
-
+  tree arat = NULL_TREE;
   if (static_p || vla_p)
 	{
 	  tree flag = static_p ? integer_one_node : NULL_TREE;
 	  /* Hack: there's no language-independent way to encode
 	 the "static" specifier or the "*" notation in an array type.
-	 Temporarily add an attribute to have the pretty printer add
-	 "static" or "*", and remove it later.  The static notation
-	 is only valid in the most significant bound but [*] can be
-	 used for any bound.  Because [*] is represented the same as
-	 [0] this hack only works for the most significant bound like
-	 static and the others are rendered as [0].  */
-	  tree at = tree_cons (get_identifier ("array"), flag, NULL_TREE);
-	  TYPE_ATTRIBUTES (artype) = at;
+	 Add a "fake" attribute to have the pretty printer add "static"
+	 or "*", and remove it later.  The static notation is only
+	 valid in the most significant bound but [*] can be used for
+	 any bound.  Because [*] is represented the same as [0] this
+	 hack only works for the most significant bound like static
+	 and the others are rendered as [0].  */
+	  arat = tree_cons (get_identifier ("array"), flag, NULL_TREE);
 	}
 
-  TYPE_ATOMIC (artype) = TYPE_ATOMIC (type);
-  TYPE_READONLY (artype) = TYPE_READONLY (type);
-  TYPE_RESTRICT (artype) = TYPE_RESTRICT (type);
-  TYPE_VOLATILE (artype) = TYPE_VOLATILE (type);
-  type = artype;
+  const int quals = TYPE_QUALS (type);
+  type = build_array_type (eltype, index_type);
+  type = build_type_attribute_qual_variant (type, arat, quals);
 }
 
   /* Format the type using the current pretty printer.  The generic tree
@@ -2309,10 +2304,6 @@ attr_access::array_as_string (tree type) const
   typstr = pp_formatted_text (pp);
   delete pp;
 
-  if (this->str)
-/* Remove the attribute that wasn't installed by decl_attributes.  */
-TYPE_ATTRIBUTES (type) = NULL_TREE;
-
   return typstr;
 }
 
diff --git a/gcc/testsuite/gcc.dg/Warray-parameter-7.c b/gcc/testsuite/gcc.dg/Warray-parameter-7.c
new file mode 100644
index 000..4863045be78
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/Warray-parameter-7.c
@@ -0,0 +1,25 @@
+/* PR c/97206 - ICE in composite_type on declarations of a similar array types
+   { dg-do compile }
+   { dg-options "-Wall" } */
+
+__attribute__((__access__(__write_only__, 1))) void
+f1 (char* restrict);
+
+void f1 (char*);
+
+char a1[];
+char a1[] = { };
+
+
+void f2 (char[restrict]);
+void f2 (char*);
+
+char a2[];
+char a2[] = { };
+
+
+void f3 (char*);
+void f3 (char[const]);
+
+extern const char a3[];
+extern const char a3[1];
diff --git a/gcc/testsuite/gcc.dg/Warray-parameter-8.c b/gcc/testsuite/gcc.dg/Warray-parameter-8.c
new file mode 100644
index 000..b152702b847
--- /dev/null
+++ 

[PATCH] c++: ICE in dependent_type_p with constrained auto [PR97052]

2020-09-29 Thread Patrick Palka via Gcc-patches
This patch fixes an "unguarded" call to coerce_template_parms in
build_standard_check: processing_template_decl could be zero if
we get here during processing of the first 'auto' parameter of an
abbreviated function template.  In the testcase below, this leads to an
ICE when coerce_template_parms substitutes into C's dependent default
template argument.

Bootstrapped and regtested on x86_64-pc-linux-gnu and tested by building
cmcstl2 and range-v3.  Does this look OK for trunk?

gcc/cp/ChangeLog:

PR c++/97052
* constraint.cc (build_standard_check): Temporarily increment
processing_template_decl when calling coerce_template_parms.

gcc/testsuite/ChangeLog:

PR c++/97052
* g++.dg/cpp2a/concepts-defarg2.C: New test.
---
 gcc/cp/constraint.cc  | 2 ++
 gcc/testsuite/g++.dg/cpp2a/concepts-defarg2.C | 9 +
 2 files changed, 11 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/concepts-defarg2.C

diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
index d49957a6c4a..da3b2cc7e65 100644
--- a/gcc/cp/constraint.cc
+++ b/gcc/cp/constraint.cc
@@ -1355,7 +1355,9 @@ build_standard_check (tree tmpl, tree args, 
tsubst_flags_t complain)
   gcc_assert (standard_concept_p (tmpl));
   gcc_assert (TREE_CODE (tmpl) == TEMPLATE_DECL);
   tree parms = INNERMOST_TEMPLATE_PARMS (DECL_TEMPLATE_PARMS (tmpl));
+  ++processing_template_decl;
   args = coerce_template_parms (parms, args, tmpl, complain);
+  --processing_template_decl;
   if (args == error_mark_node)
 return error_mark_node;
   return build2 (TEMPLATE_ID_EXPR, boolean_type_node, tmpl, args);
diff --git a/gcc/testsuite/g++.dg/cpp2a/concepts-defarg2.C 
b/gcc/testsuite/g++.dg/cpp2a/concepts-defarg2.C
new file mode 100644
index 000..6c0670e9fd2
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/concepts-defarg2.C
@@ -0,0 +1,9 @@
+// PR c++/97052
+// { dg-do compile { target c++20 } }
+
+template
+concept C = true;
+
+bool f(C auto) {
+  return true;
+}
-- 
2.28.0.618.g9bc233ae1c



[committed] analyzer: silence -Wsign-compare warnings

2020-09-29 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to master as 9b4b1ed50f1e0f252a86851456b58bb2e142c495.

gcc/analyzer/ChangeLog:
* constraint-manager.cc
(constraint_manager::add_constraint_internal): Whitespace fixes.
Silence -Wsign-compare warning.
* engine.cc (maybe_process_run_of_before_supernode_enodes):
Silence -Wsign-compare warning.
---
 gcc/analyzer/constraint-manager.cc | 6 +++---
 gcc/analyzer/engine.cc | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/analyzer/constraint-manager.cc 
b/gcc/analyzer/constraint-manager.cc
index 5cd2c9e0e0b..603b22811c1 100644
--- a/gcc/analyzer/constraint-manager.cc
+++ b/gcc/analyzer/constraint-manager.cc
@@ -1014,10 +1014,10 @@ constraint_manager::add_unknown_constraint 
(equiv_class_id lhs_ec_id,
 
 void
 constraint_manager::add_constraint_internal (equiv_class_id lhs_id,
- enum constraint_op c_op,
- equiv_class_id rhs_id)
+enum constraint_op c_op,
+equiv_class_id rhs_id)
 {
-  if (m_constraints.length () >= param_analyzer_max_constraints)
+  if (m_constraints.length () >= (unsigned)param_analyzer_max_constraints)
 return;
 
   constraint new_c (lhs_id, c_op, rhs_id);
diff --git a/gcc/analyzer/engine.cc b/gcc/analyzer/engine.cc
index 84eaa8415da..c15d1195a97 100644
--- a/gcc/analyzer/engine.cc
+++ b/gcc/analyzer/engine.cc
@@ -2629,7 +2629,7 @@ maybe_process_run_of_before_supernode_enodes 
(exploded_node *enode)
}
 got_merger:
   gcc_assert (it->m_merger_idx >= 0);
-  gcc_assert (it->m_merger_idx < merged_states.length ());
+  gcc_assert ((unsigned)it->m_merger_idx < merged_states.length ());
 }
 
   /* Create merger nodes.  */
-- 
2.26.2



Re: [PATCH] c++: Implement -Wrange-loop-construct [PR94695]

2020-09-29 Thread Martin Sebor via Gcc-patches

On 9/28/20 11:34 AM, Marek Polacek wrote:

On Fri, Sep 25, 2020 at 04:31:16PM -0600, Martin Sebor wrote:

On 9/24/20 6:05 PM, Marek Polacek via Gcc-patches wrote:

This new warning can be used to prevent expensive copies inside range-based
for-loops, for instance:

struct S { char arr[128]; };
void fn () {
  S arr[5];
  for (const auto x : arr) {  }
}

where auto deduces to S and then we copy the big S in every iteration.
Using "const auto " would not incur such a copy.  With this patch the
compiler will warn:

q.C:4:19: warning: loop variable 'x' creates a copy from type 'const S' 
[-Wrange-loop-construct]
  4 |   for (const auto x : arr) {  }
|   ^
q.C:4:19: note: use reference type 'const S&' to prevent copying
  4 |   for (const auto x : arr) {  }
|   ^
|   &

As per Clang, this warning is suppressed for trivially copyable types
whose size does not exceed 64B.  The tricky part of the patch was how
to figure out if using a reference would have prevented a copy.  I've
used perform_implicit_conversion to perform the imaginary conversion.
Then if the conversion doesn't have any side-effects, I assume it does
not call any functions or create any TARGET_EXPRs, and is just a simple
assignment like this one:

const T &x = (const T &) <__for_begin>;

But it can also be a CALL_EXPR:

x = (const T &) Iterator::operator* (&__for_begin)

which is still fine -- we just use the return value and don't create
any copies.

This warning is enabled by -Wall.  Further warnings of similar nature
should follow soon.


I've always thought a warning like this would be useful when passing
large objects to functions by value.  Is adding one for these cases
what you mean by future warnings?


No, but perhaps we should add it.  I don't know if we could still enable it by
-Wall.  We'd have to handle guaranteed copy elision and also the case when we
pass classes by invisible reference.  Unsure how much of the implementation
these warnings could share.

Do we have a request for the warning wrt passing chunky objects by value?


Not that I know of.  It's just something I had in the back of my
mind.



As a user, I'd probably want to have the option of figuring out where I'm
copying large types, since it can be a performance issue.


For the range loop, I wonder if more could be done to elide the copy
and avoid the warning when it isn't really necessary.  For instance,
for trivially copyable types like in your example, since x is const,
modifying it would be undefined, and so when we can prove that
the original object itself isn't modified (e.g., because it's
declared const, or because it can't be accessed in the loop),
there should be no need to make a copy on each iteration.  Using
a reference to the original object should be sufficient.  Does C++
rule out such an optimization?


Well, changing const auto x in

struct S { char arr[128]; S(); };

void
fn ()
{
   S a[5];
   for (const auto x : a)
 decltype(x) k;
}

to const auto &x would break this code.


Sure, an optimization that changed code in a detectable way would
not be viable.  But I wasn't thinking of actually changing the type
of the variable at this high level.  What I meant is that it would
be nice to transform an example like this:

  struct S { int i; char a[80]; };
  const S a[] = { { 123, "abc" }, { 234, "bcd" }, { 345, "cde"} };

  int f (const char *s)
  {
for (auto x: a)
  if (__builtin_strcmp (x.a, s) == 0)
return x.i;
return -1;
  }

into this (when it's possible) and avoid issuing the warning:

  int g (const char *s)
  {
for (int i = 0; i != sizeof a / sizeof *a; ++i)
  if (strcmp (a[i].a, s) == 0)
return a[i].i;
return -1;
  }


About the name of the option: my first thought was that it was
about the construct known as the range loop, but after reading
your description I wonder if it might actually primarily be about
constructing expensive copies and the range loop is incidental.


It was a bit confusing to me too at first.  It's about constructing expensive
copies in range-based for-loops.  I don't think it's incidental that
it warns in loops only.

I'm not attached to the name but it's what Clang uses so we'll have to
follow.


(It's impossible to tell from the Clang manual because its way
of documenting warning options is to show examples of their text.)


Yes.  I really like that we provide code snippets showing what a warning
is supposed to warn on in our manual.  Let's keep it that way.


Then again, I see it's related to -Wrange-loop-analysis so that
suggests it is mainly about range loops, and that there may be
a whole series of warnings and options related to it.  Can you
please shed some light on that?  (E.g., what are some of
the "further warnings of similar nature" about?)  I think it
might also be helpful to expand the documentation a bit to help
answer common questions (I came across the following 

Re: [PATCH] c++: Fix P0846 (ADL and function templates) in template [PR97010]

2020-09-29 Thread Marek Polacek via Gcc-patches
Ping.

On Fri, Sep 18, 2020 at 04:05:16PM -0400, Marek Polacek via Gcc-patches wrote:
> Ping.
> 
> On Thu, Sep 10, 2020 at 06:15:24PM -0400, Marek Polacek via Gcc-patches wrote:
> > To quickly recap, P0846 says that a name is also considered to refer to
> > a template if it is an unqualified-id followed by a < and name lookup
> > finds either one or more functions or finds nothing.
> > 
> > In a template, when parsing a function call that has type-dependent
> > arguments, we can't perform ADL right away so we set KOENIG_LOOKUP_P in
> > the call to remember to do it when instantiating the call
> > (tsubst_copy_and_build/CALL_EXPR).  When the called function is a
> > function template, we represent the call with a TEMPLATE_ID_EXPR;
> > usually the operand is an OVERLOAD.
> > 
> > In the P0846 case though, the operand can be an IDENTIFIER_NODE, when
> > name lookup found nothing when parsing the template name.  But we
> > weren't handling this correctly in tsubst_copy_and_build.  First
> > we need to pass the FUNCTION_P argument from <case CALL_EXPR> to
> > <case TEMPLATE_ID_EXPR>, otherwise we give a bogus error.  And then in
> > <case TEMPLATE_ID_EXPR> we need to perform ADL.
> > give better errors when ADL didn't find anything.
> > 
> > Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?
> > I think I'd like to backport to 10 too.
> > 
> > gcc/cp/ChangeLog:
> > 
> > PR c++/97010
> > * pt.c (tsubst_copy_and_build) : Call
> > tsubst_copy_and_build explicitly instead of using the RECUR macro.
> > Handle a TEMPLATE_ID_EXPR with an IDENTIFIER_NODE as its operand.
> > : Perform ADL for a TEMPLATE_ID_EXPR with an
> > IDENTIFIER_NODE as its operand.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > PR c++/97010
> > * g++.dg/cpp2a/fn-template21.C: New test.
> > * g++.dg/cpp2a/fn-template22.C: New test.
> > ---
> >  gcc/cp/pt.c| 37 --
> >  gcc/testsuite/g++.dg/cpp2a/fn-template21.C | 24 ++
> >  gcc/testsuite/g++.dg/cpp2a/fn-template22.C | 25 +++
> >  3 files changed, 77 insertions(+), 9 deletions(-)
> >  create mode 100644 gcc/testsuite/g++.dg/cpp2a/fn-template21.C
> >  create mode 100644 gcc/testsuite/g++.dg/cpp2a/fn-template22.C
> > 
> > diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
> > index 30c6735dede..566e24f9bf3 100644
> > --- a/gcc/cp/pt.c
> > +++ b/gcc/cp/pt.c
> > @@ -19241,7 +19241,8 @@ out:
> >  }
> >  
> >  /* Like tsubst but deals with expressions and performs semantic
> > -   analysis.  FUNCTION_P is true if T is the "F" in "F (ARGS)".  */
> > +   analysis.  FUNCTION_P is true if T is the "F" in "F (ARGS)" or
> > +   "F (ARGS)".  */
> >  
> >  tree
> >  tsubst_copy_and_build (tree t,
> > @@ -19323,7 +19324,10 @@ tsubst_copy_and_build (tree t,
> >  case TEMPLATE_ID_EXPR:
> >{
> > tree object;
> > -   tree templ = RECUR (TREE_OPERAND (t, 0));
> > +   tree templ = tsubst_copy_and_build (TREE_OPERAND (t, 0), args,
> > +   complain, in_decl,
> > +   function_p,
> > +   integral_constant_expression_p);
> > tree targs = TREE_OPERAND (t, 1);
> >  
> > if (targs)
> > @@ -19370,13 +19374,21 @@ tsubst_copy_and_build (tree t,
> >   }
> > else
> >   object = NULL_TREE;
> > -   templ = lookup_template_function (templ, targs);
> > +
> > +   tree tid = lookup_template_function (templ, targs);
> >  
> > if (object)
> > - RETURN (build3 (COMPONENT_REF, TREE_TYPE (templ),
> > -object, templ, NULL_TREE));
> > + RETURN (build3 (COMPONENT_REF, TREE_TYPE (tid),
> > +object, tid, NULL_TREE));
> > +   else if (identifier_p (templ))
> > + {
> > +   /* C++20 P0846: we can encounter an IDENTIFIER_NODE here when
> > +  name lookup found nothing when parsing the template name.  */
> > +   gcc_assert (cxx_dialect >= cxx20 || seen_error ());
> > +   RETURN (tid);
> > + }
> > else
> > - RETURN (baselink_for_fns (templ));
> > + RETURN (baselink_for_fns (tid));
> >}
> >  
> >  case INDIRECT_REF:
> > @@ -19967,14 +19979,17 @@ tsubst_copy_and_build (tree t,
> >  
> > /* We do not perform argument-dependent lookup if normal
> >lookup finds a non-function, in accordance with the
> > -  expected resolution of DR 218.  */
> > +  resolution of DR 218.  */
> > if (koenig_p
> > && ((is_overloaded_fn (function)
> >  /* If lookup found a member function, the Koenig lookup is
> > not appropriate, even if an unqualified-name was used
> > to denote the function.  */
> >  && !DECL_FUNCTION_MEMBER_P (get_first_fn (function)))
> > -   || identifier_p (function))
> > +   || identifier_p (function)
> > +   /* C++20 P0846: Lookup found nothing.  */
> > +   || (TREE_CODE (function) == TEMPLATE_ID_EXPR
> > +   

Re: [PATCH] c++: CTAD and explicit deduction guides for copy-list-init [PR90210]

2020-09-29 Thread Marek Polacek via Gcc-patches
Ping.

On Sat, Sep 19, 2020 at 05:33:36PM -0400, Marek Polacek via Gcc-patches wrote:
> This PR points out that we accept
> 
>   template struct tuple { tuple(T); }; // #1
>   template explicit tuple(T t) -> tuple; // #2
>   tuple t = { 1 };
> 
> despite the 'explicit' deduction guide in a copy-list-initialization
> context.  That's because in deduction_guides_for we first find the
> user-defined deduction guide (#2), and then ctor_deduction_guides_for
> creates artificial deduction guides: one from the tuple(T) constructor and
> a copy guide.  So we end up with these three guides:
> 
>   (1) template tuple(T) -> tuple [DECL_NONCONVERTING_P]
>   (2) template tuple(tuple) -> tuple
>   (3) template tuple(T) -> tuple
> 
> Then, in do_class_deduction, we prune this set, and get rid of (1).
> Then overload resolution selects (3) and we succeed.
> 
> But [over.match.list]p1 says "In copy-list-initialization, if an explicit
> constructor is chosen, the initialization is ill-formed."  It also goes
> on to say that this differs from other situations where only converting
> constructors are considered for copy-initialization.  Therefore for
> list-initialization we consider explicit constructors and complain if one
> is chosen.  E.g. convert_like_internal/ck_user can give an error.
> 
> So my logic runs that we should not prune the deduction_guides_for guides
> in a copy-list-initialization context, and only complain if we actually
> choose an explicit deduction guide.  This matches clang++/EDG/msvc++.
> 
> Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?
> 
> gcc/cp/ChangeLog:
> 
>   PR c++/90210
>   * pt.c (do_class_deduction): Don't prune explicit deduction guides
>   in copy-list-initialization.  In copy-list-initialization, if an
>   explicit deduction guide was selected, give an error.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR c++/90210
>   * g++.dg/cpp1z/class-deduction73.C: New test.
> ---
>  gcc/cp/pt.c   | 49 ++-
>  .../g++.dg/cpp1z/class-deduction73.C  | 41 
>  2 files changed, 79 insertions(+), 11 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/cpp1z/class-deduction73.C
> 
> diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
> index cfe5ff4a94f..9bcb743dc1d 100644
> --- a/gcc/cp/pt.c
> +++ b/gcc/cp/pt.c
> @@ -28929,6 +28929,7 @@ do_class_deduction (tree ptype, tree tmpl, tree init,
>tree type = TREE_TYPE (tmpl);
>  
>bool try_list_ctor = false;
> +  bool list_init_p = false;
>  
>releasing_vec rv_args = NULL;
>vec * = *_args;
> @@ -28936,6 +28937,7 @@ do_class_deduction (tree ptype, tree tmpl, tree init,
>  args = make_tree_vector ();
>else if (BRACE_ENCLOSED_INITIALIZER_P (init))
>  {
> +  list_init_p = true;
>try_list_ctor = TYPE_HAS_LIST_CTOR (type);
>if (try_list_ctor && CONSTRUCTOR_NELTS (init) == 1)
>   {
> @@ -28967,9 +28969,10 @@ do_class_deduction (tree ptype, tree tmpl, tree init,
>if (cands == error_mark_node)
>  return error_mark_node;
>  
> -  /* Prune explicit deduction guides in copy-initialization context.  */
> +  /* Prune explicit deduction guides in copy-initialization context (but
> + not copy-list-initialization).  */
>bool elided = false;
> -  if (flags & LOOKUP_ONLYCONVERTING)
> +  if (!list_init_p && (flags & LOOKUP_ONLYCONVERTING))
>  {
>for (lkp_iterator iter (cands); !elided && iter; ++iter)
>   if (DECL_NONCONVERTING_P (STRIP_TEMPLATE (*iter)))
> @@ -29038,18 +29041,42 @@ do_class_deduction (tree ptype, tree tmpl, tree 
> init,
>--cp_unevaluated_operand;
>  }
>  
> -  if (call == error_mark_node
> -  && (complain & tf_warning_or_error))
> +  if (call == error_mark_node)
>  {
> -  error ("class template argument deduction failed:");
> +  if (complain & tf_warning_or_error)
> + {
> +   error ("class template argument deduction failed:");
>  
> -  ++cp_unevaluated_operand;
> -  call = build_new_function_call (cands, , complain | tf_decltype);
> -  --cp_unevaluated_operand;
> +   ++cp_unevaluated_operand;
> +   call = build_new_function_call (cands, ,
> +   complain | tf_decltype);
> +   --cp_unevaluated_operand;
>  
> -  if (elided)
> - inform (input_location, "explicit deduction guides not considered "
> - "for copy-initialization");
> +   if (elided)
> + inform (input_location, "explicit deduction guides not considered "
> + "for copy-initialization");
> + }
> +  return error_mark_node;
> +}
> +  /* [over.match.list]/1: In copy-list-initialization, if an explicit
> + constructor is chosen, the initialization is ill-formed.  */
> +  else if (flags & LOOKUP_ONLYCONVERTING)
> +{
> +  tree fndecl = cp_get_callee_fndecl_nofold (call);
> +  if (fndecl && DECL_NONCONVERTING_P (fndecl))
> + {
> +   if (complain & 

Another issue on RS6000 target. Re: One issue with default implementation of zero_call_used_regs

2020-09-29 Thread Qing Zhao via Gcc-patches
Hi, Richard,

At the same time testing aarch64, I also tested the default implementation on 
rs6000 target. 

The default implementation now is:

+/* The default hook for TARGET_ZERO_CALL_USED_REGS.  */
+
+HARD_REG_SET
+default_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs)
+{
+  gcc_assert (!hard_reg_set_empty_p (need_zeroed_hardregs));
+
+  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+if (TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
+  {
+   machine_mode mode = reg_raw_mode[regno];
+   rtx reg = gen_rtx_REG (mode, regno);
+   emit_move_insn (reg, const0_rtx);
+  }
+  return need_zeroed_hardregs;
+}
+

With the small testing case:
int
test ()
{
  return 1;
}

If I compiled it with 

/home/qinzhao/Install/latest/bin/gcc -O2 -fzero-call-used-regs=all-arg t.c

It will failed as:

t.c: In function ‘test’:
t.c:6:1: error: insn does not satisfy its constraints:
6 | }
  | ^
(insn 28 27 29 (set (reg:DI 33 1)
(const_int 0 [0])) "t.c":6:1 647 {*movdi_internal64}
 (nil))
during RTL pass: shorten
dump file: t.c.319r.shorten
t.c:6:1: internal compiler error: in extract_constrain_insn_cached, at 
recog.c:2207
0x1018d693 _fatal_insn(char const*, rtx_def const*, char const*, int, char 
const*)
../../latest-gcc-x86/gcc/rtl-error.c:108
0x1018d6e7 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*)
../../latest-gcc-x86/gcc/rtl-error.c:118
0x1099a82b extract_constrain_insn_cached(rtx_insn*)
../../latest-gcc-x86/gcc/recog.c:2207
0x11393917 insn_min_length(rtx_insn*)
../../latest-gcc-x86/gcc/config/rs6000/rs6000.md:721
0x105bece3 shorten_branches(rtx_insn*)
../../latest-gcc-x86/gcc/final.c:1118


As I checked, when the FP registers are zeroed, the above failure happened.

I suspect that the issue still relate to the following statement:

machine_mode mode = reg_raw_mode[regno];

As I checked, reg_raw_mode always returns the integer mode that can be held
by the hard register, even when it's an FP register.

So, I still wondering:

1. Is there another available utility routine that returns the proper MODE for 
the hard registers that can be readily used to zero the hard register?
2. If not, should I add one more target hook for this purpose? i.e 

/* Return the proper machine mode that can be used to zero this hard register 
specified by REGNO.  */
machine_mode zero-call-used-regs-mode (unsigned int REGNO)

3. Or should I just delete the default implemeantion, and let the target to 
implement it.

Thanks.

Qing
> 
> 
> Thanks for testing aarch64.  I think there are two issues here,
> one in the patch and one in the aarch64 backend:
> 
> - the patch should use emit_move_insn rather than use gen_rtx_SET directly.
> 
> - the aarch64 backend doesn't handle zeroing TImode vector registers,
>  but should.  E.g. for:
> 
>void
>foo ()
>{
>  register __int128_t q0 asm ("q0");
>  q0 = 0;
>  asm volatile ("" :: "w" (q0));
>}
> 
>  we generate:
> 
>mov x0, 0
>mov x1, 0
>fmovd0, x0
>fmovv0.d[1], x1
> 
>  which is, er, somewhat suboptimal.
> 
> I'll try to fix the aarch64 bug for Monday next week.
> 
> Thanks,
> Richard



[PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-09-29 Thread Christophe Lyon via Gcc-patches
When mi_delta is > 255 and -mpure-code is used, we cannot load delta
from code memory (like we do without -mpure-code).

This patch builds the value of mi_delta into r3 with a series of
movs/adds/lsls.

We also do some cleanup by not emitting the function address and delta
via .word directives at the end of the thunk since we don't use them
with -mpure-code.

No need for new testcases, this bug was already identified by
eg. pr46287-3.C

2020-09-29  Christophe Lyon  

gcc/
* config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in r3 and
do not emit function address and delta when -mpure-code is used.

k#   (use "git pull" to merge the remote branch into yours)
---
 gcc/config/arm/arm.c | 91 +---
 1 file changed, 66 insertions(+), 25 deletions(-)

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index ceeb91f..62abeb5 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -28342,9 +28342,43 @@ arm_thumb1_mi_thunk (FILE *file, tree, HOST_WIDE_INT 
delta,
 {
   if (mi_delta > 255)
{
- fputs ("\tldr\tr3, ", file);
- assemble_name (file, label);
- fputs ("+4\n", file);
+ /* With -mpure-code, we cannot load delta from the constant
+pool: we build it explicitly.  */
+ if (target_pure_code)
+   {
+ bool mov_done_p = false;
+ int i;
+
+ /* Emit upper 3 bytes if needed.  */
+ for (i = 0; i < 3; i++)
+   {
+ int byte = (mi_delta >> (8 * (3 - i))) & 0xff;
+
+ if (byte)
+   {
+ if (mov_done_p)
+   asm_fprintf (file, "\tadds\tr3, #%d\n", byte);
+ else
+   asm_fprintf (file, "\tmovs\tr3, #%d\n", byte);
+ mov_done_p = true;
+   }
+
+ if (mov_done_p)
+   asm_fprintf (file, "\tlsls\tr3, #8\n");
+   }
+
+ /* Emit lower byte if needed.  */
+ if (!mov_done_p)
+   asm_fprintf (file, "\tmovs\tr3, #%d\n", mi_delta & 0xff);
+ else if (mi_delta & 0xff)
+   asm_fprintf (file, "\tadds\tr3, #%d\n", mi_delta & 0xff);
+   }
+ else
+   {
+ fputs ("\tldr\tr3, ", file);
+ assemble_name (file, label);
+ fputs ("+4\n", file);
+   }
  asm_fprintf (file, "\t%ss\t%r, %r, r3\n",
   mi_op, this_regno, this_regno);
}
@@ -28380,30 +28414,37 @@ arm_thumb1_mi_thunk (FILE *file, tree, HOST_WIDE_INT 
delta,
fputs ("\tpop\t{r3}\n", file);
 
   fprintf (file, "\tbx\tr12\n");
-  ASM_OUTPUT_ALIGN (file, 2);
-  assemble_name (file, label);
-  fputs (":\n", file);
-  if (flag_pic)
+
+  /* With -mpure-code, we don't need to emit literals for the
+function address and delta since we emitted code to build
+them.  */
+  if (!target_pure_code)
{
- /* Output ".word .LTHUNKn-[3,7]-.LTHUNKPCn".  */
- rtx tem = XEXP (DECL_RTL (function), 0);
- /* For TARGET_THUMB1_ONLY the thunk is in Thumb mode, so the PC
-pipeline offset is four rather than eight.  Adjust the offset
-accordingly.  */
- tem = plus_constant (GET_MODE (tem), tem,
-  TARGET_THUMB1_ONLY ? -3 : -7);
- tem = gen_rtx_MINUS (GET_MODE (tem),
-  tem,
-  gen_rtx_SYMBOL_REF (Pmode,
-  ggc_strdup (labelpc)));
- assemble_integer (tem, 4, BITS_PER_WORD, 1);
-   }
-  else
-   /* Output ".word .LTHUNKn".  */
-   assemble_integer (XEXP (DECL_RTL (function), 0), 4, BITS_PER_WORD, 1);
+ ASM_OUTPUT_ALIGN (file, 2);
+ assemble_name (file, label);
+ fputs (":\n", file);
+ if (flag_pic)
+   {
+ /* Output ".word .LTHUNKn-[3,7]-.LTHUNKPCn".  */
+ rtx tem = XEXP (DECL_RTL (function), 0);
+ /* For TARGET_THUMB1_ONLY the thunk is in Thumb mode, so the PC
+pipeline offset is four rather than eight.  Adjust the offset
+accordingly.  */
+ tem = plus_constant (GET_MODE (tem), tem,
+  TARGET_THUMB1_ONLY ? -3 : -7);
+ tem = gen_rtx_MINUS (GET_MODE (tem),
+  tem,
+  gen_rtx_SYMBOL_REF (Pmode,
+  ggc_strdup (labelpc)));
+ assemble_integer (tem, 4, BITS_PER_WORD, 1);
+   }
+ else
+   /* Output ".word .LTHUNKn".  */
+   assemble_integer (XEXP (DECL_RTL (function), 0), 4, BITS_PER_WORD, 
1);
 
-  if (TARGET_THUMB1_ONLY && mi_delta > 255)
-   

Re: [PATCH 6/6] ipa-cp: Separate and increase the large-unit parameter

2020-09-29 Thread Jan Hubicka
> 
> gcc/ChangeLog:
> 
> 2020-09-07  Martin Jambor  
> 
>   * params.opt (ipa-cp-large-unit-insns): New parameter.
>   * ipa-cp.c (get_max_overall_size): Use the new parameter.
OK,
thanks!
Honza
> ---
>  gcc/ipa-cp.c   | 2 +-
>  gcc/params.opt | 4 
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 12acf24c553..2152f9e5876 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -3448,7 +3448,7 @@ static long
>  get_max_overall_size (cgraph_node *node)
>  {
>long max_new_size = orig_overall_size;
> -  long large_unit = opt_for_fn (node->decl, param_large_unit_insns);
> +  long large_unit = opt_for_fn (node->decl, param_ipa_cp_large_unit_insns);
>if (max_new_size < large_unit)
>  max_new_size = large_unit;
>int unit_growth = opt_for_fn (node->decl, param_ipa_cp_unit_growth);
> diff --git a/gcc/params.opt b/gcc/params.opt
> index acb59f17e45..9d177ab50ad 100644
> --- a/gcc/params.opt
> +++ b/gcc/params.opt
> @@ -218,6 +218,10 @@ Percentage penalty functions containing a single call to 
> another function will r
>  Common Joined UInteger Var(param_ipa_cp_unit_growth) Init(10) Param 
> Optimization
>  How much can given compilation unit grow because of the interprocedural 
> constant propagation (in percent).
>  
> +-param=ipa-cp-large-unit-insns=
> +Common Joined UInteger Var(param_ipa_cp_large_unit_insns) Optimization 
> Init(16000) Param
> +The size of translation unit that IPA-CP pass considers large.
> +
>  -param=ipa-cp-value-list-size=
>  Common Joined UInteger Var(param_ipa_cp_value_list_size) Init(8) Param 
> Optimization
>  Maximum size of a list of values associated with each parameter for 
> interprocedural constant propagation.
> -- 
> 2.28.0


[PATCH] libgomp: Enforce 1-thread limit in subteams

2020-09-29 Thread Andrew Stubbs
My recent patch to fix barriers in nested teams relied on the assumption 
that nested teams would only ever have one thread each.


However, that can be changed by altering the ICVs, via runtime call or 
environment variable (not that the accelerator-side libgomp can see the 
host environment), so it wasn't completely safe.


This patch ensures that the previous assumption is safe, by ignoring the 
relevant ICV on NVPTX and AMD GCN, neither of which can support it.


OK to commit?

Andrew
libgomp: Enforce 1-thread limit in subteams

Accelerators with fixed thread-counts will break if nested teams are expected
to have multiple threads each.

libgomp/ChangeLog:

2020-09-29  Andrew Stubbs  

	* parallel.c (gomp_resolve_num_threads): Ignore nest_var on nvptx
	and amdgcn targets.

diff --git a/libgomp/parallel.c b/libgomp/parallel.c
index 2423f11f44a..0618056a7fe 100644
--- a/libgomp/parallel.c
+++ b/libgomp/parallel.c
@@ -48,7 +48,14 @@ gomp_resolve_num_threads (unsigned specified, unsigned count)
 
   if (specified == 1)
 return 1;
-  else if (thr->ts.active_level >= 1 && !icv->nest_var)
+
+  /* Accelerators with fixed thread counts require this to return 1 for
+ nested parallel regions.  */
+  if (thr->ts.active_level >= 1
+#if !defined(__AMDGCN__) && !defined(__nvptx__)
+  && !icv->nest_var
+#endif
+  )
 return 1;
   else if (thr->ts.active_level >= gomp_max_active_levels_var)
 return 1;


c++: Hiddenness is a property of the symbol table

2020-09-29 Thread Nathan Sidwell

This patch moves the handling of decl-hiddenness entirely into the
name lookup machinery, where it belongs.  We need a few new flags,
because pressing the existing OVL_HIDDEN_P into play for non-function
decls doesn't work well.  For a local binding we only need one marker,
as there cannot be both a hidden implicit typedef and a hidden
function.  That's not true for namespace-scope, where they could both
be hidden.

The name-lookup machinery maintains the existing decl_hidden and co
flags, and asserts have been sprinkled around to make sure they are
consistent.  The next series of patches will remove those old markers.
(we'll need to keep one, as there are some special restrictions on
redeclaring friend functions with in-class definitions or default args.)

gcc/cp/
* cp-tree.h (ovl_insert): Change final parm to hidden-or-using
indicator.
* name-lookup.h (HIDDEN_TYPE_BINDING_P): New.
(struct cxx_binding): Add type_is_hidden flag.
* tree.c (ovl_insert): Change using_p parm to using_or_hidden,
adjust.
(ovl_skip_hidden): Assert we never see a naked hidden decl.
* decl.c (xref_tag_1): Delete unhiding friend from here (moved to
lookup_elaborated_type_1).
* name-lookup.c (STAT_TYPE_HIDDEN_P, STAT_DECL_HIDDEN_P): New.
(name_lookup::search_namespace_only): Check new hidden markers.
(cxx_binding_make): Clear HIDDEN_TYPE_BINDING_P.
(update_binding): Update new hidden markers.
(lookup_name_1): Check HIDDEN_TYPE_BINDING_P and simplify friend
ignoring.
(lookup_elaborated_type_1): Use new hidden markers.  Reveal the
decl here.

pushed to trunk

nathan
--
Nathan Sidwell
diff --git i/gcc/cp/cp-tree.h w/gcc/cp/cp-tree.h
index b7f5b6b399f..a25934e3263 100644
--- i/gcc/cp/cp-tree.h
+++ w/gcc/cp/cp-tree.h
@@ -7371,7 +7371,7 @@ inline tree ovl_first(tree) ATTRIBUTE_PURE;
 extern tree ovl_make(tree fn,
 		 tree next = NULL_TREE);
 extern tree ovl_insert(tree fn, tree maybe_ovl,
-		 bool using_p = false);
+		 int using_or_hidden = 0);
 extern tree ovl_skip_hidden			(tree) ATTRIBUTE_PURE;
 extern void lookup_mark(tree lookup, bool val);
 extern tree lookup_add(tree fns, tree lookup);
diff --git i/gcc/cp/decl.c w/gcc/cp/decl.c
index c00b996294e..617b96e02e4 100644
--- i/gcc/cp/decl.c
+++ w/gcc/cp/decl.c
@@ -15089,22 +15089,9 @@ xref_tag_1 (enum tag_types tag_code, tree name,
 	  return error_mark_node;
 	}
 
-  if (how != TAG_how::HIDDEN_FRIEND && TYPE_HIDDEN_P (t))
-	{
-	  /* This is no longer an invisible friend.  Make it
-	 visible.  */
-	  tree decl = TYPE_NAME (t);
-
-	  DECL_ANTICIPATED (decl) = false;
-	  DECL_FRIEND_P (decl) = false;
-
-	  if (TYPE_TEMPLATE_INFO (t))
-	{
-	  tree tmpl = TYPE_TI_TEMPLATE (t);
-	  DECL_ANTICIPATED (tmpl) = false;
-	  DECL_FRIEND_P (tmpl) = false;
-	}
-	}
+  gcc_checking_assert (how == TAG_how::HIDDEN_FRIEND
+			   || !(DECL_LANG_SPECIFIC (TYPE_NAME (t))
+&& DECL_ANTICIPATED (TYPE_NAME (t;
 }
 
   return t;
diff --git i/gcc/cp/name-lookup.c w/gcc/cp/name-lookup.c
index 89f1a4c5d64..bc60d343f7e 100644
--- i/gcc/cp/name-lookup.c
+++ w/gcc/cp/name-lookup.c
@@ -55,6 +55,15 @@ static name_hint suggest_alternatives_for_1 (location_t location, tree name,
 #define MAYBE_STAT_DECL(N) (STAT_HACK_P (N) ? STAT_DECL (N) : N)
 #define MAYBE_STAT_TYPE(N) (STAT_HACK_P (N) ? STAT_TYPE (N) : NULL_TREE)
 
+/* For regular (maybe) overloaded functions, we have OVL_HIDDEN_P.
+   But we also need to indicate hiddenness on implicit type decls
+   (injected friend classes), and (coming soon) decls injected from
+   block-scope externs.  It is too awkward to press the existing
+   overload marking for that.  If we have a hidden non-function, we
+   always create a STAT_HACK, and use these two markers as needed.  */
+#define STAT_TYPE_HIDDEN_P(N) OVL_HIDDEN_P (N)
+#define STAT_DECL_HIDDEN_P(N) OVL_DEDUP_P (N)
+
 /* Create a STAT_HACK node with DECL as the value binding and TYPE as
the type binding.  */
 
@@ -545,14 +554,18 @@ name_lookup::search_namespace_only (tree scope)
 	{
 	  type = STAT_TYPE (value);
 	  value = STAT_DECL (value);
-  
-	  if (!bool (want & LOOK_want::HIDDEN_FRIEND)
-	  && DECL_LANG_SPECIFIC (type)
-	  && DECL_ANTICIPATED (type))
-	type = NULL_TREE;
+	  
+	  if (!bool (want & LOOK_want::HIDDEN_FRIEND))
+	{
+	  if (STAT_TYPE_HIDDEN_P (*binding))
+		type = NULL_TREE;
+	  if (STAT_DECL_HIDDEN_P (*binding))
+		value = NULL_TREE;
+	  else
+		value = ovl_skip_hidden (value);
+	}
 	}
-
-  if (!bool (want & LOOK_want::HIDDEN_FRIEND))
+  else if (!bool (want & LOOK_want::HIDDEN_FRIEND))
 	value = ovl_skip_hidden (value);
 
   found |= process_binding (value, type);
@@ -1975,6 +1988,7 @@ cxx_binding_make (tree value, tree type)
   /* Clear flags by default.  */
   LOCAL_BINDING_P (binding) = false;
   INHERITED_VALUE_BINDING_P (binding) = 

Re: [PATCH] aarch64: Add extend-as-extract-with-shift pattern [PR96998]

2020-09-29 Thread Segher Boessenkool
On Tue, Sep 29, 2020 at 11:36:12AM +0100, Alex Coplan wrote:
> Is the combine change (a canonicalization fix, as described below) OK
> for trunk in light of this info?

Can you please resend it with correct info and a corresponding commit
message?


Segher


Re: [rs6000] Avoid useless masking of count operand for rotation

2020-09-29 Thread Segher Boessenkool
Hi!

[ Please CC: me on rs6000 patches.  Thanks! ]

On Tue, Sep 29, 2020 at 12:26:28PM +0200, Eric Botcazou wrote:
> and the masking is present all the way down to the assembly at -O2:
> 
> rlwinm 4,4,0,27,31
> rotlw 3,3,4
> 
> Now this masking is redundant since it's done by the hardware so it would be 
> nice to get rid of it.  I have attached a couple of patches to that effect: 
> the first one adds new instructions while the second one only adds splitters.

>   * config/rs6000/rs6000.md (*rotl3_mask): New.
>   (*rotlsi3_mask_64): Likewise.
>   (*rotl3_dot): Change to use P mode iterator.
>   (*rotl3_mask_dot): New.
>   (*rotl3_dot2): Change to use P mode iterator.
>   (*rotl3_mask_dot2): New.

Don't call it "mask" please: *all* of our basic rotate instructions
already have something called "mask" (that is the "m" in "rlwnm" for
example; and "rotlw d,a,b" is just an extended mnemonic for
"rlwnm d,a,b,0,31").  The hardware also does not mask the shift amount
at all (instead, only the low 5 bits of RB *are* the shift amount).

> +;; Avoid useless masking of count operand

(Sentences end in a full stop.)

> +(define_insn "*rotl3_mask"
> +  [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
> + (rotate:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
> + (and:GPR (match_operand:GPR 2 "gpc_reg_operand" "r")
> +  (match_operand:GPR 3 "const_int_operand" "n"]
> +  "(UINTVAL (operands[3]) & (GET_MODE_BITSIZE (mode) - 1))
> +   == (unsigned HOST_WIDE_INT) (GET_MODE_BITSIZE (mode) - 1)"

(Useless casts are useless.)

Don't mask operands[3] please (in the UINTVAL): RTL with the number
outside of that range is *undefined*.  So just check that it is equal?

>  (define_insn_and_split "*rotl3_dot"
>[(set (match_operand:CC 3 "cc_reg_operand" "=x,?y")
> - (compare:CC (rotate:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r")
> - (match_operand:SI 2 "reg_or_cint_operand" 
> "rn,rn"))
> + (compare:CC (rotate:P (match_operand:P 1 "gpc_reg_operand" "r,r")
> +   (match_operand:SI 2 "reg_or_cint_operand" 
> "rn,rn"))
>   (const_int 0)))
> -   (clobber (match_scratch:GPR 0 "=r,r"))]
> -  "mode == Pmode"
> +   (clobber (match_scratch:P 0 "=r,r"))]
> +  ""
>"@
> rotl%I2. %0,%1,%2
> #"
> -  "&& reload_completed && cc_reg_not_cr0_operand (operands[3], CCmode)"
> +  "reload_completed && cc_reg_not_cr0_operand (operands[3], CCmode)"
>[(set (match_dup 0)
> - (rotate:GPR (match_dup 1)
> - (match_dup 2)))
> + (rotate:P (match_dup 1)
> +   (match_dup 2)))
> (set (match_dup 3)
>   (compare:CC (match_dup 0)
>   (const_int 0)))]

Why?  This diverges the "dot" version from the non-dot for no reason.

(We can do 32-bit rotates on 64-bit implementations just fine, and even
the record ("dot") form works just fine; except for the setting of
"smaller than" in CR0.  And we can fix that even (by not using 0,31 but
a wrapping mask, say, 1,0), but more readable generated code was more
important so far.)


I don't see a patch with splitters only?  Huh.  Did you attach the same
patch twice?

Since this won't be handled before combine (or what do I miss?), it is
fine to do splitters only (splitters for combine).  But the other
approach is fine as well.

Thanks,


Segher


Re: V2 [PATCH] x86: Replace with

2020-09-29 Thread Jakub Jelinek via Gcc-patches
On Tue, Sep 29, 2020 at 11:58:39AM -0700, H.J. Lu wrote:
> Here is the V2 patch.  OK for master and GCC 10 branches?

Yes, thanks.

Jakub



V2 [PATCH] x86: Replace with

2020-09-29 Thread H.J. Lu via Gcc-patches
On Tue, Sep 29, 2020 at 11:49 AM Jakub Jelinek  wrote:
>
> On Tue, Sep 29, 2020 at 11:46:24AM -0700, H.J. Lu via Gcc-patches wrote:
> > Fix a typo in config/i386/enqcmdintrin.h by replacing 
> > with :
> >
> > [hjl@gnu-cfl-2 x86-gcc]$ echo "#include " | gcc -S -o 
> > /dev/null -x c -
> > In file included from :1:
> > /usr/lib/gcc/x86_64-redhat-linux/10/include/enqcmdintrin.h:25:3: error: 
> > #error "Never use  directly; include  instead."
> >25 | # error "Never use  directly; include  
> > instead."
> >   |   ^
> > [hjl@gnu-cfl-2 x86-gcc]$
> >
> > gcc/
> >
> >   PR target/97247
> >   * config/i386/enqcmdintrin.h: Replace  with
> >   .
>
> >  gcc/config/i386/enqcmdintrin.h | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/gcc/config/i386/enqcmdintrin.h b/gcc/config/i386/enqcmdintrin.h
> > index 4b2efcb9a20..72ef1ca7b5d 100644
> > --- a/gcc/config/i386/enqcmdintrin.h
> > +++ b/gcc/config/i386/enqcmdintrin.h
> > @@ -22,7 +22,7 @@
> > .  */
> >
> >  #if !defined _IMMINTRIN_H_INCLUDED
> > -# error "Never use  directly; include  
> > instead."
> > +# error "Never use  directly; include  
> > instead."
>
> That isn't the only typo.
> >  #endif
> >
> >  #ifndef _ENQCMDNTRIN_H_INCLUDED
>
> The I is missing in this macro too.
>

Fixed.

Here is the V2 patch.  OK for master and GCC 10 branches?

Thanks.

-- 
H.J.
From b135b3b2e016f2095bd9c2ae7da10175cf082adb Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Tue, 29 Sep 2020 11:40:46 -0700
Subject: [PATCH] x86: Replace  with 

Fix 2 typos in config/i386/enqcmdintrin.h by replacing 
with :

[hjl@gnu-cfl-2 x86-gcc]$ echo "#include " | gcc -S -o /dev/null -x c -
In file included from :1:
/usr/lib/gcc/x86_64-redhat-linux/10/include/enqcmdintrin.h:25:3: error: #error "Never use  directly; include  instead."
   25 | # error "Never use  directly; include  instead."
  |   ^
[hjl@gnu-cfl-2 x86-gcc]$

and _ENQCMDNTRIN_H_INCLUDED with _ENQCMDINTRIN_H_INCLUDED.

gcc/

	PR target/97247
	* config/i386/enqcmdintrin.h: Replace  with
	.  Replace _ENQCMDNTRIN_H_INCLUDED with
	_ENQCMDINTRIN_H_INCLUDED.
---
 gcc/config/i386/enqcmdintrin.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/enqcmdintrin.h b/gcc/config/i386/enqcmdintrin.h
index 4b2efcb9a20..721dfb2ca92 100644
--- a/gcc/config/i386/enqcmdintrin.h
+++ b/gcc/config/i386/enqcmdintrin.h
@@ -22,11 +22,11 @@
.  */
 
 #if !defined _IMMINTRIN_H_INCLUDED
-# error "Never use  directly; include  instead."
+# error "Never use  directly; include  instead."
 #endif
 
-#ifndef _ENQCMDNTRIN_H_INCLUDED
-#define _ENQCMDNTRIN_H_INCLUDED
+#ifndef _ENQCMDINTRIN_H_INCLUDED
+#define _ENQCMDINTRIN_H_INCLUDED
 
 #ifndef __ENQCMD__
 #pragma GCC push_options
@@ -52,4 +52,4 @@ _enqcmds (void * __P, const void * __Q)
 #undef __DISABLE_ENQCMD__
 #pragma GCC pop_options
 #endif /* __DISABLE_ENQCMD__ */
-#endif /* _ENQCMDNTRIN_H_INCLUDED.  */
+#endif /* _ENQCMDINTRIN_H_INCLUDED.  */
-- 
2.26.2



Re: [PATCH] x86: Replace with

2020-09-29 Thread Jakub Jelinek via Gcc-patches
On Tue, Sep 29, 2020 at 11:46:24AM -0700, H.J. Lu via Gcc-patches wrote:
> Fix a typo in config/i386/enqcmdintrin.h by replacing 
> with :
> 
> [hjl@gnu-cfl-2 x86-gcc]$ echo "#include " | gcc -S -o 
> /dev/null -x c -
> In file included from :1:
> /usr/lib/gcc/x86_64-redhat-linux/10/include/enqcmdintrin.h:25:3: error: 
> #error "Never use  directly; include  instead."
>25 | # error "Never use  directly; include  
> instead."
>   |   ^
> [hjl@gnu-cfl-2 x86-gcc]$
> 
> gcc/
> 
>   PR target/97247
>   * config/i386/enqcmdintrin.h: Replace  with
>   .

>  gcc/config/i386/enqcmdintrin.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/gcc/config/i386/enqcmdintrin.h b/gcc/config/i386/enqcmdintrin.h
> index 4b2efcb9a20..72ef1ca7b5d 100644
> --- a/gcc/config/i386/enqcmdintrin.h
> +++ b/gcc/config/i386/enqcmdintrin.h
> @@ -22,7 +22,7 @@
> .  */
>  
>  #if !defined _IMMINTRIN_H_INCLUDED
> -# error "Never use  directly; include  instead."
> +# error "Never use  directly; include  instead."

That isn't the only typo.
>  #endif
>  
>  #ifndef _ENQCMDNTRIN_H_INCLUDED

The I is missing in this macro too.

Jakub



[PATCH] x86: Replace with

2020-09-29 Thread H.J. Lu via Gcc-patches
Fix a typo in config/i386/enqcmdintrin.h by replacing 
with :

[hjl@gnu-cfl-2 x86-gcc]$ echo "#include " | gcc -S -o /dev/null 
-x c -
In file included from :1:
/usr/lib/gcc/x86_64-redhat-linux/10/include/enqcmdintrin.h:25:3: error: #error 
"Never use  directly; include  instead."
   25 | # error "Never use  directly; include  
instead."
  |   ^
[hjl@gnu-cfl-2 x86-gcc]$

gcc/

PR target/97247
* config/i386/enqcmdintrin.h: Replace  with
.
---
 gcc/config/i386/enqcmdintrin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/enqcmdintrin.h b/gcc/config/i386/enqcmdintrin.h
index 4b2efcb9a20..72ef1ca7b5d 100644
--- a/gcc/config/i386/enqcmdintrin.h
+++ b/gcc/config/i386/enqcmdintrin.h
@@ -22,7 +22,7 @@
.  */
 
 #if !defined _IMMINTRIN_H_INCLUDED
-# error "Never use  directly; include  instead."
+# error "Never use  directly; include  instead."
 #endif
 
 #ifndef _ENQCMDNTRIN_H_INCLUDED
-- 
2.26.2



Re: [PATCH 5/6] ipa-cp: Add dumping of overall_size after cloning

2020-09-29 Thread Jan Hubicka
> When experimenting with IPA-CP parameters, especially when looking
> into exchange2_r, it has been very useful to know what the value of
> overall_size is at different stages of the decision process.  This
> patch therefore adds it to the generated dumps.
> 
> gcc/ChangeLog:
> 
> 2020-09-07  Martin Jambor  
> 
>   * ipa-cp.c (estimate_local_effects): Add overall_size to dumped
>   string.
>   (decide_about_value): Add dumping new overall_size.
OK,
thanks
Honza
> ---
>  gcc/ipa-cp.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index f6320c787de..12acf24c553 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -3517,7 +3517,8 @@ estimate_local_effects (struct cgraph_node *node)
>  
> if (dump_file)
>   fprintf (dump_file, " Decided to specialize for all "
> -  "known contexts, growth deemed beneficial.\n");
> +  "known contexts, growth (to %li) deemed "
> +  "beneficial.\n", overall_size);
>   }
> else if (dump_file && (dump_flags & TDF_DETAILS))
>   fprintf (dump_file, "  Not cloning for all contexts because "
> @@ -5506,6 +5507,9 @@ decide_about_value (struct cgraph_node *node, int 
> index, HOST_WIDE_INT offset,
>val->spec_node = create_specialized_node (node, known_csts, known_contexts,
>   aggvals, callers);
>overall_size += val->local_size_cost;
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +fprintf (dump_file, " overall size reached %li\n",
> +  overall_size);
>  
>/* TODO: If for some lattice there is only one other known value
>   left, make a special node for it too. */
> -- 
> 2.28.0
> 


Re: [PATCH 3/6] ipa: Bundle estimates of ipa_call_context::estimate_size_and_time

2020-09-29 Thread Jan Hubicka
> A subsequent patch adds another two estimates that the code in
> ipa_call_context::estimate_size_and_time computes, and the fact that
> the function has a special output parameter for each thing it computes
> would make it have just too many.  Therefore, this patch collapses all
> those ouptut parameters into one output structure.
> 
> gcc/ChangeLog:
> 
> 2020-09-02  Martin Jambor  
> 
>   * ipa-inline-analysis.c (do_estimate_edge_time): Adjusted to use
>   ipa_call_estimates.
>   (do_estimate_edge_size): Likewise.
>   (do_estimate_edge_hints): Likewise.
>   * ipa-fnsummary.h (struct ipa_call_estimates): New type.
>   (ipa_call_context::estimate_size_and_time): Adjusted declaration.
>   (estimate_ipcp_clone_size_and_time): Likewise.
>   * ipa-cp.c (hint_time_bonus): Changed the type of the second argument
>   to ipa_call_estimates.
>   (perform_estimation_of_a_value): Adjusted to use ipa_call_estimates.
>   (estimate_local_effects): Likewise.
>   * ipa-fnsummary.c (ipa_call_context::estimate_size_and_time): Adjusted
>   to return estimates in a single ipa_call_estimates parameter.
>   (estimate_ipcp_clone_size_and_time): Likewise.
OK,
Honza
> ---
>  gcc/ipa-cp.c  | 45 ++---
>  gcc/ipa-fnsummary.c   | 60 +++
>  gcc/ipa-fnsummary.h   | 36 +--
>  gcc/ipa-inline-analysis.c | 47 +-
>  4 files changed, 105 insertions(+), 83 deletions(-)
> 
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 292dd7e5bdf..77c84a6ed5d 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -3196,12 +3196,13 @@ devirtualization_time_bonus (struct cgraph_node *node,
>return res;
>  }
>  
> -/* Return time bonus incurred because of HINTS.  */
> +/* Return time bonus incurred because of hints stored in ESTIMATES.  */
>  
>  static int
> -hint_time_bonus (cgraph_node *node, ipa_hints hints)
> +hint_time_bonus (cgraph_node *node, const ipa_call_estimates )
>  {
>int result = 0;
> +  ipa_hints hints = estimates.hints;
>if (hints & (INLINE_HINT_loop_iterations | INLINE_HINT_loop_stride))
>  result += opt_for_fn (node->decl, param_ipa_cp_loop_hint_bonus);
>return result;
> @@ -3397,15 +3398,13 @@ perform_estimation_of_a_value (cgraph_node *node,
>  int removable_params_cost, int est_move_cost,
>  ipcp_value_base *val)
>  {
> -  int size, time_benefit;
> -  sreal time, base_time;
> -  ipa_hints hints;
> +  int time_benefit;
> +  ipa_call_estimates estimates;
>  
> -  estimate_ipcp_clone_size_and_time (node, avals, , ,
> -  _time, );
> -  base_time -= time;
> -  if (base_time > 65535)
> -base_time = 65535;
> +  estimate_ipcp_clone_size_and_time (node, avals, );
> +  sreal time_delta = estimates.nonspecialized_time - estimates.time;
> +  if (time_delta > 65535)
> +time_delta = 65535;
>  
>/* Extern inline functions have no cloning local time benefits because they
>   will be inlined anyway.  The only reason to clone them is if it enables
> @@ -3413,11 +3412,12 @@ perform_estimation_of_a_value (cgraph_node *node,
>if (DECL_EXTERNAL (node->decl) && DECL_DECLARED_INLINE_P (node->decl))
>  time_benefit = 0;
>else
> -time_benefit = base_time.to_int ()
> +time_benefit = time_delta.to_int ()
>+ devirtualization_time_bonus (node, avals)
> -  + hint_time_bonus (node, hints)
> +  + hint_time_bonus (node, estimates)
>+ removable_params_cost + est_move_cost;
>  
> +  int size = estimates.size;
>gcc_checking_assert (size >=0);
>/* The inliner-heuristics based estimates may think that in certain
>   contexts some functions do not have any size at all but we want
> @@ -3472,23 +3472,21 @@ estimate_local_effects (struct cgraph_node *node)
>|| (removable_params_cost && node->can_change_signature))
>  {
>struct caller_statistics stats;
> -  ipa_hints hints;
> -  sreal time, base_time;
> -  int size;
> +  ipa_call_estimates estimates;
>  
>init_caller_stats ();
>node->call_for_symbol_thunks_and_aliases (gather_caller_stats, ,
> false);
> -  estimate_ipcp_clone_size_and_time (node, , , ,
> -  _time, );
> -  time -= devirt_bonus;
> -  time -= hint_time_bonus (node, hints);
> -  time -= removable_params_cost;
> -  size -= stats.n_calls * removable_params_cost;
> +  estimate_ipcp_clone_size_and_time (node, , );
> +  sreal time = estimates.nonspecialized_time - estimates.time;
> +  time += devirt_bonus;
> +  time += hint_time_bonus (node, estimates);
> +  time += removable_params_cost;
> +  int size = estimates.size - stats.n_calls * removable_params_cost;
>  
>if (dump_file)
>   fprintf (dump_file, " - context independent 

Re: [PATCH 2/6] ipa: Introduce ipa_cached_call_context

2020-09-29 Thread Jan Hubicka
> Hi,
> 
> as we discussed with Honza on the mailing list last week, making
> cached call context structure distinct from the normal one may make it
> clearer that the cached data need to be explicitely deallocated.
> 
> This patch does that division.  It is not mandatory for the overall
> main goals of the patch set and can be dropped if deemed superfluous.
> 
> gcc/ChangeLog:
> 
> 2020-09-02  Martin Jambor  
> 
>   * ipa-fnsummary.h (ipa_cached_call_context): New forward declaration
>   and class.
>   (class ipa_call_context): Make friend ipa_cached_call_context.  Moved
>   methods duplicate_from and release to it too.
>   * ipa-fnsummary.c (ipa_call_context::duplicate_from): Moved to class
>   ipa_cached_call_context.
>   (ipa_call_context::release): Likewise, removed the parameter.
>   * ipa-inline-analysis.c (node_context_cache_entry): Change the type of
>   ctx to ipa_cached_call_context.
>   (do_estimate_edge_time): Remove parameter from the call to
>   ipa_cached_call_context::release.
OK,
thanks
Honza


[PATCH 5/6] ipa-cp: Add dumping of overall_size after cloning

2020-09-29 Thread Martin Jambor
When experimenting with IPA-CP parameters, especially when looking
into exchange2_r, it has been very useful to know what the value of
overall_size is at different stages of the decision process.  This
patch therefore adds it to the generated dumps.

gcc/ChangeLog:

2020-09-07  Martin Jambor  

* ipa-cp.c (estimate_local_effects): Add overall_size to dumped
string.
(decide_about_value): Add dumping new overall_size.
---
 gcc/ipa-cp.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index f6320c787de..12acf24c553 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -3517,7 +3517,8 @@ estimate_local_effects (struct cgraph_node *node)
 
  if (dump_file)
fprintf (dump_file, " Decided to specialize for all "
-"known contexts, growth deemed beneficial.\n");
+"known contexts, growth (to %li) deemed "
+"beneficial.\n", overall_size);
}
  else if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "  Not cloning for all contexts because "
@@ -5506,6 +5507,9 @@ decide_about_value (struct cgraph_node *node, int index, 
HOST_WIDE_INT offset,
   val->spec_node = create_specialized_node (node, known_csts, known_contexts,
aggvals, callers);
   overall_size += val->local_size_cost;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+fprintf (dump_file, " overall size reached %li\n",
+overall_size);
 
   /* TODO: If for some lattice there is only one other known value
  left, make a special node for it too. */
-- 
2.28.0



[PATCH 1/6] ipa: Bundle vectors describing argument values

2020-09-29 Thread Martin Jambor
Hi,

this large patch is mostly mechanical change which aims to replace
uses of separate vectors about known scalar values (usually called
known_vals or known_csts), known aggregate values (known_aggs), known
virtual call contexts (known_contexts) and known value
ranges (known_value_ranges) with uses of either new type
ipa_call_arg_values or ipa_auto_call_arg_values, both of which simply
contain these vectors inside them.

The need for two distinct comes from the fact that when the vectors
are constructed from jump functions or lattices, we really should use
auto_vecs with embedded storage allocated on stack.  On the other hand,
the bundle in ipa_call_context can be allocated on heap when in cache,
one time for each call_graph node.

ipa_call_context is constructible from ipa_auto_call_arg_values but
then its vectors must not be resized, otherwise the vectors will stop
pointing to the stack ones.  Unfortunately, I don't think the
structure embedded in ipa_call_context can be made constant because we
need to manipulate and deallocate it when in cache.

gcc/ChangeLog:

2020-09-01  Martin Jambor  

* ipa-prop.h (ipa_auto_call_arg_values): New type.
(class ipa_call_arg_values): Likewise.
(ipa_get_indirect_edge_target): Replaced vector arguments with
ipa_call_arg_values in declaration.  Added an overload for
ipa_auto_call_arg_values.
* ipa-fnsummary.h (ipa_call_context): Removed members m_known_vals,
m_known_contexts, m_known_aggs, duplicate_from, release and equal_to,
new members m_avals, store_to_cache and equivalent_to_p.  Adjusted
constructor arguments.
(estimate_ipcp_clone_size_and_time): Replaced vector arguments
with ipa_auto_call_arg_values in declaration.
(evaluate_properties_for_edge): Likewise.
* ipa-cp.c (ipa_get_indirect_edge_target): Adjusted to work on
ipa_call_arg_values rather than on separate vectors.  Added an
overload for ipa_auto_call_arg_values.
(devirtualization_time_bonus): Adjusted to work on
ipa_auto_call_arg_values rather than on separate vectors.
(gather_context_independent_values): Adjusted to work on
ipa_auto_call_arg_values rather than on separate vectors.
(perform_estimation_of_a_value): Likewise.
(estimate_local_effects): Likewise.
(modify_known_vectors_with_val): Adjusted both variants to work on
ipa_auto_call_arg_values and rename them to
copy_known_vectors_add_val.
(decide_about_value): Adjusted to work on ipa_call_arg_values rather
than on separate vectors.
(decide_whether_version_node): Likewise.
* ipa-fnsummary.c (evaluate_conditions_for_known_args): Likewise.
(evaluate_properties_for_edge): Likewise.
(ipa_fn_summary_t::duplicate): Likewise.
(estimate_edge_devirt_benefit): Adjusted to work on
ipa_call_arg_values rather than on separate vectors.
(estimate_edge_size_and_time): Likewise.
(estimate_calls_size_and_time_1): Likewise.
(summarize_calls_size_and_time): Adjusted calls to
estimate_edge_size_and_time.
(estimate_calls_size_and_time): Adjusted to work on
ipa_call_arg_values rather than on separate vectors.
(ipa_call_context::ipa_call_context): Construct from a pointer to
ipa_auto_call_arg_values instead of individual vectors.
(ipa_call_context::duplicate_from): Adjusted to access vectors within
m_avals.
(ipa_call_context::release): Likewise.
(ipa_call_context::equal_to): Likewise.
(ipa_call_context::estimate_size_and_time): Adjusted to work on
ipa_call_arg_values rather than on separate vectors.
(estimate_ipcp_clone_size_and_time): Adjusted to work with
ipa_auto_call_arg_values rather than on separate vectors.
(ipa_merge_fn_summary_after_inlining): Likewise.  Adjusted call to
estimate_edge_size_and_time.
(ipa_update_overall_fn_summary): Adjusted call to
estimate_edge_size_and_time.
* ipa-inline-analysis.c (do_estimate_edge_time): Adjusted to work with
ipa_auto_call_arg_values rather than with separate vectors.
(do_estimate_edge_size): Likewise.
(do_estimate_edge_hints): Likewise.
* ipa-prop.c (ipa_auto_call_arg_values::~ipa_auto_call_arg_values):
New destructor.
---
 gcc/ipa-cp.c  | 245 ++---
 gcc/ipa-fnsummary.c   | 446 +-
 gcc/ipa-fnsummary.h   |  27 +--
 gcc/ipa-inline-analysis.c |  41 +---
 gcc/ipa-prop.c|  10 +
 gcc/ipa-prop.h| 112 +-
 6 files changed, 452 insertions(+), 429 deletions(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index b3e7d41ea10..292dd7e5bdf 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -3117,30 +3117,40 @@ ipa_get_indirect_edge_target_1 (struct cgraph_edge *ie,
   return target;
 }
 
-

[PATCH 4/6] ipa: Multiple predicates for loop properties, with frequencies

2020-09-29 Thread Martin Jambor
This patch enhances the ability of IPA to reason under what conditions
loops in a function have known iteration counts or strides because it
replaces single predicates which currently hold conjunction of
predicates for all loops with vectors capable of holding multiple
predicates, each with a cumulative frequency of loops with the
property.

This second property is then used by IPA-CP to much more aggressively
boost its heuristic score for cloning opportunities which make
iteration counts or strides of frequent loops compile time constant.

gcc/ChangeLog:

2020-09-03  Martin Jambor  

* ipa-fnsummary.h (ipa_freqcounting_predicate): New type.
(ipa_fn_summary): Change the type of loop_iterations and loop_strides
to vectors of ipa_freqcounting_predicate.
(ipa_fn_summary::ipa_fn_summary): Construct the new vectors.
(ipa_call_estimates): New fields loops_with_known_iterations and
loops_with_known_strides.
* ipa-cp.c (hint_time_bonus): Multiply param_ipa_cp_loop_hint_bonus
with the expected frequencies of loops with known iteration count or
stride.
* ipa-fnsummary.c (add_freqcounting_predicate): New function.
(ipa_fn_summary::~ipa_fn_summary): Release the new vectors instead of
just two predicates.
(remap_hint_predicate_after_duplication): Replace with function
remap_freqcounting_preds_after_dup.
(ipa_fn_summary_t::duplicate): Use it or duplicate new vectors.
(ipa_dump_fn_summary): Dump the new vectors.
(analyze_function_body): Compute the loop property vectors.
(ipa_call_context::estimate_size_and_time): Calculate also
loops_with_known_iterations and loops_with_known_strides.  Adjusted
dumping accordingly.
(remap_hint_predicate): Replace with function
remap_freqcounting_predicate.
(ipa_merge_fn_summary_after_inlining): Use it.
(inline_read_section): Stream loopcounting vectors instead of two
simple predicates.
(ipa_fn_summary_write): Likewise.
* params.opt (ipa-max-loop-predicates): New parameter.
* doc/invoke.texi (ipa-max-loop-predicates): Document new param.

gcc/testsuite/ChangeLog:

2020-09-03  Martin Jambor  

* gcc.dg/ipa/ipcp-loophint-1.c: New test.
---
 gcc/doc/invoke.texi|   4 +
 gcc/ipa-cp.c   |   9 +
 gcc/ipa-fnsummary.c| 318 ++---
 gcc/ipa-fnsummary.h|  38 ++-
 gcc/params.opt |   4 +
 gcc/testsuite/gcc.dg/ipa/ipcp-loophint-1.c |  29 ++
 6 files changed, 288 insertions(+), 114 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/ipcp-loophint-1.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 226b0e1dc91..829598228ac 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13433,6 +13433,10 @@ of iterations of a loop known, it adds a bonus of
 @option{ipa-cp-loop-hint-bonus} to the profitability score of
 the candidate.
 
+@item ipa-max-loop-predicates
+The maximum number of different predicates IPA will use to describe when
+loops in a function have known properties.
+
 @item ipa-max-aa-steps
 During its analysis of function bodies, IPA-CP employs alias analysis
 in order to track values pointed to by function parameters.  In order
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 77c84a6ed5d..f6320c787de 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -3205,6 +3205,15 @@ hint_time_bonus (cgraph_node *node, const 
ipa_call_estimates )
   ipa_hints hints = estimates.hints;
   if (hints & (INLINE_HINT_loop_iterations | INLINE_HINT_loop_stride))
 result += opt_for_fn (node->decl, param_ipa_cp_loop_hint_bonus);
+
+  sreal bonus_for_one = opt_for_fn (node->decl, param_ipa_cp_loop_hint_bonus);
+
+  if (hints & INLINE_HINT_loop_iterations)
+result += (estimates.loops_with_known_iterations * bonus_for_one).to_int 
();
+
+  if (hints & INLINE_HINT_loop_stride)
+result += (estimates.loops_with_known_strides * bonus_for_one).to_int ();
+
   return result;
 }
 
diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
index 6082f34d63f..94aa930 100644
--- a/gcc/ipa-fnsummary.c
+++ b/gcc/ipa-fnsummary.c
@@ -310,6 +310,36 @@ set_hint_predicate (predicate **p, predicate new_predicate)
 }
 }
 
+/* Find if NEW_PREDICATE is already in V and if so, increment its freq.
+   Otherwise add a new item to the vector with this predicate and freq equal
+   to add_freq, unless the number of predicates would exceed MAX_NUM_PREDICATES
+   in which case the function does nothing.  */
+
+static void
+add_freqcounting_predicate (vec **v,
+   const predicate _predicate, sreal add_freq,
+   unsigned max_num_predicates)
+{
+  if (new_predicate == false || new_predicate == true)
+return;
+  ipa_freqcounting_predicate *f;
+  for (int i = 0; vec_safe_iterate (*v, i, ); i++)
+  

[PATCH 2/6] ipa: Introduce ipa_cached_call_context

2020-09-29 Thread Martin Jambor
Hi,

as we discussed with Honza on the mailing list last week, making
cached call context structure distinct from the normal one may make it
clearer that the cached data need to be explicitely deallocated.

This patch does that division.  It is not mandatory for the overall
main goals of the patch set and can be dropped if deemed superfluous.

gcc/ChangeLog:

2020-09-02  Martin Jambor  

* ipa-fnsummary.h (ipa_cached_call_context): New forward declaration
and class.
(class ipa_call_context): Make friend ipa_cached_call_context.  Moved
methods duplicate_from and release to it too.
* ipa-fnsummary.c (ipa_call_context::duplicate_from): Moved to class
ipa_cached_call_context.
(ipa_call_context::release): Likewise, removed the parameter.
* ipa-inline-analysis.c (node_context_cache_entry): Change the type of
ctx to ipa_cached_call_context.
(do_estimate_edge_time): Remove parameter from the call to
ipa_cached_call_context::release.
---
 gcc/ipa-fnsummary.c   | 21 -
 gcc/ipa-fnsummary.h   | 16 ++--
 gcc/ipa-inline-analysis.c |  4 ++--
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
index e8645aa0a1b..4ef7d2570e9 100644
--- a/gcc/ipa-fnsummary.c
+++ b/gcc/ipa-fnsummary.c
@@ -3329,7 +3329,7 @@ ipa_call_context::ipa_call_context (cgraph_node *node, 
clause_t possible_truths,
 /* Set THIS to be a duplicate of CTX.  Copy all relevant info.  */
 
 void
-ipa_call_context::duplicate_from (const ipa_call_context )
+ipa_cached_call_context::duplicate_from (const ipa_call_context )
 {
   m_node = ctx.m_node;
   m_possible_truths = ctx.m_possible_truths;
@@ -3399,24 +3399,19 @@ ipa_call_context::duplicate_from (const 
ipa_call_context )
   m_avals.m_known_value_ranges = vNULL;
 }
 
-/* Release memory used by known_vals/contexts/aggs vectors.
-   If ALL is true release also inline_param_summary.
-   This happens when context was previously duplicated to be stored
-   into cache.  */
+/* Release memory used by known_vals/contexts/aggs vectors.  and
+   inline_param_summary.  */
 
 void
-ipa_call_context::release (bool all)
+ipa_cached_call_context::release ()
 {
   /* See if context is initialized at first place.  */
   if (!m_node)
 return;
-  ipa_release_agg_values (m_avals.m_known_aggs, all);
-  if (all)
-{
-  m_avals.m_known_vals.release ();
-  m_avals.m_known_contexts.release ();
-  m_inline_param_summary.release ();
-}
+  ipa_release_agg_values (m_avals.m_known_aggs, true);
+  m_avals.m_known_vals.release ();
+  m_avals.m_known_contexts.release ();
+  m_inline_param_summary.release ();
 }
 
 /* Return true if CTX describes the same call context as THIS.  */
diff --git a/gcc/ipa-fnsummary.h b/gcc/ipa-fnsummary.h
index 6893858d18e..020a6f0425d 100644
--- a/gcc/ipa-fnsummary.h
+++ b/gcc/ipa-fnsummary.h
@@ -287,6 +287,8 @@ public:
  ipa_call_summary *dst_data);
 };
 
+class ipa_cached_call_context;
+
 /* This object describe a context of call.  That is a summary of known
information about its parameters.  Main purpose of this context is
to give more realistic estimations of function runtime, size and
@@ -307,8 +309,6 @@ public:
   sreal *ret_time,
   sreal *ret_nonspecialized_time,
   ipa_hints *ret_hints);
-  void duplicate_from (const ipa_call_context );
-  void release (bool all = false);
   bool equal_to (const ipa_call_context &);
   bool exists_p ()
   {
@@ -329,6 +329,18 @@ private:
   /* Even after having calculated clauses, the information about argument
  values is used to resolve indirect calls.  */
   ipa_call_arg_values m_avals;
+
+  friend ipa_cached_call_context;
+};
+
+/* Variant of ipa_call_context that is stored in a cache over a longer period
+   of time.  */
+
+class ipa_cached_call_context : public ipa_call_context
+{
+public:
+  void duplicate_from (const ipa_call_context );
+  void release ();
 };
 
 extern fast_call_summary  *ipa_call_summaries;
diff --git a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c
index d2ae8196d09..b7af77f7b9b 100644
--- a/gcc/ipa-inline-analysis.c
+++ b/gcc/ipa-inline-analysis.c
@@ -57,7 +57,7 @@ fast_call_summary 
*edge_growth_cache = NULL;
 class node_context_cache_entry
 {
 public:
-  ipa_call_context ctx;
+  ipa_cached_call_context ctx;
   sreal time, nonspec_time;
   int size;
   ipa_hints hints;
@@ -226,7 +226,7 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal 
*ret_nonspec_time)
node_context_cache_miss++;
  else
node_context_cache_clear++;
- e->entry.ctx.release (true);
+ e->entry.ctx.release ();
  ctx.estimate_size_and_time (, _size,
  , _time, );
  e->entry.size = size;
-- 
2.28.0



[PATCH 3/6] ipa: Bundle estimates of ipa_call_context::estimate_size_and_time

2020-09-29 Thread Martin Jambor
A subsequent patch adds another two estimates that the code in
ipa_call_context::estimate_size_and_time computes, and the fact that
the function has a special output parameter for each thing it computes
would make it have just too many.  Therefore, this patch collapses all
those output parameters into one output structure.

gcc/ChangeLog:

2020-09-02  Martin Jambor  

* ipa-inline-analysis.c (do_estimate_edge_time): Adjusted to use
ipa_call_estimates.
(do_estimate_edge_size): Likewise.
(do_estimate_edge_hints): Likewise.
* ipa-fnsummary.h (struct ipa_call_estimates): New type.
(ipa_call_context::estimate_size_and_time): Adjusted declaration.
(estimate_ipcp_clone_size_and_time): Likewise.
* ipa-cp.c (hint_time_bonus): Changed the type of the second argument
to ipa_call_estimates.
(perform_estimation_of_a_value): Adjusted to use ipa_call_estimates.
(estimate_local_effects): Likewise.
* ipa-fnsummary.c (ipa_call_context::estimate_size_and_time): Adjusted
to return estimates in a single ipa_call_estimates parameter.
(estimate_ipcp_clone_size_and_time): Likewise.
---
 gcc/ipa-cp.c  | 45 ++---
 gcc/ipa-fnsummary.c   | 60 +++
 gcc/ipa-fnsummary.h   | 36 +--
 gcc/ipa-inline-analysis.c | 47 +-
 4 files changed, 105 insertions(+), 83 deletions(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 292dd7e5bdf..77c84a6ed5d 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -3196,12 +3196,13 @@ devirtualization_time_bonus (struct cgraph_node *node,
   return res;
 }
 
-/* Return time bonus incurred because of HINTS.  */
+/* Return time bonus incurred because of hints stored in ESTIMATES.  */
 
 static int
-hint_time_bonus (cgraph_node *node, ipa_hints hints)
+hint_time_bonus (cgraph_node *node, const ipa_call_estimates )
 {
   int result = 0;
+  ipa_hints hints = estimates.hints;
   if (hints & (INLINE_HINT_loop_iterations | INLINE_HINT_loop_stride))
 result += opt_for_fn (node->decl, param_ipa_cp_loop_hint_bonus);
   return result;
@@ -3397,15 +3398,13 @@ perform_estimation_of_a_value (cgraph_node *node,
   int removable_params_cost, int est_move_cost,
   ipcp_value_base *val)
 {
-  int size, time_benefit;
-  sreal time, base_time;
-  ipa_hints hints;
+  int time_benefit;
+  ipa_call_estimates estimates;
 
-  estimate_ipcp_clone_size_and_time (node, avals, , ,
-_time, );
-  base_time -= time;
-  if (base_time > 65535)
-base_time = 65535;
+  estimate_ipcp_clone_size_and_time (node, avals, );
+  sreal time_delta = estimates.nonspecialized_time - estimates.time;
+  if (time_delta > 65535)
+time_delta = 65535;
 
   /* Extern inline functions have no cloning local time benefits because they
  will be inlined anyway.  The only reason to clone them is if it enables
@@ -3413,11 +3412,12 @@ perform_estimation_of_a_value (cgraph_node *node,
   if (DECL_EXTERNAL (node->decl) && DECL_DECLARED_INLINE_P (node->decl))
 time_benefit = 0;
   else
-time_benefit = base_time.to_int ()
+time_benefit = time_delta.to_int ()
   + devirtualization_time_bonus (node, avals)
-  + hint_time_bonus (node, hints)
+  + hint_time_bonus (node, estimates)
   + removable_params_cost + est_move_cost;
 
+  int size = estimates.size;
   gcc_checking_assert (size >=0);
   /* The inliner-heuristics based estimates may think that in certain
  contexts some functions do not have any size at all but we want
@@ -3472,23 +3472,21 @@ estimate_local_effects (struct cgraph_node *node)
   || (removable_params_cost && node->can_change_signature))
 {
   struct caller_statistics stats;
-  ipa_hints hints;
-  sreal time, base_time;
-  int size;
+  ipa_call_estimates estimates;
 
   init_caller_stats ();
   node->call_for_symbol_thunks_and_aliases (gather_caller_stats, ,
  false);
-  estimate_ipcp_clone_size_and_time (node, , , ,
-_time, );
-  time -= devirt_bonus;
-  time -= hint_time_bonus (node, hints);
-  time -= removable_params_cost;
-  size -= stats.n_calls * removable_params_cost;
+  estimate_ipcp_clone_size_and_time (node, , );
+  sreal time = estimates.nonspecialized_time - estimates.time;
+  time += devirt_bonus;
+  time += hint_time_bonus (node, estimates);
+  time += removable_params_cost;
+  int size = estimates.size - stats.n_calls * removable_params_cost;
 
   if (dump_file)
fprintf (dump_file, " - context independent values, size: %i, "
-"time_benefit: %f\n", size, (base_time - time).to_double ());
+"time_benefit: %f\n", size, (time).to_double ());
 
   if (size <= 0 || 

[PATCH 0/6] IPA cleanups and IPA-CP improvements for 548.exchange2_r

2020-09-29 Thread Martin Jambor
Hi,

this patch set is a result of rebasing the one I sent here three weeks
ago on current trunk.  Last week I also checked the WPA memory
requirements when building Firefox and it did not change from the
unpatched numbers.

Bootstrapped and tested and LTO bootstrapped on x86-64.  OK for trunk?

Thanks,


Martin




Martin Jambor (6):
  ipa: Bundle vectors describing argument values
  ipa: Introduce ipa_cached_call_context
  ipa: Bundle estimates of ipa_call_context::estimate_size_and_time
  ipa: Multiple predicates for loop properties, with frequencies
  ipa-cp: Add dumping of overall_size after cloning
  ipa-cp: Separate and increase the large-unit parameter

 gcc/doc/invoke.texi|   4 +
 gcc/ipa-cp.c   | 303 
 gcc/ipa-fnsummary.c| 829 +++--
 gcc/ipa-fnsummary.h| 113 ++-
 gcc/ipa-inline-analysis.c  |  92 +--
 gcc/ipa-prop.c |  10 +
 gcc/ipa-prop.h | 112 ++-
 gcc/params.opt |   8 +
 gcc/testsuite/gcc.dg/ipa/ipcp-loophint-1.c |  29 +
 9 files changed, 867 insertions(+), 633 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/ipcp-loophint-1.c

-- 
2.28.0


[PATCH 6/6] ipa-cp: Separate and increase the large-unit parameter

2020-09-29 Thread Martin Jambor
A previous patch in the series has taught IPA-CP to identify the
important cloning opportunities in 548.exchange2_r as worthwhile on
their own, but the optimization is still prevented from taking place
because of the overall unit-growh limit.  This patches raises that
limit so that it takes place and the benchmark runs 30% faster (on AMD
Zen2 CPU at least).

Before this patch, IPA-CP uses the following formulae to arrive at the
overall_size limit:

base = MAX(orig_size, param_large_unit_insns)
unit_growth_limit = base + base * param_ipa_cp_unit_growth / 100

since param_ipa_cp_unit_growth has default 10, param_large_unit_insns
has default value 1.

The problem with exchange2 (at least on zen2 but I have had a quick
look on aarch64 too) is that the original estimated unit size is 10513
and so param_large_unit_insns does not apply and the default limit is
therefore 11564 which is good enough only for one of the ideal 8
clonings, we need the limit to be at least 16291.

I would like to raise param_ipa_cp_unit_growth a little bit more soon
too, but most certainly not to 55.  Therefore, the large_unit must be
increased.  In this patch, I decided to decouple the inlining and
ipa-cp large-unit parameters.  It also makes sense because IPA-CP uses
it only at -O3 while inlining also at -O2 (IIUC).  But if we agree we
can try raising param_large_unit_insns to 13-14 thousand
"instructions," perhaps it is not necessary.  But then again, it may
make sense to actually increase the IPA-CP limit further.

I plan to experiment with IPA-CP tuning on a larger set of programs.
Meanwhile, mainly to address the 548.exchange2_r regression, I'm
suggesting this simple change.

gcc/ChangeLog:

2020-09-07  Martin Jambor  

* params.opt (ipa-cp-large-unit-insns): New parameter.
* ipa-cp.c (get_max_overall_size): Use the new parameter.
---
 gcc/ipa-cp.c   | 2 +-
 gcc/params.opt | 4 
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 12acf24c553..2152f9e5876 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -3448,7 +3448,7 @@ static long
 get_max_overall_size (cgraph_node *node)
 {
   long max_new_size = orig_overall_size;
-  long large_unit = opt_for_fn (node->decl, param_large_unit_insns);
+  long large_unit = opt_for_fn (node->decl, param_ipa_cp_large_unit_insns);
   if (max_new_size < large_unit)
 max_new_size = large_unit;
   int unit_growth = opt_for_fn (node->decl, param_ipa_cp_unit_growth);
diff --git a/gcc/params.opt b/gcc/params.opt
index acb59f17e45..9d177ab50ad 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -218,6 +218,10 @@ Percentage penalty functions containing a single call to 
another function will r
 Common Joined UInteger Var(param_ipa_cp_unit_growth) Init(10) Param 
Optimization
 How much can given compilation unit grow because of the interprocedural 
constant propagation (in percent).
 
+-param=ipa-cp-large-unit-insns=
+Common Joined UInteger Var(param_ipa_cp_large_unit_insns) Optimization 
Init(16000) Param
+The size of translation unit that IPA-CP pass considers large.
+
 -param=ipa-cp-value-list-size=
 Common Joined UInteger Var(param_ipa_cp_value_list_size) Init(8) Param 
Optimization
 Maximum size of a list of values associated with each parameter for 
interprocedural constant propagation.
-- 
2.28.0


Re: [PATCH] Add Missing FSF copyright notes for some x86 intrinsic headers

2020-09-29 Thread H.J. Lu via Gcc-patches
On Mon, Sep 28, 2020 at 9:06 AM H.J. Lu  wrote:
>
> On Mon, Sep 28, 2020 at 9:04 AM Hongyu Wang via Gcc-patches
>  wrote:
> >
> > Hi,
> >
> > Some x86 intrinsic headers is missing FSF copyright notes. This patch add
> > the missed notes for those headers.
> >
> > OK for master?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/amxbf16intrin.h: Add FSF copyright notes.
> > * config/i386/amxint8intrin.h: Ditto.
> > * config/i386/amxtileintrin.h: Ditto.
> > * config/i386/avx512vp2intersectintrin.h: Ditto.
> > * config/i386/avx512vp2intersectvlintrin.h: Ditto.
> > * config/i386/pconfigintrin.h: Ditto.
> > * config/i386/tsxldtrkintrin.h: Ditto.
> > * config/i386/wbnoinvdintrin.h: Ditto.
> >
>
> I will check it for Hongyu tomorrow if there are no objections.
>

I checked it into master branch and will backport it to release branches.


-- 
H.J.


[SLP][VECT] Add check to fix 96837

2020-09-29 Thread Joel Hutton via Gcc-patches
 Hi All,

The following patch adds a simple check to prevent slp stmts from vector 
constructors being rearranged. vect_attempt_slp_rearrange_stmts tries to 
rearrange to avoid a load permutation.

This fixes PR target/96837 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96827
gcc/ChangeLog:

2020-09-29  Joel Hutton  

PR target/96837
* tree-vect-slp.c (vect_analyze_slp): Do not call 
vect_attempt_slp_rearrange_stmts for vector constructors.

gcc/testsuite/ChangeLog:

2020-09-29  Joel Hutton  

PR target/96837
* gcc.dg/vect/bb-slp-49.c: New test.From 2c738e2c0eddbc4fcdbf8ff2443bb809b36c7e28 Mon Sep 17 00:00:00 2001
From: Joel Hutton 
Date: Tue, 29 Sep 2020 15:46:44 +0100
Subject: [PATCH] [SLP][VECT] Add check to fix 96827

Do not call vect_attempt_slp_rearrange_stmts if an slp instance is an
SLP_INSTANCE_ROOT_STMT, i.e. if the tree is built from a constructor
rather than a grouped store. This function is intended to rearrange
stmts in a reduction chain so they do not require load permutation.
Rearranging causes the resulting constructor to be in the wrong order.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-49.c | 28 +++
 gcc/tree-vect-slp.c   |  3 ++-
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-49.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-49.c b/gcc/testsuite/gcc.dg/vect/bb-slp-49.c
new file mode 100644
index ..e7101fcff4627bb545549bdfefd33c2ed58aee7b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-49.c
@@ -0,0 +1,28 @@
+/* This checks that vectorized constructors have the correct ordering. */
+/* { dg-require-effective-target vect_int } */
+
+typedef int V __attribute__((__vector_size__(16)));
+
+__attribute__((__noipa__)) void
+foo (unsigned int x, V *y)
+{
+  unsigned int a[4] = { x + 0, x + 2, x + 4, x + 6 };
+  for (unsigned int i = 0; i < 3; ++i)
+if (a[i] == 1234)
+  a[i]--;
+  *y = (V) { a[3], a[2], a[1], a[0] };
+}
+
+int
+main ()
+{
+  V b;
+  foo (0, );
+  if (b[0] != 6 || b[1] != 4 || b[2] != 2 || b[3] != 0)
+__builtin_abort ();
+  return 0;
+}
+
+/* See that we vectorize an SLP instance.  */
+/* { dg-final { scan-tree-dump "Analyzing vectorizable constructor" "slp1" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "slp1" } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index c44fd396bf0b69a4153e46026c545bebb3797551..7ba24e241deb76c0fd884ccfff04675d1b050ef7 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2495,7 +2495,8 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
   /* Reduction (there are no data-refs in the root).
 	 In reduction chain the order of the loads is not important.  */
   if (!STMT_VINFO_DATA_REF (stmt_info)
-	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+	  && !SLP_INSTANCE_ROOT_STMT (instance))
 	vect_attempt_slp_rearrange_stmts (instance);
 }
 
-- 
2.17.1



c++: Name lookup simplifications

2020-09-29 Thread Nathan Sidwell


Here are a few cleanups, prior to landing the hidden decl changes.

1) Clear cxx_binding flags in the allocator, not at each user of the 
allocator.


2) Refactor update_binding.  The logic was getting too convoluted.

3) Set friendliness and	anticipatedness	before pushing a template decl 
(not after).


gcc/cp/
* name-lookup.c (create_local_binding): Do not clear
INHERITED_VALUE_BINDING_P here.
(name_lookup::process_binding): Move done hidden-decl triage to ...
(name_lookup::search_namespace_only): ... here, its only caller.
(cxx_binding_make): Clear flags here.
(push_binding): Not here.
(pop_local_binding): RAII.
(update_binding): Refactor.
(do_pushdecl): Assert we're never revealing a local binding.
(do_pushdecl_with_scope): Directly call do_pushdecl.
(get_class_binding): Do not clear LOCAL_BINDING_P here.
* pt.c (push_template_decl): Set friend & anticipated before
pushing.

pushing to trunk

nathan

--
Nathan Sidwell
diff --git i/gcc/cp/name-lookup.c w/gcc/cp/name-lookup.c
index f195e81280a..89f1a4c5d64 100644
--- i/gcc/cp/name-lookup.c
+++ w/gcc/cp/name-lookup.c
@@ -77,7 +77,6 @@ create_local_binding (cp_binding_level *level, tree name)
 {
   cxx_binding *binding = cxx_binding_make (NULL, NULL);
 
-  INHERITED_VALUE_BINDING_P (binding) = false;
   LOCAL_BINDING_P (binding) = true;
   binding->scope = level;
   binding->previous = IDENTIFIER_BINDING (name);
@@ -480,22 +479,17 @@ name_lookup::add_type (tree new_type)
 }
 
 /* Process a found binding containing NEW_VAL and NEW_TYPE.  Returns
-   true if we actually found something noteworthy.  */
+   true if we actually found something noteworthy.  Hiddenness has
+   already been handled in the caller.  */
 
 bool
 name_lookup::process_binding (tree new_val, tree new_type)
 {
   /* Did we really see a type? */
   if (new_type
-  && ((want & LOOK_want::TYPE_NAMESPACE) == LOOK_want::NAMESPACE
-	  || (!bool (want & LOOK_want::HIDDEN_FRIEND)
-	  && DECL_LANG_SPECIFIC (new_type)
-	  && DECL_ANTICIPATED (new_type
+  && (want & LOOK_want::TYPE_NAMESPACE) == LOOK_want::NAMESPACE)
 new_type = NULL_TREE;
 
-  if (new_val && !bool (want & LOOK_want::HIDDEN_FRIEND))
-new_val = ovl_skip_hidden (new_val);
-
   /* Do we really see a value? */
   if (new_val)
 switch (TREE_CODE (new_val))
@@ -544,8 +538,25 @@ name_lookup::search_namespace_only (tree scope)
   bool found = false;
 
   if (tree *binding = find_namespace_slot (scope, name))
-found |= process_binding (MAYBE_STAT_DECL (*binding),
-			  MAYBE_STAT_TYPE (*binding));
+{
+  tree value = *binding, type = NULL_TREE;
+
+  if (STAT_HACK_P (value))
+	{
+	  type = STAT_TYPE (value);
+	  value = STAT_DECL (value);
+  
+	  if (!bool (want & LOOK_want::HIDDEN_FRIEND)
+	  && DECL_LANG_SPECIFIC (type)
+	  && DECL_ANTICIPATED (type))
+	type = NULL_TREE;
+	}
+
+  if (!bool (want & LOOK_want::HIDDEN_FRIEND))
+	value = ovl_skip_hidden (value);
+
+  found |= process_binding (value, type);
+}
 
   return found;
 }
@@ -1954,15 +1965,17 @@ cxx_binding_init (cxx_binding *binding, tree value, tree type)
 static cxx_binding *
 cxx_binding_make (tree value, tree type)
 {
-  cxx_binding *binding;
-  if (free_bindings)
-{
-  binding = free_bindings;
-  free_bindings = binding->previous;
-}
+  cxx_binding *binding = free_bindings;
+
+  if (binding)
+free_bindings = binding->previous;
   else
 binding = ggc_alloc ();
 
+  /* Clear flags by default.  */
+  LOCAL_BINDING_P (binding) = false;
+  INHERITED_VALUE_BINDING_P (binding) = false;
+
   cxx_binding_init (binding, value, type);
 
   return binding;
@@ -2009,7 +2022,6 @@ push_binding (tree id, tree decl, cp_binding_level* level)
 
   /* Now, fill in the binding information.  */
   binding->previous = IDENTIFIER_BINDING (id);
-  INHERITED_VALUE_BINDING_P (binding) = 0;
   LOCAL_BINDING_P (binding) = (level != class_binding_level);
 
   /* And put it on the front of the list of bindings for ID.  */
@@ -2022,8 +2034,6 @@ push_binding (tree id, tree decl, cp_binding_level* level)
 void
 pop_local_binding (tree id, tree decl)
 {
-  cxx_binding *binding;
-
   if (id == NULL_TREE)
 /* It's easiest to write the loops that call this function without
checking whether or not the entities involved have names.  We
@@ -2031,7 +2041,7 @@ pop_local_binding (tree id, tree decl)
 return;
 
   /* Get the innermost binding for ID.  */
-  binding = IDENTIFIER_BINDING (id);
+  cxx_binding *binding = IDENTIFIER_BINDING (id);
 
   /* The name should be bound.  */
   gcc_assert (binding != NULL);
@@ -2356,9 +2366,16 @@ static tree
 update_binding (cp_binding_level *level, cxx_binding *binding, tree *slot,
 		tree old, tree decl, bool hiding = false)
 {
+  tree old_type = NULL_TREE;
+
+  if (!slot)
+old_type = binding->type;
+  else if (STAT_HACK_P (*slot))
+  old_type = 

Re: Add trailing dots to fortran io fnspecs to match signature

2020-09-29 Thread Jan Hubicka
> On September 29, 2020 4:20:42 PM GMT+02:00, Jan Hubicka  
> wrote:
> >Hi,
> >this patch is not needed but makes it possible to sanity check that
> >fnspec match function signature. It turns out that there are quite few
> >mistakes in that in trans-decl and one mistake here.
> >Transfer_derived has additional parameters.
> 
> Hmm, omitting trailing dots was on purpose to make the string short (also 
> consider varargs...).  You can still sanity check the prefix, no? 

Yes, I check the prefix and check that only permitted letters appear at
given positions.  However it seems there is enough fuzz to justify one
extra byte or two in the string (it is not very long anyway).

I only check it in gfc_build infrastructure and allow early ending
strings otherwise.

I do not have very strong opinions here, but it seems it is easy to shift
the string by one or miss a middle argument (especially for calls with
13 parameters) that is caught by this check.

I was also considering teaching Fortran to check that R/W is used only
for pointer type parameters (but did not implement it)
Honza
> 
> >Bootstrapped/regtested x86_64-linux. OK?
> >Honza
> >
> > * trans-io.c (gfc_build_io_library_fndecls): Add trailing "." for
> > fnspecs so they match the number of parameters.
> >diff --git a/gcc/fortran/trans-io.c b/gcc/fortran/trans-io.c
> >index 21bdd5ef0d8..363cca51ef9 100644
> >--- a/gcc/fortran/trans-io.c
> >+++ b/gcc/fortran/trans-io.c
> >@@ -328,86 +328,86 @@ gfc_build_io_library_fndecls (void)
> >dt_parm_type = build_pointer_type (st_parameter[IOPARM_ptype_dt].type);
> > 
> > iocall[IOCALL_X_INTEGER] = gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_integer")), ".wW",
> >+get_identifier (PREFIX("transfer_integer")), ".wW.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> > 
> >iocall[IOCALL_X_INTEGER_WRITE] =
> >gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_integer_write")), ".wR",
> >+get_identifier (PREFIX("transfer_integer_write")), ".wR.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> > 
> > iocall[IOCALL_X_LOGICAL] = gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_logical")), ".wW",
> >+get_identifier (PREFIX("transfer_logical")), ".wW.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> > 
> >iocall[IOCALL_X_LOGICAL_WRITE] =
> >gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_logical_write")), ".wR",
> >+get_identifier (PREFIX("transfer_logical_write")), ".wR.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> > 
> >iocall[IOCALL_X_CHARACTER] = gfc_build_library_function_decl_with_spec
> >(
> >-get_identifier (PREFIX("transfer_character")), ".wW",
> >+get_identifier (PREFIX("transfer_character")), ".wW.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node,
> >gfc_charlen_type_node);
> > 
> >iocall[IOCALL_X_CHARACTER_WRITE] =
> >gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_character_write")), ".wR",
> >+get_identifier (PREFIX("transfer_character_write")), ".wR.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node,
> >gfc_charlen_type_node);
> > 
> >iocall[IOCALL_X_CHARACTER_WIDE] =
> >gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_character_wide")), ".wW",
> >+get_identifier (PREFIX("transfer_character_wide")), ".wW..",
> > void_type_node, 4, dt_parm_type, pvoid_type_node,
> > gfc_charlen_type_node, gfc_int4_type_node);
> > 
> >   iocall[IOCALL_X_CHARACTER_WIDE_WRITE] =
> > gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_character_wide_write")), ".wR",
> >+get_identifier (PREFIX("transfer_character_wide_write")), ".wR..",
> > void_type_node, 4, dt_parm_type, pvoid_type_node,
> > gfc_charlen_type_node, gfc_int4_type_node);
> > 
> >   iocall[IOCALL_X_REAL] = gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_real")), ".wW",
> >+get_identifier (PREFIX("transfer_real")), ".wW.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> > 
> >iocall[IOCALL_X_REAL_WRITE] = gfc_build_library_function_decl_with_spec
> >(
> >-get_identifier (PREFIX("transfer_real_write")), ".wR",
> >+get_identifier (PREFIX("transfer_real_write")), ".wR.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> > 
> > iocall[IOCALL_X_COMPLEX] = gfc_build_library_function_decl_with_spec (
> >-get_identifier (PREFIX("transfer_complex")), ".wW",
> >+get_identifier (PREFIX("transfer_complex")), ".wW.",
> > void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> > 
> >iocall[IOCALL_X_COMPLEX_WRITE] =
> >gfc_build_library_function_decl_with_spec (
> >-get_identifier 

Re: [PATCH] libstdc++: Diagnose visitors with different return types [PR95904]

2020-09-29 Thread Ville Voutilainen via Gcc-patches
On Tue, 29 Sep 2020 at 14:20, Jonathan Wakely  wrote:
> I think this is what we want:
>
>template
>  constexpr inline __same_types = (is_same_v<_Tp, _Types> && ...);
>
> is_same_v is very cheap, it uses the built-in directly, so you don't
> need to instantiate any class templates at all.
>
> >+
> >+  template 
>
> typename not class please.
>
> >+decltype(auto) __check_visitor_result(_Visitor&& __vis,
>
> New line after the decltype(auto) please, not in the middle of the
> parameter list.

Aye.
diff --git a/libstdc++-v3/include/std/variant b/libstdc++-v3/include/std/variant
index dd8847cf829..6f647d622c4 100644
--- a/libstdc++-v3/include/std/variant
+++ b/libstdc++-v3/include/std/variant
@@ -182,7 +182,7 @@ namespace __variant
   // used for raw visitation with indices passed in
   struct __variant_idx_cookie { using type = __variant_idx_cookie; };
   // Used to enable deduction (and same-type checking) for std::visit:
-  template struct __deduce_visit_result { };
+  template struct __deduce_visit_result { using type = _Tp; };
 
   // Visit variants that might be valueless.
   template
@@ -1017,7 +1017,22 @@ namespace __variant
 
   static constexpr auto
   _S_apply()
-  { return _Array_type{&__visit_invoke}; }
+  {
+	constexpr bool __visit_ret_type_mismatch =
+	  _Array_type::__result_is_deduced::value
+	  && !is_same_v(),
+		std::declval<_Variants>()...))>;
+	if constexpr (__visit_ret_type_mismatch)
+	  {
+	static_assert(!__visit_ret_type_mismatch,
+			  "std::visit requires the visitor to have the same "
+			  "return type for all alternatives of a variant");
+	return __nonesuch{};
+	  }
+	else
+	  return _Array_type{&__visit_invoke};
+  }
 };
 
   template
@@ -1692,6 +1707,26 @@ namespace __variant
 			   std::forward<_Variants>(__variants)...);
 }
 
+  template
+ constexpr inline bool __same_types = (is_same_v<_Tp, _Types> && ...);
+
+  template 
+decltype(auto)
+__check_visitor_result(_Visitor&& __vis, _Variant&& __variant)
+{
+  return std::forward<_Visitor>(__vis)(
+std::get<_Idx>(std::forward<_Variant>(__variant)));
+}
+
+  template 
+constexpr bool __check_visitor_results(std::index_sequence<_Idxs...>)
+{
+  return __same_types(
+	std::declval<_Visitor>(),
+	std::declval<_Variant>()))...>;
+}
+
+
   template
 constexpr decltype(auto)
 visit(_Visitor&& __visitor, _Variants&&... __variants)
@@ -1704,8 +1739,28 @@ namespace __variant
 
   using _Tag = __detail::__variant::__deduce_visit_result<_Result_type>;
 
-  return std::__do_visit<_Tag>(std::forward<_Visitor>(__visitor),
-   std::forward<_Variants>(__variants)...);
+  if constexpr (sizeof...(_Variants) == 1)
+{
+	  constexpr bool __visit_rettypes_match =
+	__check_visitor_results<_Visitor, _Variants...>(
+	  std::make_index_sequence<
+	std::variant_size...>::value>());
+	  if constexpr (!__visit_rettypes_match)
+	{
+	  static_assert(__visit_rettypes_match,
+			  "std::visit requires the visitor to have the same "
+			  "return type for all alternatives of a variant");
+	  return;
+	}
+	  else
+	return std::__do_visit<_Tag>(
+	  std::forward<_Visitor>(__visitor),
+	  std::forward<_Variants>(__variants)...);
+	}
+  else
+	return std::__do_visit<_Tag>(
+  std::forward<_Visitor>(__visitor),
+	  std::forward<_Variants>(__variants)...);
 }
 
 #if __cplusplus > 201703L


RE: [PATCH] arm: Fix ICEs in no-literal-pool.c on MVE

2020-09-29 Thread Kyrylo Tkachov via Gcc-patches
Hi Alex,

> -Original Message-
> From: Alex Coplan 
> Sent: 29 September 2020 14:48
> To: gcc-patches@gcc.gnu.org
> Cc: ni...@redhat.com; Richard Earnshaw ;
> Ramana Radhakrishnan ; Kyrylo
> Tkachov 
> Subject: [PATCH] arm: Fix ICEs in no-literal-pool.c on MVE
> 
> Hello,
> 
> This patch fixes ICEs when compiling
> gcc/testsuite/gcc.target/arm/pure-code/no-literal-pool.c with
> -mfp16-format=ieee -mfloat-abi=hard -march=armv8.1-m.main+mve
> -mpure-code.
> 
> The existing conditions in the movsf/movdf expanders (as well as the
> no_literal_pool patterns) were too restrictive, requiring
> TARGET_HARD_FLOAT instead of TARGET_VFP_BASE, which caused
> unrecognised
> insns when compiling this testcase with integer MVE and -mpure-code.
> 
> Testing:
>  * Bootstrapped and regtested on arm-none-linux-gnueabihf.
>  * Regtested an MVE cross build.
> 
> Comparison of test results before/after patch on MVE build:
> 
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O0
> scan-assembler-not
> \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O0
> scan-assembler text,"0x2006"
> FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O0  (test for
> excess errors)
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O1
> scan-assembler-not
> \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O1
> scan-assembler text,"0x2006"
> FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O1  (test for
> excess errors)
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto -
> fno-use-linker-plugin -flto-partition=none -ffat-lto-objects  scan-assembler-
> not \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto -
> fno-use-linker-plugin -flto-partition=none -ffat-lto-objects  scan-assembler
> text,"0x2006"
> FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto -fno-use-
> linker-plugin -flto-partition=none -ffat-lto-objects (test for excess errors)
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto -
> fuse-linker-plugin -fno-fat-lto-objects -ffat-lto-objects  scan-assembler-not
> \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto -
> fuse-linker-plugin -fno-fat-lto-objects -ffat-lto-objects  scan-assembler
> text,"0x2006"
> FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto -fuse-
> linker-plugin -fno-fat-lto-objects -ffat-lto-objects (test for excess errors)
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O2
> scan-assembler-not
> \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O2
> scan-assembler text,"0x2006"
> FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O2  (test for
> excess errors)
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O3 -
> fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions
> scan-assembler-not
> \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O3 -
> fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions
> scan-assembler text,"0x2006"
> FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O3 -fomit-frame-
> pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  (test for 
> excess
> errors)
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -Os
> scan-assembler-not
> \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
> UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -Os
> scan-assembler text,"0x2006"
> FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -Os  (test for
> excess errors)
> UNRESOLVED->PASS: gcc.target/arm/thumb2-slow-flash-data-1.c scan-
> assembler-not
> \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
> FAIL->PASS: gcc.target/arm/thumb2-slow-flash-data-1.c (test for excess
> errors)
> 
> OK for trunk?

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Alex
> 
> ---
> 
> gcc/ChangeLog:
> 
>   * config/arm/arm.md (movsf): Relax TARGET_HARD_FLOAT to
>   TARGET_VFP_BASE.
>   (movdf): Likewise.
>   * config/arm/vfp.md (no_literal_pool_df_immediate): Likewise.
>   (no_literal_pool_sf_immediate): Likewise.


RE: [PATCH][GCC][ARM] Add support for Cortex-X1

2020-09-29 Thread Kyrylo Tkachov via Gcc-patches



From: Przemyslaw Wirkus  
Sent: 29 September 2020 15:43
To: gcc-patches@gcc.gnu.org
Cc: ni...@redhat.com; Ramana Radhakrishnan ; 
Richard Earnshaw ; Kyrylo Tkachov 

Subject: [PATCH][GCC][ARM] Add support for Cortex-X1

Hi,

This change adds support for the Arm Cortex-X1 CPU. For more information about
this processor, see [0].

[0] : https://www.arm.com/products/cortex-x

OK for master branch ?


Ok, but please make sure this is properly rebased on top of Alex's patches that 
have recently gone in in this area.
Thanks,
Kyrill


kind regards,
Przemyslaw Wirkus

gcc/ChangeLog:

  * config/arm/arm-cpus.in: Add Cortex-X1 core.
  * config/arm/arm-tables.opt: Regenerate.
  * config/arm/arm-tune.md: Regenerate.
  * doc/invoke.texi: Update docs.


RE: [PATCH][GCC][AArch64] Add support for Cortex-X1

2020-09-29 Thread Kyrylo Tkachov via Gcc-patches



> -Original Message-
> From: Przemyslaw Wirkus 
> Sent: 29 September 2020 15:39
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw ; Richard Sandiford
> ; Kyrylo Tkachov
> ; Marcus Shawcroft
> 
> Subject: [PATCH][GCC][AArch64] Add support for Cortex-X1
> 
> Hi,
> 
> This change adds support for the Arm Cortex-X1 CPU in AArch64 GCC. For
> more
> information about this processor, see [0].
> 
> [0] : https://www.arm.com/products/cortex-x
> 
> OK for master branch ?

Ok. Please make sure aarch64-tune.md is properly regenerated when committing as 
Alex has been adding new CPUs in there recently too.

Thanks,
Kyrill

> 
> kind regards,
> Przemyslaw Wirkus
> 
> gcc/ChangeLog:
> 
> * config/aarch64/aarch64-cores.def: Add Cortex-X1 Arm core.
> * config/aarch64/aarch64-tune.md: Regenerate.
> * doc/invoke.texi: Add -mtune=cortex-x1 docs.



Re: [PATCH] assorted improvements for fold_truth_andor_1

2020-09-29 Thread Alexandre Oliva
On Sep 29, 2020, Alexandre Oliva  wrote:

> Yeah, ifcombine's bb_no_side_effects_p gives up on any gimple_vuse in
> the inner block.  that won't do when the whole point is to merge loads
> from memory.

> That seems excessive.  Since we rule out any memory-changing side
> effects, I suppose we could get away with checking for volatile operands
> there.  Then, adding just a little SSA_DEF chasing, I believe I could
> bring all of the fold_truth_andor_1 logic I've worked on into ifcombine
> without much difficulty, and then we could do away with at least that
> part of fold_truth_andor.

Confirmed, a very ugly prototype seems to work!

-- 
Alexandre Oliva, happy hacker
https://FSFLA.org/blogs/lxo/
Free Software Activist
GNU Toolchain Engineer


RE: [PATCH][GCC 10] aarch64: Add support for Neoverse N2 CPU

2020-09-29 Thread Kyrylo Tkachov via Gcc-patches


> -Original Message-
> From: Alex Coplan 
> Sent: 29 September 2020 17:04
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw ; Richard Sandiford
> ; Kyrylo Tkachov 
> Subject: [PATCH][GCC 10] aarch64: Add support for Neoverse N2 CPU
> 
> Hello,
> 
> This patch backports the AArch64 support for Arm's Neoverse N2 CPU to
> GCC 10.
> 
> Testing:
>  * Bootstrapped and regtested on aarch64-none-linux-gnu.
> 
> OK for GCC 10 branch?

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Alex
> 
> ---
> 
> gcc/ChangeLog:
> 
>   * config/aarch64/aarch64-cores.def: Add Neoverse N2.
>   * config/aarch64/aarch64-tune.md: Regenerate.
>   * doc/invoke.texi: Document AArch64 support for Neoverse N2.



[PATCH][GCC 10] aarch64: Add support for Neoverse N2 CPU

2020-09-29 Thread Alex Coplan via Gcc-patches
Hello,

This patch backports the AArch64 support for Arm's Neoverse N2 CPU to
GCC 10.

Testing:
 * Bootstrapped and regtested on aarch64-none-linux-gnu.

OK for GCC 10 branch?

Thanks,
Alex

---

gcc/ChangeLog:

* config/aarch64/aarch64-cores.def: Add Neoverse N2.
* config/aarch64/aarch64-tune.md: Regenerate.
* doc/invoke.texi: Document AArch64 support for Neoverse N2.

diff --git a/gcc/config/aarch64/aarch64-cores.def 
b/gcc/config/aarch64/aarch64-cores.def
index a3bd56f5b43..94e6a6a1329 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -139,6 +139,9 @@ AARCH64_CORE("neoverse-v1", neoversev1, cortexa57, 8_4A,  
AARCH64_FL_FOR_ARCH8_4
 /* Qualcomm ('Q') cores. */
 AARCH64_CORE("saphira", saphira,saphira,8_4A,  
AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   0x51, 
0xC01, -1)
 
+/* Armv8.5-A Architecture Processors.  */
+AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, 8_5A, 
AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | 
AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | 
AARCH64_FL_MEMTAG, neoversen1, 0x41, 0xd49, -1)
+
 /* ARMv8-A big.LITTLE implementations.  */
 
 AARCH64_CORE("cortex-a57.cortex-a53",  cortexa57cortexa53, cortexa53, 8A,  
AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE 
(0xd07, 0xd03), -1)
diff --git a/gcc/config/aarch64/aarch64-tune.md 
b/gcc/config/aarch64/aarch64-tune.md
index 8e38052d6cf..7fda2294b8a 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55"
+   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55"
(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 0eb5b6bb135..4c08258bf57 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16976,8 +16976,8 @@ performance of the code.  Permissible values for this 
option are:
 @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75},
 @samp{cortex-a76}, @samp{cortex-a76ae}, @samp{cortex-a77},
 @samp{cortex-a65}, @samp{cortex-a65ae}, @samp{cortex-a34},
-@samp{ares}, @samp{exynos-m1}, @samp{emag}, @samp{falkor},
-@samp{neoverse-e1},@samp{neoverse-n1},@samp{neoverse-v1},@samp{qdf24xx},
+@samp{ares}, @samp{exynos-m1}, @samp{emag}, @samp{falkor}, @samp{neoverse-e1},
+@samp{neoverse-n1}, @samp{neoverse-n2}, @samp{neoverse-v1}, @samp{qdf24xx},
 @samp{saphira}, @samp{phecda}, @samp{xgene1}, @samp{vulcan}, @samp{octeontx},
 @samp{octeontx81},  @samp{octeontx83},
 @samp{octeontx2}, @samp{octeontx2t98}, @samp{octeontx2t96}


[PATCH] i386: Define __LAHF_SAHF__ and __MOVBE__ macros, based on ISA flags

2020-09-29 Thread Florian Weimer
It looks like these have been omitted by accident.

gcc/
* config/i386/i386-c.c (ix86_target_macros_internal): Define
__LAHF_SAHF__ and __MOVBE__ based on ISA flags.

---
 gcc/config/i386/i386-c.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 9da682ab05c..e647fce9ad4 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -594,6 +594,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
 def_or_undef (parse_in, "__AMX_INT8__");
   if (isa_flag2 & OPTION_MASK_ISA2_AMX_BF16)
 def_or_undef (parse_in, "__AMX_BF16__");
+  if (isa_flag & OPTION_MASK_ISA_SAHF)
+def_or_undef (parse_in, "__LAHF_SAHF__");
+  if (isa_flag2 & OPTION_MASK_ISA2_MOVBE)
+def_or_undef (parse_in, "__MOVBE__");
 
   if (TARGET_IAMCU)
 {


Re: Add trailing dots to fortran io fnspecs to match signature

2020-09-29 Thread Richard Biener
On September 29, 2020 4:20:42 PM GMT+02:00, Jan Hubicka  wrote:
>Hi,
>this patch is not needed but makes it possible to sanity check that
>fnspec match function signature. It turns out that there are quite few
>mistakes in that in trans-decl and one mistake here.
>Transfer_derived has additional parameters.

Hmm, omitting trailing dots was on purpose to make the string short (also 
consider varargs...).  You can still sanity check the prefix, no? 

>Bootstrapped/regtested x86_64-linux. OK?
>Honza
>
>   * trans-io.c (gfc_build_io_library_fndecls): Add trailing "." for
>   fnspecs so they match the number of parameters.
>diff --git a/gcc/fortran/trans-io.c b/gcc/fortran/trans-io.c
>index 21bdd5ef0d8..363cca51ef9 100644
>--- a/gcc/fortran/trans-io.c
>+++ b/gcc/fortran/trans-io.c
>@@ -328,86 +328,86 @@ gfc_build_io_library_fndecls (void)
>dt_parm_type = build_pointer_type (st_parameter[IOPARM_ptype_dt].type);
> 
> iocall[IOCALL_X_INTEGER] = gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_integer")), ".wW",
>+  get_identifier (PREFIX("transfer_integer")), ".wW.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
>iocall[IOCALL_X_INTEGER_WRITE] =
>gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_integer_write")), ".wR",
>+  get_identifier (PREFIX("transfer_integer_write")), ".wR.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
> iocall[IOCALL_X_LOGICAL] = gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_logical")), ".wW",
>+  get_identifier (PREFIX("transfer_logical")), ".wW.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
>iocall[IOCALL_X_LOGICAL_WRITE] =
>gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_logical_write")), ".wR",
>+  get_identifier (PREFIX("transfer_logical_write")), ".wR.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
>iocall[IOCALL_X_CHARACTER] = gfc_build_library_function_decl_with_spec
>(
>-  get_identifier (PREFIX("transfer_character")), ".wW",
>+  get_identifier (PREFIX("transfer_character")), ".wW.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node,
>gfc_charlen_type_node);
> 
>iocall[IOCALL_X_CHARACTER_WRITE] =
>gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_character_write")), ".wR",
>+  get_identifier (PREFIX("transfer_character_write")), ".wR.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node,
>gfc_charlen_type_node);
> 
>iocall[IOCALL_X_CHARACTER_WIDE] =
>gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_character_wide")), ".wW",
>+  get_identifier (PREFIX("transfer_character_wide")), ".wW..",
>   void_type_node, 4, dt_parm_type, pvoid_type_node,
>   gfc_charlen_type_node, gfc_int4_type_node);
> 
>   iocall[IOCALL_X_CHARACTER_WIDE_WRITE] =
> gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_character_wide_write")), ".wR",
>+  get_identifier (PREFIX("transfer_character_wide_write")), ".wR..",
>   void_type_node, 4, dt_parm_type, pvoid_type_node,
>   gfc_charlen_type_node, gfc_int4_type_node);
> 
>   iocall[IOCALL_X_REAL] = gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_real")), ".wW",
>+  get_identifier (PREFIX("transfer_real")), ".wW.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
>iocall[IOCALL_X_REAL_WRITE] = gfc_build_library_function_decl_with_spec
>(
>-  get_identifier (PREFIX("transfer_real_write")), ".wR",
>+  get_identifier (PREFIX("transfer_real_write")), ".wR.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
> iocall[IOCALL_X_COMPLEX] = gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_complex")), ".wW",
>+  get_identifier (PREFIX("transfer_complex")), ".wW.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
>iocall[IOCALL_X_COMPLEX_WRITE] =
>gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_complex_write")), ".wR",
>+  get_identifier (PREFIX("transfer_complex_write")), ".wR.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
>   /* Version for __float128.  */
> iocall[IOCALL_X_REAL128] = gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_real128")), ".wW",
>+  get_identifier (PREFIX("transfer_real128")), ".wW.",
>   void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
> 
>iocall[IOCALL_X_REAL128_WRITE] =
>gfc_build_library_function_decl_with_spec (
>-  get_identifier (PREFIX("transfer_real128_write")), ".wR",
>+  get_identifier (PREFIX("transfer_real128_write")), ".wR.",
>   

Re: Fix internal fnspec

2020-09-29 Thread Richard Biener
On September 29, 2020 4:17:30 PM GMT+02:00, Jan Hubicka  wrote:
>Hi,
>this patch fixes accidental \000 in fnspec strings for internal fns.
>OK?

OK. 

Richard. 

>Honza
>
>   * internal-fn.c (DEF_INTERNAL_FN): Fix call of build_string.
>diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
>index 8ea3195d31c..c8970820026 100644
>--- a/gcc/internal-fn.c
>+++ b/gcc/internal-fn.c
>@@ -93,7 +93,7 @@ init_internal_fns ()
> {
> #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) \
>   if (FNSPEC) internal_fn_fnspec_array[IFN_##CODE] = \
>-build_string ((int) sizeof (FNSPEC), FNSPEC ? FNSPEC : "");
>+build_string ((int) sizeof (FNSPEC) - 1, FNSPEC ? FNSPEC : "");
> #include "internal-fn.def"
>   internal_fn_fnspec_array[IFN_LAST] = 0;
> }



Re: Add trailing dots to fortran io fnspecs to match signature

2020-09-29 Thread Martin Liška

On 9/29/20 4:45 PM, Jan Hubicka wrote:

My fixup is longer:)


Heh. So please include my gcc_checking_assert hunk to your patch.

Martin


All strings starting with R or W are wrong.  However I have instances of
mismatched lengths, say for caf_register, deregister and others.
There are a few cases where I dropped the string,




c++: Identifier type value should not update binding

2020-09-29 Thread Nathan Sidwell


This simplification removes some unneeded behaviour in
set_identifier_type_value_with_scope, which was updating the namespace
binding.  And causing update_binding to have to deal with meeting two
implicit typedefs.  But the typedef is already there, and there's no
other way to have two such typedef's collide (we'll already have dealt
with that in lookup_elaborated_type).

So, let's kill this crufty code.

gcc/cp/
* name-lookup.c (update_binding): We never meet two implicit
typedefs.
(do_pushdecl): Adjust set_identifier_type_value_with_scope calls.
(set_identifier_type_value_with_scope): Do not update binding in
the namespace-case.  Assert it is already there.

pushing to trunk

nathan

--
Nathan Sidwell
diff --git i/gcc/cp/name-lookup.c w/gcc/cp/name-lookup.c
index 184e9c873e7..f195e81280a 100644
--- i/gcc/cp/name-lookup.c
+++ w/gcc/cp/name-lookup.c
@@ -2365,33 +2365,24 @@ update_binding (cp_binding_level *level, cxx_binding *binding, tree *slot,
   if (old == error_mark_node)
 old = NULL_TREE;
 
-  if (TREE_CODE (decl) == TYPE_DECL && DECL_ARTIFICIAL (decl))
+  if (DECL_IMPLICIT_TYPEDEF_P (decl))
 {
-  tree other = to_type;
-
-  if (old && TREE_CODE (old) == TYPE_DECL && DECL_ARTIFICIAL (old))
-	other = old;
-
-  /* Pushing an artificial typedef.  See if this matches either
-	 the type slot or the old value slot.  */
-  if (!other)
-	;
-  else if (same_type_p (TREE_TYPE (other), TREE_TYPE (decl)))
-	/* Two artificial decls to same type.  Do nothing.  */
-	return other;
-  else
-	goto conflict;
+  /* Pushing an artificial decl.  We should not find another
+ artificial decl here already -- lookup_elaborated_type will
+ have already found it.  */
+  gcc_checking_assert (!to_type
+			   && !(old && DECL_IMPLICIT_TYPEDEF_P (old)));
 
   if (old)
 	{
 	  /* Slide decl into the type slot, keep old unaltered  */
 	  to_type = decl;
 	  to_val = old;
-	  goto done;
 	}
+  goto done;
 }
 
-  if (old && TREE_CODE (old) == TYPE_DECL && DECL_ARTIFICIAL (old))
+  if (old && DECL_IMPLICIT_TYPEDEF_P (old))
 {
   /* Slide old into the type slot.  */
   to_type = old;
@@ -3122,7 +3113,7 @@ do_pushdecl (tree decl, bool hiding)
 
 	  if (TREE_CODE (decl) == NAMESPACE_DECL)
 	/* A local namespace alias.  */
-	set_identifier_type_value (name, NULL_TREE);
+	set_identifier_type_value_with_scope (name, NULL_TREE, level);
 
 	  if (!binding)
 	binding = create_local_binding (level, name);
@@ -3150,10 +3141,7 @@ do_pushdecl (tree decl, bool hiding)
 	  if (TYPE_NAME (type) != decl)
 		set_underlying_type (decl);
 
-	  if (!ns)
-		set_identifier_type_value_with_scope (name, decl, level);
-	  else
-		SET_IDENTIFIER_TYPE_VALUE (name, global_type_node);
+	  set_identifier_type_value_with_scope (name, decl, level);
 	}
 
 	  /* If this is a locally defined typedef in a function that
@@ -3768,8 +3756,9 @@ identifier_type_value (tree id)
 }
 
 /* Push a definition of struct, union or enum tag named ID.  into
-   binding_level B.  DECL is a TYPE_DECL for the type.  We assume that
-   the tag ID is not already defined.  */
+   binding_level B.  DECL is a TYPE_DECL for the type.  DECL has
+   already been pushed into its binding level.  This is bookkeeping to
+   find it easily.  */
 
 static void
 set_identifier_type_value_with_scope (tree id, tree decl, cp_binding_level *b)
@@ -3781,20 +3770,25 @@ set_identifier_type_value_with_scope (tree id, tree decl, cp_binding_level *b)
   /* Shadow the marker, not the real thing, so that the marker
 	 gets restored later.  */
   tree old_type_value = REAL_IDENTIFIER_TYPE_VALUE (id);
-  b->type_shadowed
-	= tree_cons (id, old_type_value, b->type_shadowed);
+  b->type_shadowed = tree_cons (id, old_type_value, b->type_shadowed);
   type = decl ? TREE_TYPE (decl) : NULL_TREE;
   TREE_TYPE (b->type_shadowed) = type;
 }
   else
 {
-  tree *slot = find_namespace_slot (current_namespace, id, true);
   gcc_assert (decl);
-  update_binding (b, NULL, slot, MAYBE_STAT_DECL (*slot), decl);
+  if (CHECKING_P)
+	{
+	  tree *slot = find_namespace_slot (current_namespace, id);
+	  gcc_checking_assert (slot
+			   && (decl == MAYBE_STAT_TYPE (*slot)
+   || decl == MAYBE_STAT_DECL (*slot)));
+	}
 
   /* Store marker instead of real type.  */
   type = global_type_node;
 }
+
   SET_IDENTIFIER_TYPE_VALUE (id, type);
 }
 


Re: Add trailing dots to fortran io fnspecs to match signature

2020-09-29 Thread Jan Hubicka
> On 9/29/20 4:20 PM, Jan Hubicka wrote:
> > this patch is not needed but makes it possible to sanity check that
> > fnspecs match the function signature. It turns out that there are quite a few
> 
> I'm sending the run-time sanity check patch and few more places that assert.
> I'm going to test the patch.
> 
> Martin

> From 3b1ff799c08f6e7adb3eb6fb988599197c639ec0 Mon Sep 17 00:00:00 2001
> From: Martin Liska 
> Date: Tue, 29 Sep 2020 16:35:12 +0200
> Subject: [PATCH] Add trailing dots to
>  gfc_build_library_function_decl_with_spec.
> 
> gcc/fortran/ChangeLog:
> 
>   * trans-decl.c (gfc_build_library_function_decl_with_spec): Add
>   runtime assert about length of SPEC argument.
>   (gfc_build_intrinsic_function_decls): Add trailing dots.
>   (gfc_build_builtin_function_decls): Likewise.

My fixup is longer :)
All strings starting with R or W are wrong.  However I have instances of
mismatched lengths, say for caf_register, deregister and others.
There are a few cases where I dropped the string,

Honza
diff --git a/gcc/fortran/trans-decl.c b/gcc/fortran/trans-decl.c
index 92242771dde..8a5537432be 100644
--- a/gcc/fortran/trans-decl.c
+++ b/gcc/fortran/trans-decl.c
@@ -3484,16 +3493,16 @@ gfc_build_intrinsic_function_decls (void)
   /* Misc. functions.  */
 
   gfor_fndecl_ttynam = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("ttynam")), ".W",
+   get_identifier (PREFIX("ttynam")), ".W..",
void_type_node, 3, pchar_type_node, gfc_charlen_type_node,
integer_type_node);
 
   gfor_fndecl_fdate = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("fdate")), ".W",
+   get_identifier (PREFIX("fdate")), ".W.",
void_type_node, 2, pchar_type_node, gfc_charlen_type_node);
 
   gfor_fndecl_ctime = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("ctime")), ".W",
+   get_identifier (PREFIX("ctime")), ".W..",
void_type_node, 3, pchar_type_node, gfc_charlen_type_node,
gfc_int8_type_node);
 
@@ -3514,8 +3523,8 @@ gfc_build_intrinsic_function_decls (void)
   DECL_PURE_P (gfor_fndecl_si_kind) = 1;
   TREE_NOTHROW (gfor_fndecl_si_kind) = 1;
 
-  gfor_fndecl_sr_kind = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("selected_real_kind2008")), ".RR",
+  gfor_fndecl_sr_kind = gfc_build_library_function_decl (
+   get_identifier (PREFIX("selected_real_kind2008")), 
gfc_int4_type_node, 3, pvoid_type_node, pvoid_type_node,
pvoid_type_node);
   DECL_PURE_P (gfor_fndecl_sr_kind) = 1;
@@ -3662,7 +3671,7 @@ gfc_build_intrinsic_function_decls (void)
   TREE_NOTHROW (gfor_fndecl_size0) = 1;
 
   gfor_fndecl_size1 = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("size1")), ".R",
+   get_identifier (PREFIX("size1")), ".R.",
gfc_array_index_type, 2, pvoid_type_node, gfc_array_index_type);
   DECL_PURE_P (gfor_fndecl_size1) = 1;
   TREE_NOTHROW (gfor_fndecl_size1) = 1;
@@ -3701,7 +3710,7 @@ gfc_build_builtin_function_decls (void)
   TREE_THIS_VOLATILE (gfor_fndecl_stop_numeric) = 1;
 
   gfor_fndecl_stop_string = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("stop_string")), ".R.",
+   get_identifier (PREFIX("stop_string")), ".R..",
void_type_node, 3, pchar_type_node, size_type_node,
boolean_type_node);
   /* STOP doesn't return.  */
@@ -3714,7 +3723,7 @@ gfc_build_builtin_function_decls (void)
   TREE_THIS_VOLATILE (gfor_fndecl_error_stop_numeric) = 1;
 
   gfor_fndecl_error_stop_string = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("error_stop_string")), ".R.",
+   get_identifier (PREFIX("error_stop_string")), ".R..",
void_type_node, 3, pchar_type_node, size_type_node,
boolean_type_node);
   /* ERROR STOP doesn't return.  */
@@ -3841,50 +3850,50 @@ gfc_build_builtin_function_decls (void)
get_identifier (PREFIX("caf_num_images")), integer_type_node,
2, integer_type_node, integer_type_node);
 
-  gfor_fndecl_caf_register = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("caf_register")), "RRR", void_type_node, 7,
+  gfor_fndecl_caf_register = gfc_build_library_function_decl (
+   get_identifier (PREFIX("caf_register")), void_type_node, 7,
size_type_node, integer_type_node, ppvoid_type_node, pvoid_type_node,
pint_type, pchar_type_node, size_type_node);
 
   gfor_fndecl_caf_deregister = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("caf_deregister")), "WRWWR", void_type_node, 5,
+   get_identifier (PREFIX("caf_deregister")), ".W.WW.", void_type_node, 5,
ppvoid_type_node, integer_type_node, pint_type, pchar_type_node,
size_type_node);
 
-  gfor_fndecl_caf_get = gfc_build_library_function_decl_with_spec (
-   get_identifier 

[PATCH][GCC][ARM] Add support for Cortex-X1

2020-09-29 Thread Przemyslaw Wirkus
Hi,

This change adds support for the Arm Cortex-X1 CPU. For more information about
this processor, see [0].

[0] : https://www.arm.com/products/cortex-x

OK for master branch ?

kind regards,
Przemyslaw Wirkus

gcc/ChangeLog:

  * config/arm/arm-cpus.in: Add Cortex-X1 core.
  * config/arm/arm-tables.opt: Regenerate.
  * config/arm/arm-tune.md: Regenerate.
  * doc/invoke.texi: Update docs.


rb13543.patch
Description: rb13543.patch


[PATCH][GCC][AArch64] Add support for Cortex-X1

2020-09-29 Thread Przemyslaw Wirkus
Hi,

This change adds support for the Arm Cortex-X1 CPU in AArch64 GCC. For more
information about this processor, see [0].

[0] : https://www.arm.com/products/cortex-x

OK for master branch ?

kind regards,
Przemyslaw Wirkus

gcc/ChangeLog:

* config/aarch64/aarch64-cores.def: Add Cortex-X1 Arm core.
* config/aarch64/aarch64-tune.md: Regenerate.
* doc/invoke.texi: Add -mtune=cortex-x1 docs.


rb13542.patch
Description: rb13542.patch


Re: Add trailing dots to fortran io fnspecs to match signature

2020-09-29 Thread Martin Liška

On 9/29/20 4:20 PM, Jan Hubicka wrote:

this patch is not needed but makes it possible to sanity check that
fnspecs match the function signature. It turns out that there are quite a few


I'm sending the run-time sanity check patch and few more places that assert.
I'm going to test the patch.

Martin
>From 3b1ff799c08f6e7adb3eb6fb988599197c639ec0 Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Tue, 29 Sep 2020 16:35:12 +0200
Subject: [PATCH] Add trailing dots to
 gfc_build_library_function_decl_with_spec.

gcc/fortran/ChangeLog:

	* trans-decl.c (gfc_build_library_function_decl_with_spec): Add
	runtime assert about length of SPEC argument.
	(gfc_build_intrinsic_function_decls): Add trailing dots.
	(gfc_build_builtin_function_decls): Likewise.
---
 gcc/fortran/trans-decl.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/gcc/fortran/trans-decl.c b/gcc/fortran/trans-decl.c
index 92242771dde..ef9c3f7d28f 100644
--- a/gcc/fortran/trans-decl.c
+++ b/gcc/fortran/trans-decl.c
@@ -3303,6 +3303,7 @@ tree
 gfc_build_library_function_decl_with_spec (tree name, const char *spec,
 	   tree rettype, int nargs, ...)
 {
+  gcc_checking_assert ((int)strlen (spec) == abs (nargs) + 1);
   tree ret;
   va_list args;
   va_start (args, nargs);
@@ -3484,16 +3485,16 @@ gfc_build_intrinsic_function_decls (void)
   /* Misc. functions.  */
 
   gfor_fndecl_ttynam = gfc_build_library_function_decl_with_spec (
-	get_identifier (PREFIX("ttynam")), ".W",
+	get_identifier (PREFIX("ttynam")), ".W..",
 	void_type_node, 3, pchar_type_node, gfc_charlen_type_node,
 	integer_type_node);
 
   gfor_fndecl_fdate = gfc_build_library_function_decl_with_spec (
-	get_identifier (PREFIX("fdate")), ".W",
+	get_identifier (PREFIX("fdate")), ".W.",
 	void_type_node, 2, pchar_type_node, gfc_charlen_type_node);
 
   gfor_fndecl_ctime = gfc_build_library_function_decl_with_spec (
-	get_identifier (PREFIX("ctime")), ".W",
+	get_identifier (PREFIX("ctime")), ".W..",
 	void_type_node, 3, pchar_type_node, gfc_charlen_type_node,
 	gfc_int8_type_node);
 
@@ -3515,7 +3516,7 @@ gfc_build_intrinsic_function_decls (void)
   TREE_NOTHROW (gfor_fndecl_si_kind) = 1;
 
   gfor_fndecl_sr_kind = gfc_build_library_function_decl_with_spec (
-	get_identifier (PREFIX("selected_real_kind2008")), ".RR",
+	get_identifier (PREFIX("selected_real_kind2008")), ".RR.",
 	gfc_int4_type_node, 3, pvoid_type_node, pvoid_type_node,
 	pvoid_type_node);
   DECL_PURE_P (gfor_fndecl_sr_kind) = 1;
@@ -3662,7 +3663,7 @@ gfc_build_intrinsic_function_decls (void)
   TREE_NOTHROW (gfor_fndecl_size0) = 1;
 
   gfor_fndecl_size1 = gfc_build_library_function_decl_with_spec (
-	get_identifier (PREFIX("size1")), ".R",
+	get_identifier (PREFIX("size1")), ".R.",
 	gfc_array_index_type, 2, pvoid_type_node, gfc_array_index_type);
   DECL_PURE_P (gfor_fndecl_size1) = 1;
   TREE_NOTHROW (gfor_fndecl_size1) = 1;
@@ -3701,7 +3702,7 @@ gfc_build_builtin_function_decls (void)
   TREE_THIS_VOLATILE (gfor_fndecl_stop_numeric) = 1;
 
   gfor_fndecl_stop_string = gfc_build_library_function_decl_with_spec (
-	get_identifier (PREFIX("stop_string")), ".R.",
+	get_identifier (PREFIX("stop_string")), ".R..",
 	void_type_node, 3, pchar_type_node, size_type_node,
 	boolean_type_node);
   /* STOP doesn't return.  */
@@ -3714,7 +3715,7 @@ gfc_build_builtin_function_decls (void)
   TREE_THIS_VOLATILE (gfor_fndecl_error_stop_numeric) = 1;
 
   gfor_fndecl_error_stop_string = gfc_build_library_function_decl_with_spec (
-	get_identifier (PREFIX("error_stop_string")), ".R.",
+	get_identifier (PREFIX("error_stop_string")), ".R..",
 	void_type_node, 3, pchar_type_node, size_type_node,
 	boolean_type_node);
   /* ERROR STOP doesn't return.  */
-- 
2.28.0



[PATCH] c++: Set the constraints of a class type sooner [PR96229]

2020-09-29 Thread Patrick Palka via Gcc-patches
In the testcase below, during processing (at parse time) of Y's base
class X, convert_template_argument calls is_compatible_template_arg
to check if the template argument Y is no more constrained than the
parameter P.  But at this point we haven't yet set Y's constraints, so
get_normalized_constraints_from_decl yields NULL_TREE as the normal form
and it caches this result in the normalized_map.

We set Y's constraints later in cp_parser_class_specifier_1 but the
stale normal form in the normalized_map remains.  This ultimately causes
us to miss the constraint failure for Y because according to the
cached normal form, it's not constrained.

This patch fixes this issue by moving up the call to
associate_classtype_constraints so that we set constraints before we
begin processing its bases.

Tested on x86_64-pc-linux-gnu, and also on the cmcstlv2 and range-v3
libraries.  Does this look OK to commit?

gcc/cp/ChangeLog:

PR c++/96229
* parser.c (cp_parser_class_specifier_1): Move call to
associate_classtype_constraints from here to ...
(cp_parser_class_head): ... here, before we process bases.
* pt.c (is_compatible_template_arg): Correct documentation to
say "argument is _no_ more constrained than the parameter".

gcc/testsuite/ChangeLog:

PR c++/96229
* g++.dg/cpp2a/concepts-class2.C: New test.
---
 gcc/cp/parser.c  |  8 
 gcc/cp/pt.c  |  7 ---
 gcc/testsuite/g++.dg/cpp2a/concepts-class2.C | 11 +++
 3 files changed, 19 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/concepts-class2.C

diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 8905833fbd6..b44bdf21e1d 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -23978,10 +23978,6 @@ cp_parser_class_specifier_1 (cp_parser* parser)
 = parser->in_unbraced_linkage_specification_p;
   parser->in_unbraced_linkage_specification_p = false;
 
-  // Associate constraints with the type.
-  if (flag_concepts)
-type = associate_classtype_constraints (type);
-
   /* Start the class.  */
   if (nested_name_specifier_p)
 {
@@ -24749,6 +24745,10 @@ cp_parser_class_head (cp_parser* parser,
   fixup_attribute_variants (type);
 }
 
+  /* Associate constraints with the type.  */
+  if (flag_concepts)
+type = associate_classtype_constraints (type);
+
   /* We will have entered the scope containing the class; the names of
  base classes should be looked up in that context.  For example:
 
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index 199fe658f71..96ad2025893 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -8126,9 +8126,10 @@ canonicalize_expr_argument (tree arg, tsubst_flags_t 
complain)
   return canon;
 }
 
-// A template declaration can be substituted for a constrained
-// template template parameter only when the argument is more
-// constrained than the parameter.
+/* A template declaration can be substituted for a constrained
+   template template parameter only when the argument is no more
+   constrained than the parameter.  */
+
 static bool
 is_compatible_template_arg (tree parm, tree arg)
 {
diff --git a/gcc/testsuite/g++.dg/cpp2a/concepts-class2.C 
b/gcc/testsuite/g++.dg/cpp2a/concepts-class2.C
new file mode 100644
index 000..0ed9eb0a386
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/concepts-class2.C
@@ -0,0 +1,11 @@
+// PR c++/96229
+// { dg-do compile { target c++20 } }
+
+template  concept Int = requires { T{0}; };
+template  class P> struct X{ };
+template struct Y : X { };
+  struct Z{ };
+  struct W{ int i; };
+
+Y z; // { dg-error "constraint" }
+Y w;
-- 
2.28.0.618.g9bc233ae1c



Add trailing dots to fortran io fnspecs to match signature

2020-09-29 Thread Jan Hubicka
Hi,
this patch is not needed but makes it possible to sanity check that
fnspecs match the function signature. It turns out that there are quite a few
mistakes in that in trans-decl and one mistake here.
Transfer_derived has additional parameters.

Bootstrapped/regtested x86_64-linux. OK?
Honza

* trans-io.c (gfc_build_io_library_fndecls): Add trailing "." for
fnspecs so they match the number of parameters.
diff --git a/gcc/fortran/trans-io.c b/gcc/fortran/trans-io.c
index 21bdd5ef0d8..363cca51ef9 100644
--- a/gcc/fortran/trans-io.c
+++ b/gcc/fortran/trans-io.c
@@ -328,86 +328,86 @@ gfc_build_io_library_fndecls (void)
   dt_parm_type = build_pointer_type (st_parameter[IOPARM_ptype_dt].type);
 
   iocall[IOCALL_X_INTEGER] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_integer")), ".wW",
+   get_identifier (PREFIX("transfer_integer")), ".wW.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_INTEGER_WRITE] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_integer_write")), ".wR",
+   get_identifier (PREFIX("transfer_integer_write")), ".wR.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_LOGICAL] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_logical")), ".wW",
+   get_identifier (PREFIX("transfer_logical")), ".wW.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_LOGICAL_WRITE] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_logical_write")), ".wR",
+   get_identifier (PREFIX("transfer_logical_write")), ".wR.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_CHARACTER] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_character")), ".wW",
+   get_identifier (PREFIX("transfer_character")), ".wW.",
void_type_node, 3, dt_parm_type, pvoid_type_node, 
gfc_charlen_type_node);
 
   iocall[IOCALL_X_CHARACTER_WRITE] = gfc_build_library_function_decl_with_spec 
(
-   get_identifier (PREFIX("transfer_character_write")), ".wR",
+   get_identifier (PREFIX("transfer_character_write")), ".wR.",
void_type_node, 3, dt_parm_type, pvoid_type_node, 
gfc_charlen_type_node);
 
   iocall[IOCALL_X_CHARACTER_WIDE] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_character_wide")), ".wW",
+   get_identifier (PREFIX("transfer_character_wide")), ".wW..",
void_type_node, 4, dt_parm_type, pvoid_type_node,
gfc_charlen_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_CHARACTER_WIDE_WRITE] =
 gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_character_wide_write")), ".wR",
+   get_identifier (PREFIX("transfer_character_wide_write")), ".wR..",
void_type_node, 4, dt_parm_type, pvoid_type_node,
gfc_charlen_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_REAL] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_real")), ".wW",
+   get_identifier (PREFIX("transfer_real")), ".wW.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_REAL_WRITE] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_real_write")), ".wR",
+   get_identifier (PREFIX("transfer_real_write")), ".wR.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_COMPLEX] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_complex")), ".wW",
+   get_identifier (PREFIX("transfer_complex")), ".wW.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_COMPLEX_WRITE] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_complex_write")), ".wR",
+   get_identifier (PREFIX("transfer_complex_write")), ".wR.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   /* Version for __float128.  */
   iocall[IOCALL_X_REAL128] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_real128")), ".wW",
+   get_identifier (PREFIX("transfer_real128")), ".wW.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_REAL128_WRITE] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_real128_write")), ".wR",
+   get_identifier (PREFIX("transfer_real128_write")), ".wR.",
void_type_node, 3, dt_parm_type, pvoid_type_node, gfc_int4_type_node);
 
   iocall[IOCALL_X_COMPLEX128] = gfc_build_library_function_decl_with_spec (
-   get_identifier (PREFIX("transfer_complex128")), ".wW",
+   

Fix internal fnspec

2020-09-29 Thread Jan Hubicka
Hi,
this patch fixes accidental \000 in fnspec strings for internal fns.
OK?
Honza

* internal-fn.c (DEF_INTERNAL_FN): Fix call of build_string.
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 8ea3195d31c..c8970820026 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -93,7 +93,7 @@ init_internal_fns ()
 {
 #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) \
   if (FNSPEC) internal_fn_fnspec_array[IFN_##CODE] = \
-build_string ((int) sizeof (FNSPEC), FNSPEC ? FNSPEC : "");
+build_string ((int) sizeof (FNSPEC) - 1, FNSPEC ? FNSPEC : "");
 #include "internal-fn.def"
   internal_fn_fnspec_array[IFN_LAST] = 0;
 }


[PATCH] tree-optimization/97241 - fix ICE in reduction vectorization

2020-09-29 Thread Richard Biener
The following moves an ad-hoc attempt at discovering the SLP node
for a stmt to the place where we can find it in lock-step when
we find the stmt itself.

Bootstrapped / tested on x86_64-unknown-linux-gnu, pushed.

2020-09-29  Richard Biener  

PR tree-optimization/97241
* tree-vect-loop.c (vectorizable_reduction): Move finding
the SLP node for the reduction stmt to a better place.

* gcc.dg/vect/pr97241.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/pr97241.c | 19 +++
 gcc/tree-vect-loop.c| 17 +
 2 files changed, 24 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr97241.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr97241.c 
b/gcc/testsuite/gcc.dg/vect/pr97241.c
new file mode 100644
index 000..d4be8f60940
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr97241.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param max-loop-header-insns=2" } */
+
+short int *ev;
+int l4;
+
+short int
+a7 (void)
+{
+  short int uo = ev[0], ie = uo;
+
+  for (int kp = 0; kp < l4; kp += 4)
+{
+  uo += ev[kp + 1];
+  ie += ev[kp];
+}
+
+  return uo + ie;
+}
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index f1d6bdde412..ce5d95d7277 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6357,12 +6357,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   gphi *reduc_def_phi = as_a  (phi_info->stmt);
 
   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
- and compute the reduction chain length.  */
+ and compute the reduction chain length.  Discover the real
+ reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
   tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
  loop_latch_edge (loop));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
+  slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
 {
   stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
@@ -6405,6 +6407,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
stmt_info = vdef;
   reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
   reduc_chain_length++;
+  if (!stmt_info && slp_node)
+   slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
 }
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
@@ -6491,17 +6495,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
  The last use is the reduction variable.  In case of nested cycle this
  assumption is not true: we use reduc_index to record the index of the
  reduction variable.  */
-  /* ???  To get at invariant/constant uses on the SLP node we have to
- get to it here, slp_node is still the reduction PHI.  */
-  slp_tree slp_for_stmt_info = NULL;
-  if (slp_node)
-{
-  slp_for_stmt_info = slp_node_instance->root;
-  /* And then there's reduction chain with a conversion ...  */
-  if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
-   slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
-  gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
-}
   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
   /* We need to skip an extra operand for COND_EXPRs with embedded
  comparison.  */
-- 
2.26.2


Re: BoF DWARF5 patches (25% .debug section size reduction)

2020-09-29 Thread Mark Wielaard
On Thu, 2020-09-10 at 13:16 +0200, Jakub Jelinek wrote:
> On Wed, Sep 09, 2020 at 09:57:54PM +0200, Mark Wielaard wrote:
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -9057,13 +9057,14 @@ possible.
> >  @opindex gdwarf
> >  Produce debugging information in DWARF format (if that is supported).
> >  The value of @var{version} may be either 2, 3, 4 or 5; the default version
> > -for most targets is 4.  DWARF Version 5 is only experimental.
> > +for most targets is 5 (with the exception of vxworks and darwin which
> > +default to version 2).
> 
> I think in documentation we should spell these VxWorks and Darwin/Mac OS X

OK. As attached.

Are we ready to flip the default to 5?

Thanks,

Mark
From 409bd1b2c60905b0f96c94fface12154d3be4d32 Mon Sep 17 00:00:00 2001
From: Mark Wielaard 
Date: Tue, 29 Sep 2020 15:52:44 +0200
Subject: [PATCH] Default to DWARF5

gcc/ChangeLog:

	* common.opt (gdwarf-): Init(5).
	* doc/invoke.texi (-gdwarf): Document default to 5.
---
 gcc/common.opt  | 2 +-
 gcc/doc/invoke.texi | 7 ---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/common.opt b/gcc/common.opt
index 292c2de694ef..d1722de80bf0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3148,7 +3148,7 @@ Common Driver JoinedOrMissing Negative(gdwarf-)
 Generate debug information in default version of DWARF format.
 
 gdwarf-
-Common Driver Joined UInteger Var(dwarf_version) Init(4) Negative(gstabs)
+Common Driver Joined UInteger Var(dwarf_version) Init(5) Negative(gstabs)
 Generate debug information in DWARF v2 (or later) format.
 
 ggdb
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2091e0cd23b9..e6453374bcd4 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -9210,14 +9210,15 @@ possible.
 @itemx -gdwarf-@var{version}
 @opindex gdwarf
 Produce debugging information in DWARF format (if that is supported).
-The value of @var{version} may be either 2, 3, 4 or 5; the default version
-for most targets is 4.  DWARF Version 5 is only experimental.
+The value of @var{version} may be either 2, 3, 4 or 5; the default
+version for most targets is 5 (with the exception of VxWorks and
+Darwin/Mac OS X which default to version 2).
 
 Note that with DWARF Version 2, some ports require and always
 use some non-conflicting DWARF 3 extensions in the unwind tables.
 
 Version 4 may require GDB 7.0 and @option{-fvar-tracking-assignments}
-for maximum benefit.
+for maximum benefit. Version 5 requires GDB 8.0 or higher.
 
 GCC no longer supports DWARF Version 1, which is substantially
 different than Version 2 and later.  For historical reasons, some
-- 
2.18.4



Re: PING^3 [GCC 10] [PATCH] IRA: Don't make a global register eliminable

2020-09-29 Thread Vladimir Makarov via Gcc-patches



On 2020-09-29 8:38 a.m., H.J. Lu wrote:

On Fri, Sep 25, 2020 at 6:46 AM H.J. Lu  wrote:

OK for GCC 10 branch?

Thanks.

PING:

https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554268.html


PING.


PING.

Sorry, I thought Jeff Law already approved this.  In any case the patch 
is also ok for me for the trunk and gcc-10 branch.





RE: [PATCH][GCC 8] arm: Add support for Neoverse V1 CPU

2020-09-29 Thread Kyrylo Tkachov


> -Original Message-
> From: Alex Coplan 
> Sent: 29 September 2020 13:17
> To: gcc-patches@gcc.gnu.org
> Cc: ni...@redhat.com; Richard Earnshaw ;
> Ramana Radhakrishnan ; Kyrylo
> Tkachov 
> Subject: [PATCH][GCC 8] arm: Add support for Neoverse V1 CPU
> 
> Hello,
> 
> This patch backports the AArch32 support for Arm's Neoverse V1 CPU to
> GCC 8.
> 
> Testing:
>  * Bootstrapped and regtested on arm-none-linux-gnueabihf.
> 
> OK for GCC 8 branch?

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Alex
> 
> ---
> 
> gcc/ChangeLog:
> 
>   * config/arm/arm-cpus.in (neoverse-v1): New.
>   * config/arm/arm-tables.opt: Regenerate.
>   * config/arm/arm-tune.md: Regenerate.
>   * doc/invoke.texi: Document AArch32 support for Neoverse V1.


Re: [PATCH] Fortran : Two further previously missed ICEs PR53298

2020-09-29 Thread Mark Eggleston



On 16/09/2020 08:02, Andre Vehreschild wrote:

Hi Mark,

a few remarks:

[...]


[PATCH] Fortran  : Two further previously missed ICEs PR53298

There were 3 ICEs with different call stacks in the comments of this
PR.  A previous commit fixed only one of those ICEs.

The ICEs fixed here are in trans-array.c and trans-expr.c.

The first ICE occurred when the array reference is not AR_ELEMENT
gfc_conv_scalarized_array_ref is called with se and ar, if se->ss is
NULL the ICE occurs.  If se->ss is NULL there is nothing to do before
the return.

The second ICE occurs in code that did not match its comments.  Fixing
the code to match the comments fixes the ICE.  A side affect is that
the in the tree dumps for finalize_35.f90 and finalize_36.f90 contain

   ^^^
   Spurious "the" found.

wording has been updated.


[...]

diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c
index 6566c47d4ae..06268739515 100644
--- a/gcc/fortran/trans-array.c
+++ b/gcc/fortran/trans-array.c
@@ -3638,8 +3638,11 @@ gfc_conv_array_ref (gfc_se * se, gfc_array_ref *
ar, gfc_expr *expr, /* Handle scalarized references separately.  */
if (ar->type != AR_ELEMENT)
  {
-  gfc_conv_scalarized_array_ref (se, ar);
-  gfc_advance_se_ss_chain (se);
+  if (se->ss)
+   {
+ gfc_conv_scalarized_array_ref (se, ar);
+ gfc_advance_se_ss_chain (se);
+   }

Why is this only in element ref needed and not every else? When I tried
to fix ICEs this way, I was usually asked if was fixing symptom and not
the error.


This is for references that aren't elements. As I understand it se 
corresponds to the upper bound, e.g. for a(1:) the upper bound may not 
be known at this point so se is NULL resulting in a crash due to it 
being de-referenced in gfc_conv_scalarized_array_ref.




return;
  }
  
diff --git a/gcc/fortran/trans-expr.c b/gcc/fortran/trans-expr.c

index 36ff9b5cbc6..193553ace0b 100644
--- a/gcc/fortran/trans-expr.c
+++ b/gcc/fortran/trans-expr.c
@@ -2474,8 +2474,8 @@ gfc_conv_component_ref (gfc_se * se, gfc_ref *
ref) RECORD_TYPE within a UNION_TYPE) always use the given FIELD_DECL.
*/
if (context != TREE_TYPE (decl)
-  && !(   TREE_CODE (TREE_TYPE (field)) == UNION_TYPE /* Field is
union */
-   || TREE_CODE (context) == UNION_TYPE)) /* Field is
map */
+  && (   TREE_CODE (context) == UNION_TYPE /* Field is union */
+  || TREE_CODE (context) == MAP_TYPE)) /* Field is map */
  {
tree f2 = c->norestrict_decl;
if (!f2 || DECL_FIELD_CONTEXT (f2) != TREE_TYPE (decl))

(Sorry for the line breaks).

I can't help it, but the old code looked so dubious that I wonder why
it worked in the first place. Have you tried with a mapped type?

structure /st/
  union
    map
  character(5) a, b
    end map
    map
  character(10) c
    end map
  end union
end structure

record /st/ r

r.c = "abcde12345"
write(*,*) r.a(2:), r.b(3:), r.c(6:)

end

outputs (as expected) with and without the patch

 bcde34512345

I can add this as an extra test case if required.



Regards,
Andre



--
https://www.codethink.co.uk/privacy.html



Re: Implement iterative dataflow in modref to track parameters

2020-09-29 Thread Martin Liška

On 9/29/20 10:13 AM, Martin Liška wrote:

Hello.

The patch caused:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97235

Martin


And these 2 PRs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97243
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97244

Thanks,
Martin


[PATCH] arm: Fix ICEs in no-literal-pool.c on MVE

2020-09-29 Thread Alex Coplan
Hello,

This patch fixes ICEs when compiling
gcc/testsuite/gcc.target/arm/pure-code/no-literal-pool.c with
-mfp16-format=ieee -mfloat-abi=hard -march=armv8.1-m.main+mve
-mpure-code.

The existing conditions in the movsf/movdf expanders (as well as the
no_literal_pool patterns) were too restrictive, requiring
TARGET_HARD_FLOAT instead of TARGET_VFP_BASE, which caused unrecognised
insns when compiling this testcase with integer MVE and -mpure-code.

Testing:
 * Bootstrapped and regtested on arm-none-linux-gnueabihf.
 * Regtested an MVE cross build.

Comparison of test results before/after patch on MVE build:

UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O0   
scan-assembler-not \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O0   
scan-assembler text,"0x2006"
FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O0  (test for excess 
errors)
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O1   
scan-assembler-not \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O1   
scan-assembler text,"0x2006"
FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O1  (test for excess 
errors)
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto 
-fno-use-linker-plugin -flto-partition=none -ffat-lto-objects  
scan-assembler-not \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto 
-fno-use-linker-plugin -flto-partition=none -ffat-lto-objects  scan-assembler 
text,"0x2006"
FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto 
-fno-use-linker-plugin -flto-partition=none -ffat-lto-objects (test for excess 
errors)
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects -ffat-lto-objects  scan-assembler-not 
\\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects -ffat-lto-objects  scan-assembler 
text,"0x2006"
FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c  -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects -ffat-lto-objects (test for excess 
errors)
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O2   
scan-assembler-not \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O2   
scan-assembler text,"0x2006"
FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O2  (test for excess 
errors)
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O3 
-fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions   
scan-assembler-not \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O3 
-fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions   
scan-assembler text,"0x2006"
FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -O3 
-fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions  
(test for excess errors)
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -Os   
scan-assembler-not \\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
UNRESOLVED->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -Os   
scan-assembler text,"0x2006"
FAIL->PASS: gcc.target/arm/pure-code/no-literal-pool.c   -Os  (test for excess 
errors)
UNRESOLVED->PASS: gcc.target/arm/thumb2-slow-flash-data-1.c scan-assembler-not 
\\.(float|l\\?double|d?byte|short|int|long|quad|word)\\s+[^.]
FAIL->PASS: gcc.target/arm/thumb2-slow-flash-data-1.c (test for excess errors)

OK for trunk?

Thanks,
Alex

---

gcc/ChangeLog:

* config/arm/arm.md (movsf): Relax TARGET_HARD_FLOAT to
TARGET_VFP_BASE.
(movdf): Likewise.
* config/arm/vfp.md (no_literal_pool_df_immediate): Likewise.
(no_literal_pool_sf_immediate): Likewise.
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 147c4a50c72..1a8e498ba4c 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -7357,7 +7357,7 @@ (define_expand "movsf"
   if (arm_disable_literal_pool
   && (REG_P (operands[0]) || SUBREG_P (operands[0]))
   && CONST_DOUBLE_P (operands[1])
-  && TARGET_HARD_FLOAT
+  && TARGET_VFP_BASE
   && !vfp3_const_double_rtx (operands[1]))
 {
   rtx clobreg = gen_reg_rtx (SFmode);
@@ -7454,7 +7454,7 @@ (define_expand "movdf"
   if (arm_disable_literal_pool
   && (REG_P (operands[0]) || SUBREG_P (operands[0]))
   && CONSTANT_P (operands[1])
-  && TARGET_HARD_FLOAT
+  && TARGET_VFP_BASE
   && !arm_const_double_rtx (operands[1])
   && !(TARGET_VFP_DOUBLE && vfp3_const_double_rtx (operands[1])))
 {
diff --git 

Re: [PATCH] assorted improvements for fold_truth_andor_1

2020-09-29 Thread Alexandre Oliva
On Sep 29, 2020, Richard Biener  wrote:

> On Tue, Sep 29, 2020 at 9:23 AM Alexandre Oliva  wrote:

>> On Sep 28, 2020, Richard Biener  wrote:

> ifcombine should stop using fold*, yeah

Wow, that's quite a lot of work for no expected improvement in codegen.
I don't expect to be able to justify such an undertaking :-(

> I also think it will not end up using the simplifications using loads.

Yeah, ifcombine's bb_no_side_effects_p gives up on any gimple_vuse in
the inner block.  That won't do when the whole point is to merge loads
from memory.

That seems excessive.  Since we rule out any memory-changing side
effects, I suppose we could get away with checking for volatile operands
there.  Then, adding just a little SSA_DEF chasing, I believe I could
bring all of the fold_truth_andor_1 logic I've worked on into ifcombine
without much difficulty, and then we could do away with at least that
part of fold_truth_andor.


> Specifically your patch seems to introduce splitting of loads
> at alignment boundaries

... when there's another compare involving a load from either side of
the crossed alignment boundary.  Even on arches that can do unaligned
loads, the result is no worse, and if there are multiple fields crossing
consecutive alignment boundaries, the codegen and performance difference
can be pretty significant.



>> I *think* ifcombine could even be extended so as to reuse the
>> separate-test logic I put in, by looking for non-immediate dominating
>> outer conditions for the inner condition.  A further modified version of
>> fold_truth_andor_1 could then be used to combine the separate tests.

> I think the structure of ifcombine doesn't exactly match what
> fold_truth_andor does

How so?  AFAICT ifcombine_ifandif deals exactly with the (gimplified
version of the) structure I described in the patch that started the
thread:

  (a.x1 EQNE b.x1)  ANDOR  (a.y1 EQNE b.y1)


-- 
Alexandre Oliva, happy hacker
https://FSFLA.org/blogs/lxo/
Free Software Activist
GNU Toolchain Engineer


Re: [PATCH] pass: Run cleanup passes before SLP [PR96789]

2020-09-29 Thread Kewen.Lin via Gcc-patches
Hi Richard,

Thanks for the comments!

> diff --git a/gcc/tree-ssa-loop-ivcanon.c b/gcc/tree-ssa-loop-ivcanon.c
> index 298ab215530..7016f993339 100644
> --- a/gcc/tree-ssa-loop-ivcanon.c
> +++ b/gcc/tree-ssa-loop-ivcanon.c
> @@ -1605,6 +1605,14 @@ pass_complete_unroll::execute (function *fun)
>  peeled_loops = BITMAP_ALLOC (NULL);
>unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size,
>true);
> +
> +  /* There are no loops after unrolling, we assume that it's not so costly
> + to do the scalar cleanup since here.  FIXME: Some heuristics can be
> + further added to guard the cost level, like nodes number total, all
> + the original loops should be with single exits, etc.  */
> +  if (!current_loops->tree_root->inner)
> +val |= TODO_force_next_scalar_cleanup;
> +
> 
> so this is not the appropriate way to guard this.  Instead in
> 
> static unsigned int
> tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
> {
> 
> look where we do
> 
>   bitmap fathers = BITMAP_ALLOC (NULL);
>   EXECUTE_IF_SET_IN_BITMAP (father_bbs, 0, i, bi)
> {
>   basic_block unrolled_loop_bb = BASIC_BLOCK_FOR_FN (cfun, i);
>   if (! unrolled_loop_bb)
> continue;
>   if (loop_outer (unrolled_loop_bb->loop_father))
> bitmap_set_bit (fathers,
> unrolled_loop_bb->loop_father->num);
> 
> and in the else case return TODO_force_next_scalar_cleanup because
> then we know we have unrolled an outermost loop and not ran VN
> immediately on it.

OK, I'll move it there with:
  1) set one bool var once we have any outermost loop unrolled when iterating.
  2) after iterating, check whether no loops, flag TODO if yes.

I had one question that we will have some heuristics here to guard this cleanup
to execute or not, then do we want SEME VN to run on each outermost unrolled
loop with some pre-record info?  Not sure whether it's worthy, though it can
help a bit when the cleanup isn't triggered.

> 
> +/* Scalar cleanup, it has several gated cleanup passes like FRE, DSE.  */
> +
> +namespace {
> +
> +const pass_data pass_data_scalar_cleanup =
> +{
> +  GIMPLE_PASS, /* type */
> +  "*scalar_cleanup", /* name */
> +  OPTGROUP_LOOP, /* optinfo_flags */
> 
> this new "pass" doesn't have to do anything with tree-ssa-loop-ivcanon.c
> so please add it to passes.c instead (there's already a bunch of
> pass definitions in there).

Will fix.

> 
> Can you repeat the compile-time measurement there?  I also wonder
> whether we should worry about compile-time at -O[12] when SLP is not run.
> Thus, probably rename the cleanup pass to pre_slp_scalar_cleanup and
> gate it on && flag_slp_vectorize

Good idea, will evaluate it.

> 
> Note this is probably the cleanest way to implement this hack.  But it
> still is what it is - a hack.  Not a proper fix for whatever the actual issue 
> is
> which means I'm not the one that's going to ack it (since I've suggested it).
> 

Got it.  Thanks for the suggestion again!  :-)

BR,
Kewen


[PATCH] Fortran : ICE in build_field PR95614 (2nd attempt)

2020-09-29 Thread Mark Eggleston

For review.

When the first attempt was committed the result was PR97224 i.e. it 
broke the build of the SPECCPU 2006 gamess benchmark.


I've changed the condition under which the error is produced. It was 
produced if the local symbol was also found as a global symbol and 
the type of the symbol was not GSYM_UNKNOWN and not GSYM_COMMON.  This 
meant that subroutine names in commons in the SPECCPU source code were 
rejected.


The condition now produces an error if the global symbol is either 
GSYM_MODULE or GSYM_PROGRAM.


The relevant section in the standard (19.3.1 (2)):

"Within its scope, a local identifier of an entity of class (1) or class 
(4) shall not be the same as a global identifier used in that scope 
unless the global identifier


 * is used only as the use-name of a rename in a USE statement,
 * is a common block name (19.3.2),
 * is an external procedure name that is also a generic name, or
 * is an external function name and the inclusive scope is its defining
   subprogram (19.3.3)."

I've added two new test cases for subroutine and function.

I'm not certain about the restriction that the external procedure should 
be a generic name. I have found the earlier standards somewhat confusing 
on the subject, so I haven't determined whether there should be any 
standards dependent code.


--
https://www.codethink.co.uk/privacy.html

>From 56cd489ae564640e6cd397250f71947d768b796c Mon Sep 17 00:00:00 2001
From: Mark Eggleston 
Date: Thu, 11 Jun 2020 14:33:51 +0100
Subject: [PATCH] Fortran  :  ICE in build_field PR95614

Local identifiers can not be the same as a module name.  Original
patch by Steve Kargl resulted in name clashes between common block
names and local identifiers.  A local identifier can be the same as
a global identifier if that identifier is not a module or a program.
The original patch was modified to reject global identifiers that
represent a module or a program.

2020-09-29  Steven G. Kargl  
	Mark Eggleston  

gcc/fortran/

	PR fortran/95614
	* decl.c (gfc_get_common): Use gfc_match_common_name instead
	of match_common_name.
	* decl.c (gfc_bind_idents): Use gfc_match_common_name instead
	of match_common_name.
	* match.c : Rename match_common_name to gfc_match_common_name.
	* match.c (gfc_match_common): Use gfc_match_common_name instead
	of match_common_name.
	* match.h : Rename match_common_name to gfc_match_common_name.
	* resolve.c (resolve_common_vars): Check each symbol in a
	common block has a global symbol.  If there is a global symbol
	issue an error if the symbol type is a module or a program.

2020-09-29  Mark Eggleston  

gcc/testsuite/

	PR fortran/95614
	* gfortran.dg/pr95614_1.f90: New test.
	* gfortran.dg/pr95614_2.f90: New test.
	* gfortran.dg/pr95614_3.f90: New test.
	* gfortran.dg/pr95614_4.f90: New test.
---
 gcc/fortran/decl.c  | 4 ++--
 gcc/fortran/match.c | 5 +++--
 gcc/fortran/match.h | 6 ++
 gcc/fortran/resolve.c   | 7 +++
 gcc/testsuite/gfortran.dg/pr95614_1.f90 | 6 ++
 gcc/testsuite/gfortran.dg/pr95614_2.f90 | 6 ++
 gcc/testsuite/gfortran.dg/pr95614_3.f90 | 9 +
 gcc/testsuite/gfortran.dg/pr95614_4.f90 | 9 +
 8 files changed, 44 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/pr95614_1.f90
 create mode 100644 gcc/testsuite/gfortran.dg/pr95614_2.f90
 create mode 100644 gcc/testsuite/gfortran.dg/pr95614_3.f90
 create mode 100644 gcc/testsuite/gfortran.dg/pr95614_4.f90

diff --git a/gcc/fortran/decl.c b/gcc/fortran/decl.c
index 326e6f5db7a..9bfaa60418a 100644
--- a/gcc/fortran/decl.c
+++ b/gcc/fortran/decl.c
@@ -6007,7 +6007,7 @@ get_bind_c_idents (void)
   found_id = MATCH_YES;
   gfc_get_ha_symbol (name, _sym);
 }
-  else if (match_common_name (name) == MATCH_YES)
+  else if (gfc_match_common_name (name) == MATCH_YES)
 {
   found_id = MATCH_YES;
   com_block = gfc_get_common (name, 0);
@@ -6052,7 +6052,7 @@ get_bind_c_idents (void)
 	  found_id = MATCH_YES;
 	  gfc_get_ha_symbol (name, _sym);
 	}
-	  else if (match_common_name (name) == MATCH_YES)
+	  else if (gfc_match_common_name (name) == MATCH_YES)
 	{
 	  found_id = MATCH_YES;
 	  com_block = gfc_get_common (name, 0);
diff --git a/gcc/fortran/match.c b/gcc/fortran/match.c
index cb09c5f8ec5..bee73e7b008 100644
--- a/gcc/fortran/match.c
+++ b/gcc/fortran/match.c
@@ -5166,7 +5166,8 @@ gfc_get_common (const char *name, int from_module)
 
 /* Match a common block name.  */
 
-match match_common_name (char *name)
+match
+gfc_match_common_name (char *name)
 {
   match m;
 
@@ -5218,7 +5219,7 @@ gfc_match_common (void)
 
   for (;;)
 {
-  m = match_common_name (name);
+  m = gfc_match_common_name (name);
   if (m == MATCH_ERROR)
 	goto cleanup;
 
diff --git a/gcc/fortran/match.h b/gcc/fortran/match.h
index 7bf70d77016..4ccb5961d2b 100644
--- a/gcc/fortran/match.h
+++ b/gcc/fortran/match.h
@@ -103,11 +103,9 @@ match 

[PATCH] move permute optimization to optimize-slp

2020-09-29 Thread Richard Biener
This moves optimizing permutes of SLP reductions to vect_optimize_slp,
eliding the global slp_loads array.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2020-09-29  Richard Biener  

* tree-vect-slp.c (vect_analyze_slp): Move SLP reduction
re-arrangement and SLP graph load gathering...
(vect_optimize_slp): ... here.
* tree-vectorizer.h (vec_info::slp_loads): Remove.
---
 gcc/tree-vect-slp.c   | 19 ++-
 gcc/tree-vectorizer.h |  1 -
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index c44fd396bf0..8de24802538 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2486,8 +2486,15 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
   vect_free_slp_tree ((*it).second);
   delete bst_map;
 
+  return opt_result::success ();
+}
+
+void
+vect_optimize_slp (vec_info *vinfo)
+{
   /* Optimize permutations in SLP reductions.  */
   slp_instance instance;
+  unsigned i;
   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
 {
   slp_tree node = SLP_INSTANCE_TREE (instance);
@@ -2500,20 +2507,14 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
 }
 
   /* Gather all loads in the SLP graph.  */
+  auto_vec slp_loads;
   hash_set visited;
   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
-vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
+vect_gather_slp_loads (slp_loads, SLP_INSTANCE_TREE (instance),
   visited);
 
-  return opt_result::success ();
-}
-
-void
-vect_optimize_slp (vec_info *vinfo)
-{
   slp_tree node;
-  unsigned i;
-  FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
+  FOR_EACH_VEC_ELT (slp_loads, i, node)
 {
   if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
continue;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b7fa6bc8d2f..e62f1ccee8d 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -359,7 +359,6 @@ public:
 
   /* The SLP graph.  */
   auto_vec slp_instances;
-  auto_vec slp_loads;
 
   /* Maps base addresses to an innermost_loop_behavior that gives the maximum
  known alignment for that base.  */
-- 
2.26.2


[PATCH] tree-optimization/97238 - fix typo causing ICE

2020-09-29 Thread Richard Biener
This fixes a typo causing a NULL dereference.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2020-09-29  Richard Biener  

PR tree-optimization/97238
* tree-ssa-reassoc.c (ovce_extract_ops): Fix typo.

* gcc.dg/pr97238.c: New testcase.
---
 gcc/testsuite/gcc.dg/pr97238.c | 12 
 gcc/tree-ssa-reassoc.c |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr97238.c

diff --git a/gcc/testsuite/gcc.dg/pr97238.c b/gcc/testsuite/gcc.dg/pr97238.c
new file mode 100644
index 000..746e93a9750
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr97238.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O -Wno-psabi -w" } */
+
+typedef int __attribute__ ((__vector_size__ (8))) V;
+int b, c, e;
+V d;
+
+V
+foo (void)
+{
+  return (b || e) | c > d | ((b || e) | c > d);
+}
diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
index facc794cdcc..a2ca1713d4b 100644
--- a/gcc/tree-ssa-reassoc.c
+++ b/gcc/tree-ssa-reassoc.c
@@ -3910,7 +3910,7 @@ ovce_extract_ops (tree var, gassign **rets, bool *reti, 
tree *type,
 return ERROR_MARK;
 
   gassign *assign = dyn_cast (SSA_NAME_DEF_STMT (cond));
-  if (stmt == NULL
+  if (assign == NULL
   || TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) != tcc_comparison)
 return ERROR_MARK;
 
-- 
2.26.2


PING^3 [GCC 10] [PATCH] IRA: Don't make a global register eliminable

2020-09-29 Thread H.J. Lu via Gcc-patches
On Fri, Sep 25, 2020 at 6:46 AM H.J. Lu  wrote:
>
> On Tue, Sep 22, 2020 at 10:48 AM H.J. Lu  wrote:
> >
> > On Fri, Sep 18, 2020 at 10:21 AM H.J. Lu  wrote:
> > >
> > > On Thu, Sep 17, 2020 at 3:52 PM Jeff Law  wrote:
> > > >
> > > >
> > > > On 9/16/20 8:46 AM, Richard Sandiford wrote:
> > > >
> > > > "H.J. Lu"  writes:
> > > >
> > > > On Tue, Sep 15, 2020 at 7:44 AM Richard Sandiford
> > > >  wrote:
> > > >
> > > > Thanks for looking at this.
> > > >
> > > > "H.J. Lu"  writes:
> > > >
> > > > commit 1bcb4c4faa4bd6b1c917c75b100d618faf9e628c
> > > > Author: Richard Sandiford 
> > > > Date:   Wed Oct 2 07:37:10 2019 +
> > > >
> > > > [LRA] Don't make eliminable registers live (PR91957)
> > > >
> > > > didn't make eliminable registers live which breaks
> > > >
> > > > register void *cur_pro asm("reg");
> > > >
> > > > where "reg" is an eliminable register.  Make fixed eliminable registers
> > > > live to fix it.
> > > >
> > > > I don't think fixedness itself is the issue here: it's usual for at
> > > > least some registers involved in eliminations to be fixed registers.
> > > >
> > > > I think what makes this case different is instead that cur_pro/ebp
> > > > is a global register.  But IMO things have already gone wrong if we
> > > > think that a global register is eliminable.
> > > >
> > > > So I wonder if instead we should check global_regs at the beginning of:
> > > >
> > > >   for (i = 0; i < fp_reg_count; i++)
> > > > if (!TEST_HARD_REG_BIT (crtl->asm_clobbers,
> > > > HARD_FRAME_POINTER_REGNUM + i))
> > > >   {
> > > > SET_HARD_REG_BIT (eliminable_regset,
> > > >   HARD_FRAME_POINTER_REGNUM + i);
> > > > if (frame_pointer_needed)
> > > >   SET_HARD_REG_BIT (ira_no_alloc_regs,
> > > > HARD_FRAME_POINTER_REGNUM + i);
> > > >   }
> > > > else if (frame_pointer_needed)
> > > >   error ("%s cannot be used in % here",
> > > >  reg_names[HARD_FRAME_POINTER_REGNUM + i]);
> > > > else
> > > >   df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM + i, true);
> > > >
> > > > (ira_setup_eliminable_regset), and handle the global_regs[] case in
> > > > the same way as the else case, i.e. short-circuiting both of the ifs.
> > > >
> > > > Like this?
> > > >
> > > > Sorry for the delay.  I was testing this in parallel.
> > > >
> > > > Bootstrapped & regression-tested on x86_64-linux-gnu.
> > > >
> > > > Thanks,
> > > > Richard
> > > >
> > > >
> > > > 0001-ira-Fix-elimination-for-global-hard-FPs-PR91957.patch
> > > >
> > > > From af4499845d26fe65573b21197a79fd22fd38694e Mon Sep 17 00:00:00 2001
> > > > From: "H.J. Lu" 
> > > > Date: Tue, 15 Sep 2020 06:23:26 -0700
> > > > Subject: [PATCH] ira: Fix elimination for global hard FPs [PR91957]
> > > > MIME-Version: 1.0
> > > > Content-Type: text/plain; charset=UTF-8
> > > > Content-Transfer-Encoding: 8bit
> > > >
> > > > If the hard frame pointer is being used as a global register,
> > > > we should skip the usual handling for eliminations.  As the
> > > > comment says, the register cannot in that case be eliminated
> > > > (or eliminated to) and is already marked live where appropriate.
> > > >
> > > > Doing this removes the duplicate error for gcc.target/i386/pr82673.c.
> > > > The “cannot be used in 'asm' here” message is meant to be for asm
> > > > statements rather than register asms, and the function that the
> > > > error is reported against doesn't use asm.
> > > >
> > > > gcc/
> > > > 2020-09-16  Richard Sandiford  
> > > >
> > > > PR middle-end/91957
> > > > * ira.c (ira_setup_eliminable_regset): Skip the special elimination
> > > > handling of the hard frame pointer if the hard frame pointer is fixed.
> > > >
> > > > gcc/testsuite/
> > > > 2020-09-16  H.J. Lu  
> > > >Richard Sandiford  
> > > >
> > > > PR middle-end/91957
> > > > * g++.target/i386/pr97054.C: New test.
> > > > * gcc.target/i386/pr82673.c: Remove redundant extra message.
> > > >
> > > > OK
> > >
> > > OK for GCC 10 branch?
> > >
> > > Thanks.
> >
> > PING:
> >
> > https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554268.html
> >
>
> PING.
>

PING.

-- 
H.J.


Re: [PATCH] Add type arg to TARGET_LIBC_HAS_FUNCTION

2020-09-29 Thread Tom de Vries
On 9/29/20 8:59 AM, Richard Biener wrote:
> On Mon, Sep 28, 2020 at 7:28 PM Tom de Vries  wrote:
>>
>> [ was: Re: [Patch][nvptx] return true in libc_has_function for
>> function_sincos ]
>>
>> On 9/26/20 6:47 PM, Tobias Burnus wrote:
>>> Found when looking at PR97203 (but having no effect there).
>>>
>>> The GCC ME optimizes with -O1 (or higher) the
>>>   a = sinf(x)
>>>   b = cosf(x)
>>> to __builtin_cexpi(x, , )
>>> (...i as in internal; like cexp(z) but with with __real__ z == 0)
>>>
>>>
>>> In expand_builtin_cexpi, that is handles as:
>>>   if (optab_handler (sincos_optab, mode) != CODE_FOR_nothing)
>>> ...
>>>   else if (targetm.libc_has_function (function_sincos))
>>> ...
>>>   else
>>> fn = builtin_decl_explicit (BUILT_IN_CEXPF);
>>>
>>> And the latter is done. As newlib's cexpf does not know that
>>> __real__ z == 0, it calculates 'r = expf (__real__ z)' before
>>> invoking sinf and cosf on __imag__ z.
>>>
>>> Thus, it is much faster to call 'sincosf', which also exists
>>> in newlib.
>>>
>>> Solution: Return true for targetm.libc_has_function (function_sincos).
>>>
>>>
>>> NOTE: With -funsafe-math-optimizations (-O0 or higher),
>>> sinf/cosf and sincosf invoke .sin.approx/.cos/.approx instead of
>>> doing a library call.
>>
>> This version takes care to enable sincos and sincosf, but not sincosl.
>>
>> Target hook changes OK for trunk?
> 
> @@ -9770,7 +9770,7 @@ fold_builtin_sincos (location_t loc,
>  }
>if (!call)
>  {
> -  if (!targetm.libc_has_function (function_c99_math_complex)
> +  if (!targetm.libc_has_function (function_c99_math_complex, NULL_TREE)
> 
> why pass NULL_TREE and not 'type' here?
> 
>   || !builtin_decl_implicit_p (fn))
> return NULL_TREE;
> 

I was trying to do the minimal, sincos-only implementation.

> similar for the builtins.def change for the cases where math functions
> are affected?  I guess it's a bit awkward to make it work there, so OK.
> 
>  bool
> -darwin_libc_has_function (enum function_class fn_class)
> +darwin_libc_has_function (enum function_class fn_class, tree type)
>  {
> -  if (fn_class == function_sincos)
> +  if (type != NULL_TREE)
> +{
> +  switch (fn_class)
> +   {
> +   case function_sincos:
> + break;
> +   default:
> + /* Not implemented.  */
> + gcc_unreachable ();
> +   }
> +}
> 
> huh.  I think special-casing this just for sincos is a bit awkward,
> esp. ICEing for other queries with a type.  Specifically
> 
> -@deftypefn {Target Hook} bool TARGET_LIBC_HAS_FUNCTION (enum
> function_class @var{fn_class})
> +@deftypefn {Target Hook} bool TARGET_LIBC_HAS_FUNCTION (enum
> function_class @var{fn_class}, tree @var{type})
>  This hook determines whether a function from a class of functions
> -@var{fn_class} is present in the target C library.
> +@var{fn_class} is present in the target C library.  The @var{type} argument
> +can be used to distinguish between float, double and long double versions.
>  @end deftypefn
> 
> This doesn't mention we'll ICE for anything but sincos.  A sensible
> semantics would be that if TYPE is NULL the caller asks for support
> for all standard (float, double, long double) types while with TYPE
> non-NULL it can ask for a specific type including for example the
> new _FloatN, etc. types.
> 

Ack, updated accordingly and retested.

OK for trunk?

Thanks,
- Tom
[nvptx] Add type arg to TARGET_LIBC_HAS_FUNCTION

GCC has a target hook TARGET_LIBC_HAS_FUNCTION, which tells the compiler
which functions it can expect to be present in libc.

The default target hook does not include the sincos functions.

The nvptx port of newlib does include sincos and sincosf, but not sincosl.

The target hook TARGET_LIBC_HAS_FUNCTION does not distinguish between sincos,
sincosf and sincosl, so if we enable it for the sincos functions, then for
test.c:
...
long double x, a, b;
int main (void) {
  x = 0.5;
  a = sinl (x);
  b = cosl (x);
  printf ("a: %f\n", (double)a);
  printf ("b: %f\n", (double)b);
  return 0;
}
...
we introduce a regression:
...
$ gcc test.c -lm -O2
unresolved symbol sincosl
collect2: error: ld returned 1 exit status
...

Add a type argument to target hook TARGET_LIBC_HAS_FUNCTION, and use it
in nvptx_libc_has_function to enable sincos and sincosf, but not sincosl.

Build and reg-tested on x86_64-linux.

Build and tested on nvptx.

2020-09-28  Tobias Burnus  
	Tom de Vries  

	* builtins.c (expand_builtin_cexpi, fold_builtin_sincos): Update
	targetm.libc_has_function call.
	* builtins.def (DEF_C94_BUILTIN, DEF_C99_BUILTIN, DEF_C11_BUILTIN):
	(DEF_C2X_BUILTIN, DEF_C99_COMPL_BUILTIN, DEF_C99_C90RES_BUILTIN):
	Same.
	* config/darwin-protos.h (darwin_libc_has_function): Update prototype.
	* config/darwin.c (darwin_libc_has_function): Add arg.
	* config/linux-protos.h (linux_libc_has_function): Update prototype.
	* config/linux.c (linux_libc_has_function): Add arg.
	* config/i386/i386.c (ix86_libc_has_function): 

[PATCH][GCC 8] arm: Add support for Neoverse V1 CPU

2020-09-29 Thread Alex Coplan
Hello,

This patch backports the AArch32 support for Arm's Neoverse V1 CPU to
GCC 8.

Testing:
 * Bootstrapped and regtested on arm-none-linux-gnueabihf.

OK for GCC 8 branch?

Thanks,
Alex

---

gcc/ChangeLog:

* config/arm/arm-cpus.in (neoverse-v1): New.
* config/arm/arm-tables.opt: Regenerate.
* config/arm/arm-tune.md: Regenerate.
* doc/invoke.texi: Document AArch32 support for Neoverse V1.
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index ba194a80229..edfe5b378da 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -1576,6 +1576,18 @@ begin cpu cortex-a75.cortex-a55
  costs cortex_a73
 end cpu cortex-a75.cortex-a55
 
+
+# Armv8.4 A-profile Architecture Processors
+begin cpu neoverse-v1
+  cname neoversev1
+  tune for cortex-a57
+  tune flags LDSCHED
+  architecture armv8.4-a+fp16
+  option crypto add FP_ARMv8 CRYPTO
+  costs cortex_a57
+end cpu neoverse-v1
+
+
 # V8 M-profile implementations.
 begin cpu cortex-m23
  cname cortexm23
diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt
index 60e5065b398..36dba62003a 100644
--- a/gcc/config/arm/arm-tables.opt
+++ b/gcc/config/arm/arm-tables.opt
@@ -351,6 +351,9 @@ Enum(processor_type) String(cortex-a75) Value( 
TARGET_CPU_cortexa75)
 EnumValue
 Enum(processor_type) String(cortex-a75.cortex-a55) Value( 
TARGET_CPU_cortexa75cortexa55)
 
+EnumValue
+Enum(processor_type) String(neoverse-v1) Value( TARGET_CPU_neoversev1)
+
 EnumValue
 Enum(processor_type) String(cortex-m23) Value( TARGET_CPU_cortexm23)
 
diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md
index df43a1ccbb2..c972ce55576 100644
--- a/gcc/config/arm/arm-tune.md
+++ b/gcc/config/arm/arm-tune.md
@@ -57,6 +57,6 @@
cortexa73,exynosm1,xgene1,
cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,
cortexa73cortexa53,cortexa55,cortexa75,
-   cortexa75cortexa55,cortexm23,cortexm33,
-   cortexr52"
+   cortexa75cortexa55,neoversev1,cortexm23,
+   cortexm33,cortexr52"
(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index a46a9cb31f7..6b40362e412 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16334,8 +16334,8 @@ Permissible names are: @samp{arm2}, @samp{arm250},
 @samp{cortex-a9}, @samp{cortex-a12}, @samp{cortex-a15}, @samp{cortex-a17},
 @samp{cortex-a32}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55},
 @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75},
-@samp{cortex-r4}, @samp{cortex-r4f}, @samp{cortex-r5}, @samp{cortex-r7},
-@samp{cortex-r8}, @samp{cortex-r52},
+@samp{neoverse-v1}, @samp{cortex-r4}, @samp{cortex-r4f}, @samp{cortex-r5},
+@samp{cortex-r7}, @samp{cortex-r8}, @samp{cortex-r52},
 @samp{cortex-m33},
 @samp{cortex-m23},
 @samp{cortex-m7},


Re: [PATCH] pass: Run cleanup passes before SLP [PR96789]

2020-09-29 Thread Richard Biener via Gcc-patches
On Tue, Sep 29, 2020 at 1:30 PM Kewen.Lin  wrote:
>
> Hi,
>
> As the discussion in PR96789, we found that some scalar stmts
> which can be eliminated by some passes after SLP, but we still
> modeled their costs when trying to SLP, it could impact
> vectorizer's decision.  One typical case is the case in PR on
> target Power.
>
> As Richard suggested there, this patch is to introduce one pass
> called scalar_cleanup which has some secondary clean up passes,
> for now they are FRE and DSE.  It's only triggered when seeing
> one TODO flag called TODO_force_next_scalar_cleanup set, unlike
> normal TODO flags, the flag is kept in one global variable
> pending_TODOs and expects its downstream consumer to handle it.
>
> I verified that it can get x264's runtime performance back on
> Power, I also evaluated the compilation time for the SPEC2017
> int benchmarks build with one single job, this patch increase
> the compilation time by 0.74%.
>
> Bootstrapped/regtested on powerpc64le-linux-gnu P9.
>
> Is it ok for trunk?

diff --git a/gcc/tree-ssa-loop-ivcanon.c b/gcc/tree-ssa-loop-ivcanon.c
index 298ab215530..7016f993339 100644
--- a/gcc/tree-ssa-loop-ivcanon.c
+++ b/gcc/tree-ssa-loop-ivcanon.c
@@ -1605,6 +1605,14 @@ pass_complete_unroll::execute (function *fun)
 peeled_loops = BITMAP_ALLOC (NULL);
   unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size,
   true);
+
+  /* There are no loops after unrolling, we assume that it's not so costly
+ to do the scalar cleanup since here.  FIXME: Some heuristics can be
+ further added to guard the cost level, like nodes number total, all
+ the original loops should be with single exits, etc.  */
+  if (!current_loops->tree_root->inner)
+val |= TODO_force_next_scalar_cleanup;
+

so this is not the appropriate way to guard this.  Instead in

static unsigned int
tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
{

look where we do

  bitmap fathers = BITMAP_ALLOC (NULL);
  EXECUTE_IF_SET_IN_BITMAP (father_bbs, 0, i, bi)
{
  basic_block unrolled_loop_bb = BASIC_BLOCK_FOR_FN (cfun, i);
  if (! unrolled_loop_bb)
continue;
  if (loop_outer (unrolled_loop_bb->loop_father))
bitmap_set_bit (fathers,
unrolled_loop_bb->loop_father->num);

and in the else case return TODO_force_next_scalar_cleanup because
then we know we have unrolled an outermost loop and not ran VN
immediately on it.

+/* Scalar cleanup, it has several gated cleanup passes like FRE, DSE.  */
+
+namespace {
+
+const pass_data pass_data_scalar_cleanup =
+{
+  GIMPLE_PASS, /* type */
+  "*scalar_cleanup", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */

this new "pass" doesn't have to do anything with tree-ssa-loop-ivcanon.c
so please add it to passes.c instead (there's already a bunch of
pass definitions in there).

Can you repeat the compile-time measurement there?  I also wonder
whether we should worry about compile-time at -O[12] when SLP is not run.
Thus, probably rename the cleanup pass to pre_slp_scalar_cleanup and
gate it on && flag_slp_vectorize

Note this is probably the cleanest way to implement this hack.  But it
still is what it is - a hack.  Not a proper fix for whatever the actual issue is
which means I'm not the one that's going to ack it (since I've suggested it).

Thanks,
Richard.

> BR,
> Kewen
> ---
> gcc/ChangeLog:
>
> PR tree-optimization/96789
> * passes.c (execute_one_pass): Add support for
> TODO_force_next_scalar_cleanup.
> (pending_TODOs): Init.
> * passes.def (pass_scalar_cleanup): New pass, add pass_fre and
> pass_dse as its children.
> * timevar.def (TV_SCALAR_CLEANUP): New timevar.
> * tree-pass.h (TODO_force_next_scalar_cleanup): New TODO flag.
> (make_pass_scalar_cleanup): New function declare.
> (pending_TODOs): New variable declare.
> * tree-ssa-loop-ivcanon.c (pass_complete_unroll::execute): Set
> TODO_force_next_scalar_cleanup if there are no loops.
> (class pass_scalar_cleanup): New class.
> (pass_data_scalar_cleanup): New pass data.
> (make_pass_scalar_cleanup): New function.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/96789
> * gcc.dg/tree-ssa/ssa-dse-28.c: Adjust.
> * gcc.dg/tree-ssa/ssa-dse-29.c: Likewise.
> * gcc.dg/tree-ssa/pr96789.c: New test.


Re: [committed] libstdc++: Use __libc_single_threaded to optimise atomics [PR 96817]

2020-09-29 Thread Christophe Lyon via Gcc-patches
On Sat, 26 Sep 2020 at 21:42, Jonathan Wakely via Gcc-patches
 wrote:
>
> Glibc 2.32 adds a global variable that says whether the process is
> single-threaded. We can use this to decide whether to elide atomic
> operations, as a more precise and reliable indicator than
> __gthread_active_p.
>
> This means that guard variables for statics and reference counting in
> shared_ptr can use less expensive, non-atomic ops even in processes that
> are linked to libpthread, as long as no threads have been created yet.
> It also means that we switch to using atomics if libpthread gets loaded
> later via dlopen (this still isn't supported in general, for other
> reasons).
>
> We can't use __libc_single_threaded to replace __gthread_active_p
> everywhere. If we replaced the uses of __gthread_active_p in std::mutex
> then we would elide the pthread_mutex_lock in the code below, but not
> the pthread_mutex_unlock:
>
>   std::mutex m;
>   m.lock();// pthread_mutex_lock
>   std::thread t([]{}); // __libc_single_threaded = false
>   t.join();
>   m.unlock();  // pthread_mutex_unlock
>
> We need the lock and unlock to use the same "is threading enabled"
> predicate, and similarly for init/destroy pairs for mutexes and
> condition variables, so that we don't try to release resources that were
> never acquired.
>
> There are other places that could use __libc_single_threaded, such as
> _Sp_locker in src/c++11/shared_ptr.cc and locale init functions, but
> they can be changed later.
>
> libstdc++-v3/ChangeLog:
>
> PR libstdc++/96817
> * include/ext/atomicity.h (__gnu_cxx::__is_single_threaded()):
> New function wrapping __libc_single_threaded if available.
> (__exchange_and_add_dispatch, __atomic_add_dispatch): Use it.
> * libsupc++/guard.cc (__cxa_guard_acquire, __cxa_guard_abort)
> (__cxa_guard_release): Likewise.
> * testsuite/18_support/96817.cc: New test.
>
> Tested powerpc64le-linux, with glibc 2.31 and 2.32. Committed to trunk.

Hi,

This patch introduced regressions on armeb-linux-gnueabihf:
--target armeb-none-linux-gnueabihf --with-cpu cortex-a9
g++.dg/compat/init/init-ref2 cp_compat_x_tst.o-cp_compat_y_tst.o execute
g++.dg/cpp2a/decomp1.C  -std=gnu++14 execution test
g++.dg/cpp2a/decomp1.C  -std=gnu++17 execution test
g++.dg/cpp2a/decomp1.C  -std=gnu++2a execution test
g++.dg/init/init-ref2.C  -std=c++14 execution test
g++.dg/init/init-ref2.C  -std=c++17 execution test
g++.dg/init/init-ref2.C  -std=c++2a execution test
g++.dg/init/init-ref2.C  -std=c++98 execution test
g++.dg/init/ref15.C  -std=c++14 execution test
g++.dg/init/ref15.C  -std=c++17 execution test
g++.dg/init/ref15.C  -std=c++2a execution test
g++.dg/init/ref15.C  -std=c++98 execution test
g++.old-deja/g++.jason/pmf7.C  -std=c++98 execution test
g++.old-deja/g++.mike/leak1.C  -std=c++14 execution test
g++.old-deja/g++.mike/leak1.C  -std=c++17 execution test
g++.old-deja/g++.mike/leak1.C  -std=c++2a execution test
g++.old-deja/g++.mike/leak1.C  -std=c++98 execution test
g++.old-deja/g++.other/init19.C  -std=c++14 execution test
g++.old-deja/g++.other/init19.C  -std=c++17 execution test
g++.old-deja/g++.other/init19.C  -std=c++2a execution test
g++.old-deja/g++.other/init19.C  -std=c++98 execution test

and probably some (280) in libstdc++ tests: (I didn't bisect those):
19_diagnostics/error_category/generic_category.cc execution test
19_diagnostics/error_category/system_category.cc execution test
20_util/scoped_allocator/1.cc execution test
20_util/scoped_allocator/2.cc execution test
20_util/scoped_allocator/construct_pair_c++2a.cc execution test
20_util/to_address/debug.cc execution test
20_util/variant/run.cc execution test

Christophe


[PATCH] pass: Run cleanup passes before SLP [PR96789]

2020-09-29 Thread Kewen.Lin via Gcc-patches
Hi,

As the discussion in PR96789, we found that some scalar stmts
which can be eliminated by some passes after SLP, but we still
modeled their costs when trying to SLP, it could impact
vectorizer's decision.  One typical case is the case in PR on
target Power.

As Richard suggested there, this patch is to introduce one pass
called scalar_cleanup which has some secondary clean up passes,
for now they are FRE and DSE.  It's only triggered when seeing
one TODO flag called TODO_force_next_scalar_cleanup set, unlike
normal TODO flags, the flag is kept in one global variable
pending_TODOs and expects its downstream consumer to handle it.

I verified that it can get x264's runtime performance back on
Power, I also evaluated the compilation time for the SPEC2017
int benchmarks built with one single job; this patch increases
the compilation time by 0.74%.

Bootstrapped/regtested on powerpc64le-linux-gnu P9.

Is it ok for trunk?

BR,
Kewen
---
gcc/ChangeLog:

PR tree-optimization/96789
* passes.c (execute_one_pass): Add support for
TODO_force_next_scalar_cleanup.
(pending_TODOs): Init.
* passes.def (pass_scalar_cleanup): New pass, add pass_fre and
pass_dse as its children.
* timevar.def (TV_SCALAR_CLEANUP): New timevar.
* tree-pass.h (TODO_force_next_scalar_cleanup): New TODO flag.
(make_pass_scalar_cleanup): New function declare.
(pending_TODOs): New variable declare. 
* tree-ssa-loop-ivcanon.c (pass_complete_unroll::execute): Set
TODO_force_next_scalar_cleanup if there are no loops.
(class pass_scalar_cleanup): New class.
(pass_data_scalar_cleanup): New pass data.
(make_pass_scalar_cleanup): New function.

gcc/testsuite/ChangeLog:

PR tree-optimization/96789
* gcc.dg/tree-ssa/ssa-dse-28.c: Adjust.
* gcc.dg/tree-ssa/ssa-dse-29.c: Likewise.
* gcc.dg/tree-ssa/pr96789.c: New test.
diff --git a/gcc/passes.c b/gcc/passes.c
index 6ff31ec37d7..b0ab9f66557 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -71,6 +71,8 @@ using namespace gcc;
The variable current_pass is also used for statistics and plugins.  */
 opt_pass *current_pass;
 
+unsigned int pending_TODOs = 0;
+
 /* Most passes are single-instance (within their context) and thus don't
need to implement cloning, but passes that support multiple instances
*must* provide their own implementation of the clone method.
@@ -2538,6 +2540,12 @@ execute_one_pass (opt_pass *pass)
   return true;
 }
 
+  if (todo_after & TODO_force_next_scalar_cleanup)
+{
+  todo_after &= ~TODO_force_next_scalar_cleanup;
+  pending_TODOs |= TODO_force_next_scalar_cleanup;
+}
+
   do_per_function (clear_last_verified, NULL);
 
   do_per_function (update_properties_after_pass, pass);
diff --git a/gcc/passes.def b/gcc/passes.def
index f865bdc19ac..3d9cccb6df1 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -290,11 +290,16 @@ along with GCC; see the file COPYING3.  If not see
  /* pass_vectorize must immediately follow pass_if_conversion.
 Please do not add any other passes in between.  */
  NEXT_PASS (pass_vectorize);
-  PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
+ PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
  NEXT_PASS (pass_dce);
-  POP_INSERT_PASSES ()
-  NEXT_PASS (pass_predcom);
+ POP_INSERT_PASSES ()
+ NEXT_PASS (pass_predcom);
  NEXT_PASS (pass_complete_unroll);
+ NEXT_PASS (pass_scalar_cleanup);
+  PUSH_INSERT_PASSES_WITHIN (pass_scalar_cleanup)
+ NEXT_PASS (pass_fre, false /* may_iterate */);
+ NEXT_PASS (pass_dse);
+  POP_INSERT_PASSES ()
  NEXT_PASS (pass_slp_vectorize);
  NEXT_PASS (pass_loop_prefetch);
  /* Run IVOPTs after the last pass that uses data-reference analysis
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c
new file mode 100644
index 000..46425d83b02
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funroll-loops -fdump-tree-dse-details" } */
+
+/* Test if scalar cleanup pass takes effects, mainly check
+   its secondary pass DSE can remove dead stores on array
+   tmp.  */
+
+#include "stdint.h"
+
+static inline void
+foo (int16_t *diff, int i_size, uint8_t *val1, int i_val1, uint8_t *val2,
+ int i_val2)
+{
+  for (int y = 0; y < i_size; y++)
+{
+  for (int x = 0; x < i_size; x++)
+   diff[x + y * i_size] = val1[x] - val2[x];
+  val1 += i_val1;
+  val2 += i_val2;
+}
+}
+
+void
+bar (int16_t res[16], uint8_t *val1, uint8_t *val2)
+{
+  int16_t d[16];
+  int16_t tmp[16];
+
+  foo (d, 4, val1, 16, val2, 32);
+
+  for (int i = 0; i < 4; i++)
+{
+  int s03 = d[i * 4 + 0] + d[i * 4 + 3];
+  int s12 = d[i * 4 + 1] + d[i * 4 + 2];
+  int d03 = d[i 

Re: [PATCH] c++: Diagnose visitors with different return types for std::visit [PR95904]

2020-09-29 Thread Jonathan Wakely via Gcc-patches

On 29/09/20 01:12 +0300, Ville Voutilainen via Libstdc++ wrote:

Not completely tested yet. This does fix the problem of converting
incompatible pointer-to-function types, and thus gets rid of the suggestion
that compiling the code with -fpermissive is a possibility. There
is a special-casing for visit() for visitation of a single variant, and there
we don't even instantiate the whole table mechanism. We should really
entertain the possibility of flattening the whole visitation table; then
this check could (at least in theory) be uniformly written as just
an iteration of all table elements, which is not so convenient to do
with the nested multitable. This seems like a worthy incremental
improvement to me.

2020-09-29  Ville Voutilainen  

   PR libstdc++/95904
   * include/std/variant (__same_types): New.
   (__check_visitor_result): Likewise.
   (__check_visitor_results): Likewise.
   (visit(_Visitor&&, _Variants&&...)): Use __check_visitor_results
   in case we're visiting just one variant.
   (__gen_vtable_impl::_S_apply):
   Check the visitor return type.



diff --git a/libstdc++-v3/include/std/variant b/libstdc++-v3/include/std/variant
index dd8847cf829..56de78407c4 100644
--- a/libstdc++-v3/include/std/variant
+++ b/libstdc++-v3/include/std/variant
@@ -182,7 +182,7 @@ namespace __variant
  // used for raw visitation with indices passed in
  struct __variant_idx_cookie { using type = __variant_idx_cookie; };
  // Used to enable deduction (and same-type checking) for std::visit:
-  template struct __deduce_visit_result { };
+  template struct __deduce_visit_result { using type = _Tp; };

  // Visit variants that might be valueless.
  template
@@ -1017,7 +1017,22 @@ namespace __variant

  static constexpr auto
  _S_apply()
-  { return _Array_type{&__visit_invoke}; }
+  {
+   constexpr bool __visit_ret_type_mismatch =
+ _Array_type::__result_is_deduced::value
+ && !is_same_v(),
+   std::declval<_Variants>()...))>;
+   if constexpr (__visit_ret_type_mismatch)
+ {
+   static_assert(!__visit_ret_type_mismatch,
+ "std::visit requires the visitor to have the same "
+ "return type for all alternatives of a variant");
+   return __nonesuch{};
+ }
+   else
+ return _Array_type{&__visit_invoke};
+  }
};

  template
@@ -1692,6 +1707,27 @@ namespace __variant
   std::forward<_Variants>(__variants)...);
}

+  template 
+struct __same_types : public std::bool_constant<
+std::__and_...>::value> {};


This would be cheaper:

  template
using __same_types = typename __and_...>::type;

Although didn't we make changes in std::variant to stop using __and_
because it exceeds the template instantiation depth for large
variants?

I think this is what we want:

  template
constexpr inline __same_types = (is_same_v<_Tp, _Types> && ...);

is_same_v is very cheap, it uses the built-in directly, so you don't
need to instantiate any class templates at all.


+
+  template 


typename not class please.


+decltype(auto) __check_visitor_result(_Visitor&& __vis,


New line after the decltype(auto) please, not in the middle of the
parameter list.

I'll keep staring at this to review the actual content rather than the
window dressing that I've commented on above.



Re: [PATCH 1/5] Don't enable -gvariable-location-views by default for DWARF5.

2020-09-29 Thread Mark Wielaard
Hi,

On Mon, 2020-08-24 at 19:38 +0200, Jakub Jelinek wrote:
> On Mon, Aug 24, 2020 at 02:56:54PM +0200, Mark Wielaard wrote:
> > DWARF5 makes it possible to read loclists tables without consulting
> > the debuginfo tree by introducing a table header. Adding location
> > views
> > breaks this (at least for binutils and elfutils). So don't enable
> > variable-location-views by default if DWARF5 or higher is selected.
> 
> This should be discussed with Alex, CCed.
> I'd say elfutils/binutils should just show .debug_loclists
> independent of
> .debug_info if there are no loc views and use .debug_info otherwise.

So it turned out that these were really bugs in elfutils and binutils.
For elfutils it now tracks locviews in .debug_loclists just like it did
for .debug_loc:
https://sourceware.org/pipermail/elfutils-devel/2020q3/002900.html

For binutils it actually tracked locviews correctly, but didn't handle
DW_LLE_start_end and DW_LLE_start_length. Patch submitted:
https://sourceware.org/pipermail/binutils/2020-September/113510.html

For tools that access the location lists (and locviews) through the DIE
attributes this never was an issue.

Patch retracted.

Cheers,

Mark


Re: [PATCH v2 6/16]middle-end Add Complex Addition with rotation detection

2020-09-29 Thread Richard Biener
On Tue, 29 Sep 2020, Richard Sandiford wrote:

> Tamar Christina  writes:
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > index 
> > 2b46286943778e16d95b15def4299bcbf8db7eb8..71e226505b2619d10982b59a4ebbed73a70f29be
> >  100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -6132,6 +6132,17 @@ floating-point mode.
> >  
> >  This pattern is not allowed to @code{FAIL}.
> >  
> > +@cindex @code{cadd@var{m}@var{n}3} instruction pattern
> > +@item @samp{cadd@var{m}@var{n}3}
> > +Perform a vector addition of complex numbers in operand 1 with operand 2
> > +rotated by @var{m} degrees around the argand plane and storing the result 
> > in
> > +operand 0.  The instruction must perform the operation on data loaded
> > +contiguously into the vectors.
> 
> Nitpicking, sorry, but I think it would be better to describe the
> layout directly rather than in terms of loads, since the preceding
> operation might not be a load.

So if we're at that and since GCC vectors do not have complex
components can we formulate this in terms avoiding 'complex'?
Isn't this an add of one vector to a vector with adjacent
lanes swapped and possibly negated?  Mentioning that this would
match a complex add in case lanes happen to match up with
complex real/imag parts is OK but the pattern should work
equally well if there's no complex numbers involved?

> I guess the main question is: what representation do we expect for
> big-endian?  A normal Advanced SIMD LDR would give this (for floats):
> 
>  MEMORY
>+-+-+-+-+
>| r0  | i0  | r1  | i1  |
>+-+-+-+-+
>|  0  |  1  |  2  |  3  |   array numbering
>+-+-+-+-+
>   V V V V  Advanced SIMD LDR
>+-+-+-+-+
>| r0  | i0  | r1  | i1  |
>+-+-+-+-+
>|  0  |  1  |  2  |  3  |   GCC lane numbering
>+-+-+-+-+
>|  3  |  2  |  1  |  0  |   Arm lane numbering
>+-+-+-+-+
>   MSB   REGISTER  LSB
> 
> but the FC* instructions put the imaginary parts in the more
> significant lane, so the pairs of elements above would need
> to be reversed:
> 
>  MEMORY
>+-+-+-+-+
>| r0  | i0  | r1  | i1  |
>+-+-+-+-+
>|  0  |  1  |  2  |  3  |   array numbering
>+-+-+-+-+
>\   /   \   /
> \ / \ /
>  X   X Load and permute
> / \ / \
>/   \   /   \
>+-+-+-+-+
>| i0  | r0  | i1  | r1  |
>+-+-+-+-+
>|  0  |  1  |  2  |  3  |   GCC lane numbering
>+-+-+-+-+
>|  3  |  2  |  1  |  0  |   Arm lane numbering
>+-+-+-+-+
>   MSB   REGISTER  LSB
> 
> (Or the whole vector could be reversed.)
> 
> We might decide that it just isn't worth doing this for Advanced SIMD.
> But should the semantics of the optab be that:
> 
> (1) GCC lane number 0 holds a real part, or
> (2) the least significant lane holds a real part?
> 
> With (1), it would be up to the target to hide the permute above.
> With (2), the vectoriser would need to introduce the permute itself.
> 
> I'm not sure there's a perfect answer even for Arm targets.  (2) matches
> the Advanced SIMD semantics.  But for SVE, the register layout follows
> LD1 rather than LDR, and the GCC and architectural lane numbering match up.
> (1) would therefore be better than (2) for SVE (and so no permute would be
> needed for either endianness on SVE).
> 
> > +The operation is only supported for vector modes @var{n} and with
> > +rotations @var{m} of 90 or 270.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> >  @cindex @code{ffs@var{m}2} instruction pattern
> >  @item @samp{ffs@var{m}2}
> >  Store into operand 0 one plus the index of the least significant 1-bit
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index 
> > 13e60828fcf5db6c5f15aae2bacd4cf04029e430..956a65a338c157b51de7e78a3fb005b5af78ef31
> >  100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -275,6 +275,8 @@ DEF_INTERNAL_FLT_FN (SCALB, ECF_CONST, scalb, binary)
> >  DEF_INTERNAL_FLT_FLOATN_FN (FMIN, ECF_CONST, fmin, binary)
> >  DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
> >  DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
> >  
> >  /* FP scales.  */
> >  DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
> > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > index 
> > 78409aa14537d259bf90277751aac00d452a0d3f..2bb0bf857977035bf562a77f5f6848e80edf936d
> >  100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -290,6 +290,8 @@ OPTAB_D (atan_optab, "atan$a2")
> >  OPTAB_D (atanh_optab, "atanh$a2")
> >  OPTAB_D (copysign_optab, 

RE: Ping: [PATCH] arm: Add new vector mode macros

2020-09-29 Thread Kyrylo Tkachov



> -Original Message-
> From: Richard Sandiford 
> Sent: 29 September 2020 11:27
> To: Kyrylo Tkachov 
> Cc: gcc-patches@gcc.gnu.org; ni...@redhat.com; Richard Earnshaw
> ; Ramana Radhakrishnan
> ; Dennis Zhang
> 
> Subject: Ping: [PATCH] arm: Add new vector mode macros
> 
> Ping
> 
> Richard Sandiford  writes:
> > Kyrylo Tkachov  writes:
> >> This looks like a productive way forward to me.
> >> Okay if the other maintainers don't object by the end of the week.
> >
> > Thanks.  Dennis pointed out off-list that it regressed
> > armv8_2-fp16-arith-2.c, which was expecting FP16 vectorisation
> > to be rejected for -fno-fast-math.  As mentioned above, that shouldn't
> > be necessary given that FP16 arithmetic (unlike FP32 arithmetic) has a
> > flush-to-zero control.
> >
> > This version therefore updates the test to expect the same output
> > as the -ffast-math version.
> >
> > Tested on arm-linux-gnueabi (hopefully for real this time -- I must
> > have messed up the testing last time).  OK for trunk?
> >

Ok.
Thanks,
Kyrill

> > FWIW, the non-testsuite part is the same as before.
> >
> > Richard
> >
> >
> > gcc/
> > * config/arm/arm.h (ARM_HAVE_NEON_V8QI_ARITH,
> ARM_HAVE_NEON_V4HI_ARITH)
> > (ARM_HAVE_NEON_V2SI_ARITH, ARM_HAVE_NEON_V16QI_ARITH):
> New macros.
> > (ARM_HAVE_NEON_V8HI_ARITH, ARM_HAVE_NEON_V4SI_ARITH):
> Likewise.
> > (ARM_HAVE_NEON_V2DI_ARITH, ARM_HAVE_NEON_V4HF_ARITH):
> Likewise.
> > (ARM_HAVE_NEON_V8HF_ARITH, ARM_HAVE_NEON_V2SF_ARITH):
> Likewise.
> > (ARM_HAVE_NEON_V4SF_ARITH, ARM_HAVE_V8QI_ARITH,
> ARM_HAVE_V4HI_ARITH)
> > (ARM_HAVE_V2SI_ARITH, ARM_HAVE_V16QI_ARITH,
> ARM_HAVE_V8HI_ARITH)
> > (ARM_HAVE_V4SI_ARITH, ARM_HAVE_V2DI_ARITH,
> ARM_HAVE_V4HF_ARITH)
> > (ARM_HAVE_V2SF_ARITH, ARM_HAVE_V8HF_ARITH,
> ARM_HAVE_V4SF_ARITH):
> > Likewise.
> > * config/arm/iterators.md (VNIM, VNINOTM): Delete.
> > * config/arm/vec-common.md (add3, addv8hf3)
> > (add3): Replace with...
> > (add3): ...this new expander.
> > * config/arm/neon.md (*add3_neon): Use the new
> > ARM_HAVE_NEON__ARITH macros as the C condition.
> > (addv8hf3_neon, addv4hf3, add3_fp16): Delete in
> > favor of the above.
> > (neon_vadd): Use gen_add3 instead of
> > gen_add3_fp16.
> >
> > gcc/testsuite/
> > * gcc.target/arm/armv8_2-fp16-arith-2.c: Expect FP16 vectorization
> > even without -ffast-math.
> > ---
> >  gcc/config/arm/arm.h  | 41 
> >  gcc/config/arm/iterators.md   |  8 
> >  gcc/config/arm/neon.md| 47 +--
> >  gcc/config/arm/vec-common.md  | 42 ++---
> >  .../gcc.target/arm/armv8_2-fp16-arith-2.c | 20 +---
> >  5 files changed, 61 insertions(+), 97 deletions(-)
> >
> > diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
> > index f4d3676c5bc..4a63d33c70d 100644
> > --- a/gcc/config/arm/arm.h
> > +++ b/gcc/config/arm/arm.h
> > @@ -1110,6 +1110,47 @@ extern const int arm_arch_cde_coproc_bits[];
> >  #define VALID_MVE_STRUCT_MODE(MODE) \
> >((MODE) == TImode || (MODE) == OImode || (MODE) == XImode)
> >
> > +/* The conditions under which vector modes are supported for general
> > +   arithmetic using Neon.  */
> > +
> > +#define ARM_HAVE_NEON_V8QI_ARITH TARGET_NEON
> > +#define ARM_HAVE_NEON_V4HI_ARITH TARGET_NEON
> > +#define ARM_HAVE_NEON_V2SI_ARITH TARGET_NEON
> > +
> > +#define ARM_HAVE_NEON_V16QI_ARITH TARGET_NEON
> > +#define ARM_HAVE_NEON_V8HI_ARITH TARGET_NEON
> > +#define ARM_HAVE_NEON_V4SI_ARITH TARGET_NEON
> > +#define ARM_HAVE_NEON_V2DI_ARITH TARGET_NEON
> > +
> > +/* HF operations have their own flush-to-zero control (FPSCR.FZ16).  */
> > +#define ARM_HAVE_NEON_V4HF_ARITH TARGET_NEON_FP16INST
> > +#define ARM_HAVE_NEON_V8HF_ARITH TARGET_NEON_FP16INST
> > +
> > +/* SF operations always flush to zero, regardless of FPSCR.FZ, so we can
> > +   only use them for general arithmetic when -funsafe-math-optimizations
> > +   is in effect.  */
> > +#define ARM_HAVE_NEON_V2SF_ARITH \
> > +  (TARGET_NEON && flag_unsafe_math_optimizations)
> > +#define ARM_HAVE_NEON_V4SF_ARITH ARM_HAVE_NEON_V2SF_ARITH
> > +
> > +/* The conditions under which vector modes are supported for general
> > +   arithmetic by any vector extension.  */
> > +
> > +#define ARM_HAVE_V8QI_ARITH (ARM_HAVE_NEON_V8QI_ARITH ||
> TARGET_REALLY_IWMMXT)
> > +#define ARM_HAVE_V4HI_ARITH (ARM_HAVE_NEON_V4HI_ARITH ||
> TARGET_REALLY_IWMMXT)
> > +#define ARM_HAVE_V2SI_ARITH (ARM_HAVE_NEON_V2SI_ARITH ||
> TARGET_REALLY_IWMMXT)
> > +
> > +#define ARM_HAVE_V16QI_ARITH (ARM_HAVE_NEON_V16QI_ARITH ||
> TARGET_HAVE_MVE)
> > +#define ARM_HAVE_V8HI_ARITH (ARM_HAVE_NEON_V8HI_ARITH ||
> TARGET_HAVE_MVE)
> > +#define ARM_HAVE_V4SI_ARITH (ARM_HAVE_NEON_V4SI_ARITH ||
> TARGET_HAVE_MVE)
> > +#define ARM_HAVE_V2DI_ARITH ARM_HAVE_NEON_V2DI_ARITH
> > +
> > +#define ARM_HAVE_V4HF_ARITH ARM_HAVE_NEON_V4HF_ARITH
> > +#define ARM_HAVE_V2SF_ARITH 

Re: [PATCH] aarch64: Add extend-as-extract-with-shift pattern [PR96998]

2020-09-29 Thread Alex Coplan
Hi Segher,

Gentle ping.

Is the combine change (a canonicalization fix, as described below) OK
for trunk in light of this info?

On 22/09/2020 17:08, Richard Sandiford wrote:
> Segher Boessenkool  writes:
> > Hi Alex,
> >
> > On Tue, Sep 22, 2020 at 08:40:07AM +0100, Alex Coplan wrote:
> >> On 21/09/2020 18:35, Segher Boessenkool wrote:
> >> Thanks for doing this testing. The results look good, then: no code size
> >> changes and no build regressions.
> >
> > No *code* changes.  I cannot test aarch64 likme this.
> >
> >> > So, there is no difference for most targets (I checked some targets and
> >> > there really is no difference).  The only exception is aarch64 (which
> >> > the kernel calls "arm64"): the unpatched compiler ICEs!  (At least three
> >> > times, even).
> >> 
> >> Indeed, this is the intended purpose of the patch, see the PR (96998).
> >
> > You want to fix a ICE in LRA caused by an instruction created by LRA,
> > with a patch to combine?!  That doesn't sound right.
> >
> > If what you want to do is a) fix the backend bug, and then b) get some
> > extra performance, then do *that*, and keep the patches separate.
> 
> This patch isn't supposed to be a performance optimisation.  It's supposed
> to be a canonicalisation improvement.
> 
> The situation as things stand is that aarch64 has a bug: it accepts
> an odd sign_extract representation of addresses, but doesn't accept
> that same odd form of address as an LEA.  We have two options:
> 
> (a) add back instructions that recognise the odd form of LEA, or
> (b) remove the code that accepts the odd addresses
> 
> I think (b) is the way to go here.  But doing that on its own
> would regress code quality.  The reason we recognised the odd
> addresses in the first place was because that was the rtl that
> combine happened to generate for an important case.
> 
> Normal operating procedure is to add patterns and address-matching
> code that accepts whatever combine happens to throw at the target,
> regardless of how sensible the representation is.  But sometimes I think
> we should instead think about whether the representation that combine is
> using is the right one.  And IMO this is one such case.
> 
> At the moment combine does this:
> 
> Trying 8 -> 9:
> 8: r98:DI=sign_extend(r92:SI)
>   REG_DEAD r92:SI
> 9: [r98:DI*0x4+r96:DI]=asm_operands
>   REG_DEAD r98:DI
> Failed to match this instruction:
> (set (mem:SI (plus:DI (sign_extract:DI (mult:DI (subreg:DI (reg/v:SI 92 [ g 
> ]) 0)
> (const_int 4 [0x4]))
> (const_int 34 [0x22])
> (const_int 0 [0]))
> (reg/f:DI 96)) [3 *i_5+0 S4 A32])
> (asm_operands:SI ("") ("=Q") 0 []
>  []
>  [] /tmp/foo.c:13))
> allowing combination of insns 8 and 9
> original costs 4 + 4 = 8
> replacement cost 4
> 
> and so that's one of the forms that the aarch64 address code accepts.
> But a natural substitution would simply replace r98 with the rhs of
> the set:
> 
>   (set (mem:SI (plus:DI (mult:DI (sign_extend:DI (reg/v:SI 92))
>  (const_int 4))
> (reg:DI 96)))
>...)
> 
> The only reason we don't do that is because the substitution
> and simplification go through the expand_compound_operation/
> make_compound_operation process.
> 
> The corresponding (ashift ... (const_int 2)) *does* end up using
> the natural sign_extend form rather than sign_extract form.
> The only reason we get this (IMO) weird behaviour for mult is
> the rule that shifts have to be mults in addresses.  Some code
> (like the code being patched) instead expects ashift to be the
> canonical form in all situations.
> 
> If we make the substitution work “naturally” for mult as well as
> ashift, we can remove the addressing-matching code that has no
> corresponding LEA pattern, and make the aarch64 address code
> self-consistent that way instead.
> 
> Thanks,
> Richard

Thanks,
Alex


Ping: [PATCH] arm: Add new vector mode macros

2020-09-29 Thread Richard Sandiford
Ping

Richard Sandiford  writes:
> Kyrylo Tkachov  writes:
>> This looks like a productive way forward to me.
>> Okay if the other maintainers don't object by the end of the week.
>
> Thanks.  Dennis pointed out off-list that it regressed
> armv8_2-fp16-arith-2.c, which was expecting FP16 vectorisation
> to be rejected for -fno-fast-math.  As mentioned above, that shouldn't
> be necessary given that FP16 arithmetic (unlike FP32 arithmetic) has a
> flush-to-zero control.
>
> This version therefore updates the test to expect the same output
> as the -ffast-math version.
>
> Tested on arm-linux-gnueabi (hopefully for real this time -- I must
> have messed up the testing last time).  OK for trunk?
>
> FWIW, the non-testsuite part is the same as before.
>
> Richard
>
>
> gcc/
>   * config/arm/arm.h (ARM_HAVE_NEON_V8QI_ARITH, ARM_HAVE_NEON_V4HI_ARITH)
>   (ARM_HAVE_NEON_V2SI_ARITH, ARM_HAVE_NEON_V16QI_ARITH): New macros.
>   (ARM_HAVE_NEON_V8HI_ARITH, ARM_HAVE_NEON_V4SI_ARITH): Likewise.
>   (ARM_HAVE_NEON_V2DI_ARITH, ARM_HAVE_NEON_V4HF_ARITH): Likewise.
>   (ARM_HAVE_NEON_V8HF_ARITH, ARM_HAVE_NEON_V2SF_ARITH): Likewise.
>   (ARM_HAVE_NEON_V4SF_ARITH, ARM_HAVE_V8QI_ARITH, ARM_HAVE_V4HI_ARITH)
>   (ARM_HAVE_V2SI_ARITH, ARM_HAVE_V16QI_ARITH, ARM_HAVE_V8HI_ARITH)
>   (ARM_HAVE_V4SI_ARITH, ARM_HAVE_V2DI_ARITH, ARM_HAVE_V4HF_ARITH)
>   (ARM_HAVE_V2SF_ARITH, ARM_HAVE_V8HF_ARITH, ARM_HAVE_V4SF_ARITH):
>   Likewise.
>   * config/arm/iterators.md (VNIM, VNINOTM): Delete.
>   * config/arm/vec-common.md (add3, addv8hf3)
>   (add3): Replace with...
>   (add3): ...this new expander.
>   * config/arm/neon.md (*add3_neon): Use the new
>   ARM_HAVE_NEON__ARITH macros as the C condition.
>   (addv8hf3_neon, addv4hf3, add3_fp16): Delete in
>   favor of the above.
>   (neon_vadd): Use gen_add3 instead of
>   gen_add3_fp16.
>
> gcc/testsuite/
>   * gcc.target/arm/armv8_2-fp16-arith-2.c: Expect FP16 vectorization
>   even without -ffast-math.
> ---
>  gcc/config/arm/arm.h  | 41 
>  gcc/config/arm/iterators.md   |  8 
>  gcc/config/arm/neon.md| 47 +--
>  gcc/config/arm/vec-common.md  | 42 ++---
>  .../gcc.target/arm/armv8_2-fp16-arith-2.c | 20 +---
>  5 files changed, 61 insertions(+), 97 deletions(-)
>
> diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
> index f4d3676c5bc..4a63d33c70d 100644
> --- a/gcc/config/arm/arm.h
> +++ b/gcc/config/arm/arm.h
> @@ -1110,6 +1110,47 @@ extern const int arm_arch_cde_coproc_bits[];
>  #define VALID_MVE_STRUCT_MODE(MODE) \
>((MODE) == TImode || (MODE) == OImode || (MODE) == XImode)
>  
> +/* The conditions under which vector modes are supported for general
> +   arithmetic using Neon.  */
> +
> +#define ARM_HAVE_NEON_V8QI_ARITH TARGET_NEON
> +#define ARM_HAVE_NEON_V4HI_ARITH TARGET_NEON
> +#define ARM_HAVE_NEON_V2SI_ARITH TARGET_NEON
> +
> +#define ARM_HAVE_NEON_V16QI_ARITH TARGET_NEON
> +#define ARM_HAVE_NEON_V8HI_ARITH TARGET_NEON
> +#define ARM_HAVE_NEON_V4SI_ARITH TARGET_NEON
> +#define ARM_HAVE_NEON_V2DI_ARITH TARGET_NEON
> +
> +/* HF operations have their own flush-to-zero control (FPSCR.FZ16).  */
> +#define ARM_HAVE_NEON_V4HF_ARITH TARGET_NEON_FP16INST
> +#define ARM_HAVE_NEON_V8HF_ARITH TARGET_NEON_FP16INST
> +
> +/* SF operations always flush to zero, regardless of FPSCR.FZ, so we can
> +   only use them for general arithmetic when -funsafe-math-optimizations
> +   is in effect.  */
> +#define ARM_HAVE_NEON_V2SF_ARITH \
> +  (TARGET_NEON && flag_unsafe_math_optimizations)
> +#define ARM_HAVE_NEON_V4SF_ARITH ARM_HAVE_NEON_V2SF_ARITH
> +
> +/* The conditions under which vector modes are supported for general
> +   arithmetic by any vector extension.  */
> +
> +#define ARM_HAVE_V8QI_ARITH (ARM_HAVE_NEON_V8QI_ARITH || 
> TARGET_REALLY_IWMMXT)
> +#define ARM_HAVE_V4HI_ARITH (ARM_HAVE_NEON_V4HI_ARITH || 
> TARGET_REALLY_IWMMXT)
> +#define ARM_HAVE_V2SI_ARITH (ARM_HAVE_NEON_V2SI_ARITH || 
> TARGET_REALLY_IWMMXT)
> +
> +#define ARM_HAVE_V16QI_ARITH (ARM_HAVE_NEON_V16QI_ARITH || TARGET_HAVE_MVE)
> +#define ARM_HAVE_V8HI_ARITH (ARM_HAVE_NEON_V8HI_ARITH || TARGET_HAVE_MVE)
> +#define ARM_HAVE_V4SI_ARITH (ARM_HAVE_NEON_V4SI_ARITH || TARGET_HAVE_MVE)
> +#define ARM_HAVE_V2DI_ARITH ARM_HAVE_NEON_V2DI_ARITH
> +
> +#define ARM_HAVE_V4HF_ARITH ARM_HAVE_NEON_V4HF_ARITH
> +#define ARM_HAVE_V2SF_ARITH ARM_HAVE_NEON_V2SF_ARITH
> +
> +#define ARM_HAVE_V8HF_ARITH (ARM_HAVE_NEON_V8HF_ARITH || 
> TARGET_HAVE_MVE_FLOAT)
> +#define ARM_HAVE_V4SF_ARITH (ARM_HAVE_NEON_V4SF_ARITH || 
> TARGET_HAVE_MVE_FLOAT)
> +
>  /* The register numbers in sequence, for passing to arm_gen_load_multiple.  
> */
>  extern int arm_regs_in_sequence[];
>  
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 0bc9eba0722..c70e3bc2731 100644
> --- a/gcc/config/arm/iterators.md
> 

[rs6000] Avoid useless masking of count operand for rotation

2020-09-29 Thread Eric Botcazou
Hi,

the Interfaces package of the Ada library defines a pair of rotation operators

   function Rotate_Left  (Value : Unsigned_n; Amount : Natural)
  return Unsigned_n;
   function Rotate_Right (Value : Unsigned_n; Amount : Natural)
  return Unsigned_n;

on modular (aka unsigned) types of n bits.  The translation in GENERIC for the 
Rotate_Left on a 32-bit modular type is:

  Value r<< ((UNSIGNED_32) Amount & 31)

and the masking is present all the way down to the assembly at -O2:

rlwinm 4,4,0,27,31
rotlw 3,3,4

Now this masking is redundant since it's done by the hardware so it would be 
nice to get rid of it.  I have attached a couple of patches to that effect: 
the first one adds new instructions while the second one only adds splitters.

Tested on PowerPC64/Linux, OK (which one) for the mainline?


2020-09-29  Eric Botcazou  

* config/rs6000/rs6000.md (*rotl3_mask): New.
(*rotlsi3_mask_64): Likewise.
(*rotl3_dot): Change to use P mode iterator.
(*rotl3_mask_dot): New.
(*rotl3_dot2): Change to use P mode iterator.
(*rotl3_mask_dot2): New.


2020-09-29  Eric Botcazou  

* gnat.dg/rotate1.adb: New test.

-- 
Eric Botcazoudiff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 694ff70635e..b6c185b80b7 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -4306,6 +4306,18 @@
   [(set_attr "type" "shift")
(set_attr "maybe_var_shift" "yes")])
 
+;; Avoid useless masking of count operand
+(define_insn "*rotl3_mask"
+  [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
+	(rotate:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
+		(and:GPR (match_operand:GPR 2 "gpc_reg_operand" "r")
+			 (match_operand:GPR 3 "const_int_operand" "n"]
+  "(UINTVAL (operands[3]) & (GET_MODE_BITSIZE (mode) - 1))
+   == (unsigned HOST_WIDE_INT) (GET_MODE_BITSIZE (mode) - 1)"
+  "rotl%I2 %0,%1,%2"
+  [(set_attr "type" "shift")
+   (set_attr "var_shift" "yes")])
+
 (define_insn "*rotlsi3_64"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
 	(zero_extend:DI
@@ -4316,20 +4328,34 @@
   [(set_attr "type" "shift")
(set_attr "maybe_var_shift" "yes")])
 
+;; Avoid useless masking of count operand
+(define_insn "*rotlsi3_mask_64"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(zero_extend:DI
+	(rotate:SI (match_operand:SI 1 "gpc_reg_operand" "r")
+		   (and:SI (match_operand:SI 2 "gpc_reg_operand" "r")
+			   (match_operand:SI 3 "const_int_operand" "n")]
+  "TARGET_POWERPC64
+   && (UINTVAL (operands[3]) & (GET_MODE_BITSIZE (SImode) - 1))
+  == (unsigned HOST_WIDE_INT) (GET_MODE_BITSIZE (SImode) - 1)"
+  "rotlw%I2 %0,%1,%h2"
+  [(set_attr "type" "shift")
+   (set_attr "var_shift" "yes")])
+
 (define_insn_and_split "*rotl3_dot"
   [(set (match_operand:CC 3 "cc_reg_operand" "=x,?y")
-	(compare:CC (rotate:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r")
-(match_operand:SI 2 "reg_or_cint_operand" "rn,rn"))
+	(compare:CC (rotate:P (match_operand:P 1 "gpc_reg_operand" "r,r")
+			  (match_operand:SI 2 "reg_or_cint_operand" "rn,rn"))
 		(const_int 0)))
-   (clobber (match_scratch:GPR 0 "=r,r"))]
-  "mode == Pmode"
+   (clobber (match_scratch:P 0 "=r,r"))]
+  ""
   "@
rotl%I2. %0,%1,%2
#"
-  "&& reload_completed && cc_reg_not_cr0_operand (operands[3], CCmode)"
+  "reload_completed && cc_reg_not_cr0_operand (operands[3], CCmode)"
   [(set (match_dup 0)
-	(rotate:GPR (match_dup 1)
-		(match_dup 2)))
+	(rotate:P (match_dup 1)
+		  (match_dup 2)))
(set (match_dup 3)
 	(compare:CC (match_dup 0)
 		(const_int 0)))]
@@ -4339,22 +4365,49 @@
(set_attr "dot" "yes")
(set_attr "length" "4,8")])
 
+;; Avoid useless masking of count operand
+(define_insn_and_split "*rotl3_mask_dot"
+  [(set (match_operand:CC 4 "cc_reg_operand" "=x,?y")
+	(compare:CC (rotate:P (match_operand:P 1 "gpc_reg_operand" "r,r")
+			  (and:P (match_operand:P 2 "gpc_reg_operand" "r,r")
+ (match_operand:P 3 "const_int_operand" "n,n")))
+		(const_int 0)))
+   (clobber (match_scratch:P 0 "=r,r"))]
+  "(UINTVAL (operands[3]) & (GET_MODE_BITSIZE (mode) - 1))
+   == (unsigned HOST_WIDE_INT) (GET_MODE_BITSIZE (mode) - 1)"
+  "@
+   rotl%I2. %0,%1,%2
+   #"
+  "&& reload_completed && cc_reg_not_cr0_operand (operands[4], CCmode)"
+  [(set (match_dup 0)
+	(rotate:P (match_dup 1)
+		  (and:P (match_dup 2)
+			 (match_dup 3
+   (set (match_dup 4)
+	(compare:CC (match_dup 0)
+		(const_int 0)))]
+  ""
+  [(set_attr "type" "shift")
+   (set_attr "var_shift" "yes")
+   (set_attr "dot" "yes")
+   (set_attr "length" "4,8")])
+
 (define_insn_and_split "*rotl3_dot2"
   [(set (match_operand:CC 3 "cc_reg_operand" "=x,?y")
-	(compare:CC (rotate:GPR (match_operand:GPR 1 "gpc_reg_operand" "r,r")
-(match_operand:SI 2 "reg_or_cint_operand" "rn,rn"))
+	(compare:CC (rotate:P (match_operand:P 1 "gpc_reg_operand" "r,r")
+			  (match_operand:SI 2 "reg_or_cint_operand" 

aarch64/arm: GCC 10 backports

2020-09-29 Thread Richard Sandiford
I've backported the following SVE ACLE and stack-protector patches
to GCC 10.  The arm one was approved last week.

Tested on aarch64-linux-gnu and arm-linux-gnueabihf.

Richard


>From 0559badf0176b257d3cba89f8eb4b08948216002 Mon Sep 17 00:00:00 2001
From: Richard Sandiford 
Date: Tue, 29 Sep 2020 11:22:03 +0100
Subject: [PATCH 1/5] aarch64: Update the mangling of single SVE vectors and
 predicates

GCC was implementing an old mangling scheme for single SVE
vectors and predicates (based on the Advanced SIMD one).
The final definition instead put them in the vendor built-in
namespace via the "u" prefix.

gcc/
	* config/aarch64/aarch64-sve-builtins.cc (DEF_SVE_TYPE): Add a
	leading "u" to each mangled name.

gcc/testsuite/
	* g++.target/aarch64/sve/acle/general-c++/mangle_1.C: Add a leading
	"u" to the mangling of each SVE vector and predicate type.
	* g++.target/aarch64/sve/acle/general-c++/mangle_2.C: Likewise.
	* g++.target/aarch64/sve/acle/general-c++/mangle_3.C: Likewise.
	* g++.target/aarch64/sve/acle/general-c++/mangle_5.C: Likewise.

(cherry picked from commit dcb043351307001a85fc1e7d56669f5adc9628f7)
---
 gcc/config/aarch64/aarch64-sve-builtins.cc|  2 +-
 .../aarch64/sve/acle/general-c++/mangle_1.C   | 26 +--
 .../aarch64/sve/acle/general-c++/mangle_2.C   | 26 +--
 .../aarch64/sve/acle/general-c++/mangle_3.C   |  4 +--
 .../aarch64/sve/acle/general-c++/mangle_5.C   |  4 +--
 5 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index bdb04e8170d..820cc9f7e17 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -101,7 +101,7 @@ struct registered_function_hasher : nofree_ptr_hash 
 /* Information about each single-predicate or single-vector type.  */
 static CONSTEXPR const vector_type_info vector_types[] = {
 #define DEF_SVE_TYPE(ACLE_NAME, NCHARS, ABI_NAME, SCALAR_TYPE) \
-  { #ACLE_NAME, #ABI_NAME, #NCHARS #ABI_NAME },
+  { #ACLE_NAME, #ABI_NAME, "u" #NCHARS #ABI_NAME },
 #include "aarch64-sve-builtins.def"
 };
 
diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C
index 1a171248585..36dab3c9b71 100644
--- a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C
+++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C
@@ -16,16 +16,16 @@ void f11(svfloat32_t) {}
 void f12(svfloat64_t) {}
 void f13(svbfloat16_t) {}
 
-/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */
-/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */
-/* { dg-final { scan-assembler "_Z2f311__SVInt16_t:" } } */
-/* { dg-final { scan-assembler "_Z2f411__SVInt32_t:" } } */
-/* { dg-final { scan-assembler "_Z2f511__SVInt64_t:" } } */
-/* { dg-final { scan-assembler "_Z2f611__SVUint8_t:" } } */
-/* { dg-final { scan-assembler "_Z2f712__SVUint16_t:" } } */
-/* { dg-final { scan-assembler "_Z2f812__SVUint32_t:" } } */
-/* { dg-final { scan-assembler "_Z2f912__SVUint64_t:" } } */
-/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */
-/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */
-/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */
-/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */
+/* { dg-final { scan-assembler "_Z2f1u10__SVBool_t:" } } */
+/* { dg-final { scan-assembler "_Z2f2u10__SVInt8_t:" } } */
+/* { dg-final { scan-assembler "_Z2f3u11__SVInt16_t:" } } */
+/* { dg-final { scan-assembler "_Z2f4u11__SVInt32_t:" } } */
+/* { dg-final { scan-assembler "_Z2f5u11__SVInt64_t:" } } */
+/* { dg-final { scan-assembler "_Z2f6u11__SVUint8_t:" } } */
+/* { dg-final { scan-assembler "_Z2f7u12__SVUint16_t:" } } */
+/* { dg-final { scan-assembler "_Z2f8u12__SVUint32_t:" } } */
+/* { dg-final { scan-assembler "_Z2f9u12__SVUint64_t:" } } */
+/* { dg-final { scan-assembler "_Z3f10u13__SVFloat16_t:" } } */
+/* { dg-final { scan-assembler "_Z3f11u13__SVFloat32_t:" } } */
+/* { dg-final { scan-assembler "_Z3f12u13__SVFloat64_t:" } } */
+/* { dg-final { scan-assembler "_Z3f13u14__SVBfloat16_t:" } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C
index 6792b8a3133..ad4aaee291f 100644
--- a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C
+++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C
@@ -14,16 +14,16 @@ void f11(__SVFloat32_t) {}
 void f12(__SVFloat64_t) {}
 void f13(__SVBfloat16_t) {}
 
-/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */
-/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */
-/* { dg-final { scan-assembler "_Z2f311__SVInt16_t:" } } */
-/* { dg-final { scan-assembler "_Z2f411__SVInt32_t:" } } */
-/* { dg-final { scan-assembler "_Z2f511__SVInt64_t:" } } */
-/* { dg-final { scan-assembler "_Z2f611__SVUint8_t:" } } */
-/* { 

RE: [PATCH][GCC 9] arm: Add support for Neoverse V1 CPU

2020-09-29 Thread Kyrylo Tkachov


> -Original Message-
> From: Alex Coplan 
> Sent: 29 September 2020 11:18
> To: gcc-patches@gcc.gnu.org
> Cc: ni...@redhat.com; Richard Earnshaw ;
> Ramana Radhakrishnan ; Kyrylo
> Tkachov 
> Subject: [PATCH][GCC 9] arm: Add support for Neoverse V1 CPU
> 
> Hello,
> 
> This patch backports the AArch32 support for Arm's Neoverse V1 CPU to
> GCC 9.
> 
> Testing:
>  * Bootstrapped and regtested on arm-none-linux-gnueabihf.
> 
> OK for GCC 9 branch?

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Alex
> 
> ---
> 
> gcc/ChangeLog:
> 
>   * config/arm/arm-cpus.in (neoverse-v1): New.
>   * config/arm/arm-tables.opt: Regenerate.
>   * config/arm/arm-tune.md: Regenerate.
>   * doc/invoke.texi: Document AArch32 support for Neoverse V1.


[PATCH][GCC 9] arm: Add support for Neoverse V1 CPU

2020-09-29 Thread Alex Coplan
Hello,

This patch backports the AArch32 support for Arm's Neoverse V1 CPU to
GCC 9.

Testing:
 * Bootstrapped and regtested on arm-none-linux-gnueabihf.

OK for GCC 9 branch?

Thanks,
Alex

---

gcc/ChangeLog:

* config/arm/arm-cpus.in (neoverse-v1): New.
* config/arm/arm-tables.opt: Regenerate.
* config/arm/arm-tune.md: Regenerate.
* doc/invoke.texi: Document AArch32 support for Neoverse V1.
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index 3a55f6ac6d2..747767ab386 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -1362,6 +1362,16 @@ begin cpu cortex-a76.cortex-a55
  costs cortex_a57
 end cpu cortex-a76.cortex-a55
 
+# Armv8.4 A-profile Architecture Processors
+begin cpu neoverse-v1
+  cname neoversev1
+  tune for cortex-a57
+  tune flags LDSCHED
+  architecture armv8.4-a+fp16
+  option crypto add FP_ARMv8 CRYPTO
+  costs cortex_a57
+end cpu neoverse-v1
+
 # V8 M-profile implementations.
 begin cpu cortex-m23
  cname cortexm23
diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt
index bba54aea3d6..5384284b53a 100644
--- a/gcc/config/arm/arm-tables.opt
+++ b/gcc/config/arm/arm-tables.opt
@@ -243,6 +243,9 @@ Enum(processor_type) String(cortex-a75.cortex-a55) Value( 
TARGET_CPU_cortexa75co
 EnumValue
 Enum(processor_type) String(cortex-a76.cortex-a55) Value( 
TARGET_CPU_cortexa76cortexa55)
 
+EnumValue
+Enum(processor_type) String(neoverse-v1) Value( TARGET_CPU_neoversev1)
+
 EnumValue
 Enum(processor_type) String(cortex-m23) Value( TARGET_CPU_cortexm23)
 
diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md
index b9dfb66ec84..1257daff074 100644
--- a/gcc/config/arm/arm-tune.md
+++ b/gcc/config/arm/arm-tune.md
@@ -45,6 +45,6 @@
cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,
cortexa73cortexa53,cortexa55,cortexa75,
cortexa76,neoversen1,cortexa75cortexa55,
-   cortexa76cortexa55,cortexm23,cortexm33,
-   cortexr52"
+   cortexa76cortexa55,neoversev1,cortexm23,
+   cortexm33,cortexr52"
(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 67cebf59fb7..c85e31fb02c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -17570,9 +17570,9 @@ Permissible names are: @samp{arm7tdmi}, 
@samp{arm7tdmi-s}, @samp{arm710t},
 @samp{cortex-m4}, @samp{cortex-m7}, @samp{cortex-m23}, @samp{cortex-m33},
 @samp{cortex-m1.small-multiply}, @samp{cortex-m0.small-multiply},
 @samp{cortex-m0plus.small-multiply}, @samp{exynos-m1}, @samp{marvell-pj4},
-@samp{neoverse-n1}, @samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2},
-@samp{ep9312}, @samp{fa526}, @samp{fa626}, @samp{fa606te}, @samp{fa626te},
-@samp{fmp626}, @samp{fa726te}, @samp{xgene1}.
+@samp{neoverse-n1}, @samp{neoverse-v1}, @samp{xscale}, @samp{iwmmxt},
+@samp{iwmmxt2}, @samp{ep9312}, @samp{fa526}, @samp{fa626}, @samp{fa606te},
+@samp{fa626te}, @samp{fmp626}, @samp{fa726te}, @samp{xgene1}.
 
 Additionally, this option can specify that GCC should tune the performance
 of the code for a big.LITTLE system.  Permissible names are:


Re: [PATCH, 1/3, OpenMP] Target mapping changes for OpenMP 5.0, front-end parts

2020-09-29 Thread Jakub Jelinek via Gcc-patches
On Tue, Sep 01, 2020 at 09:16:23PM +0800, Chung-Lin Tang wrote:
> this patch set implements parts of the target mapping changes introduced
> in OpenMP 5.0, mainly the attachment requirements for pointer-based
> list items, and the clause ordering.
> 
> The first patch here are the C/C++ front-end changes.

Do you think you could mention in detail which exact target mapping changes
in the spec is the patchset attempting to implement?
5.0 unfortunately contains many target mapping changes and this patchset
can't implement them all and it would be easier to see the list of rules
(e.g. from openmp-diff-full-4.5-5.0.pdf, if you don't have that one, I can
send it to you), rather than trying to guess them from the patchset.

Thanks.

> gcc/c-family/
> * c-common.h (c_omp_adjust_clauses): New declaration.
> * c-omp.c (c_omp_adjust_clauses): New function.

This function name is too broad, it should have target in it as it is
for processing target* construct clauses only.

Jakub



Re: [PATCH v2 6/16]middle-end Add Complex Addition with rotation detection

2020-09-29 Thread Richard Sandiford
Tamar Christina  writes:
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 
> 2b46286943778e16d95b15def4299bcbf8db7eb8..71e226505b2619d10982b59a4ebbed73a70f29be
>  100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -6132,6 +6132,17 @@ floating-point mode.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> +@cindex @code{cadd@var{m}@var{n}3} instruction pattern
> +@item @samp{cadd@var{m}@var{n}3}
> +Perform a vector addition of complex numbers in operand 1 with operand 2
> +rotated by @var{m} degrees around the argand plane and storing the result in
> +operand 0.  The instruction must perform the operation on data loaded
> +contiguously into the vectors.

Nitpicking, sorry, but I think it would be better to describe the
layout directly rather than in terms of loads, since the preceding
operation might not be a load.

I guess the main question is: what representation do we expect for
big-endian?  A normal Advanced SIMD LDR would give this (for floats):

 MEMORY
   +-+-+-+-+
   | r0  | i0  | r1  | i1  |
   +-+-+-+-+
   |  0  |  1  |  2  |  3  |   array numbering
   +-+-+-+-+
  V V V V  Advanced SIMD LDR
   +-+-+-+-+
   | r0  | i0  | r1  | i1  |
   +-+-+-+-+
   |  0  |  1  |  2  |  3  |   GCC lane numbering
   +-+-+-+-+
   |  3  |  2  |  1  |  0  |   Arm lane numbering
   +-+-+-+-+
  MSB   REGISTER  LSB

but the FC* instructions put the imaginary parts in the more
significant lane, so the pairs of elements above would need
to be reversed:

 MEMORY
   +-+-+-+-+
   | r0  | i0  | r1  | i1  |
   +-+-+-+-+
   |  0  |  1  |  2  |  3  |   array numbering
   +-+-+-+-+
   \   /   \   /
\ / \ /
 X   X Load and permute
/ \ / \
   /   \   /   \
   +-+-+-+-+
   | i0  | r0  | i1  | r1  |
   +-+-+-+-+
   |  0  |  1  |  2  |  3  |   GCC lane numbering
   +-+-+-+-+
   |  3  |  2  |  1  |  0  |   Arm lane numbering
   +-+-+-+-+
  MSB   REGISTER  LSB

(Or the whole vector could be reversed.)

We might decide that it just isn't worth doing this for Advanced SIMD.
But should the semantics of the optab be that:

(1) GCC lane number 0 holds a real part, or
(2) the least significant lane holds a real part?

With (1), it would be up to the target to hide the permute above.
With (2), the vectoriser would need to introduce the permute itself.

I'm not sure there's a perfect answer even for Arm targets.  (2) matches
the Advanced SIMD semantics.  But for SVE, the register layout follows
LD1 rather than LDR, and the GCC and architectural lane numbering match up.
(1) would therefore be better than (2) for SVE (and so no permute would be
needed for either endianness on SVE).

> +The operation is only supported for vector modes @var{n} and with
> +rotations @var{m} of 90 or 270.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{ffs@var{m}2} instruction pattern
>  @item @samp{ffs@var{m}2}
>  Store into operand 0 one plus the index of the least significant 1-bit
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 
> 13e60828fcf5db6c5f15aae2bacd4cf04029e430..956a65a338c157b51de7e78a3fb005b5af78ef31
>  100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -275,6 +275,8 @@ DEF_INTERNAL_FLT_FN (SCALB, ECF_CONST, scalb, binary)
>  DEF_INTERNAL_FLT_FLOATN_FN (FMIN, ECF_CONST, fmin, binary)
>  DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
>  DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
>  
>  /* FP scales.  */
>  DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 
> 78409aa14537d259bf90277751aac00d452a0d3f..2bb0bf857977035bf562a77f5f6848e80edf936d
>  100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -290,6 +290,8 @@ OPTAB_D (atan_optab, "atan$a2")
>  OPTAB_D (atanh_optab, "atanh$a2")
>  OPTAB_D (copysign_optab, "copysign$F$a3")
>  OPTAB_D (xorsign_optab, "xorsign$F$a3")
> +OPTAB_D (cadd90_optab, "cadd90$a3")
> +OPTAB_D (cadd270_optab, "cadd270$a3")
>  OPTAB_D (cos_optab, "cos$a2")
>  OPTAB_D (cosh_optab, "cosh$a2")
>  OPTAB_D (exp10_optab, "exp10$a2")
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index 
> 6453a5b1b6464dba833adc2c2a194db5e712bb79..b2b0ac62e9a69145470f41d2bac736dd970be735
>  100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -663,12 +663,94 @@ graceful_exit:
>  }
>  };
>  
> +class ComplexAddPattern : public ComplexPattern

Another nitpick, sorry, but type names should be lower case rather than
CamelCase.


Re: [PATCH] libstdc++: Add C++2a synchronization support

2020-09-29 Thread Jonathan Wakely via Gcc-patches

On 28/09/20 14:29 -0700, Thomas Rodgers wrote:

+template
+  __atomic_wait_status
+  __platform_wait_until_impl(__platform_wait_t* __addr,
+__platform_wait_t __val,
+const 
chrono::time_point<__platform_wait_clock_t,
+ _Duration>& __atime) 
noexcept
+  {
+   auto __s = chrono::time_point_cast(__atime);
+   auto __ns = chrono::duration_cast(__atime - __s);
+
+   struct timespec __rt =
+   {
+ static_cast(__s.time_since_epoch().count()),
+ static_cast(__ns.count())
+   };
+
+   auto __e = syscall (SYS_futex, __addr,
+ 
static_cast(__futex_wait_flags::__wait_bitset_private),
+ __val, &__rt, nullptr,
+ 
static_cast(__futex_wait_flags::__bitset_match_any));
+   if (__e && !(errno == EINTR || errno == EAGAIN || errno == ETIMEDOUT))
+   std::terminate();
+   return (__platform_wait_clock_t::now() < __atime)
+  ? __atomic_wait_status::no_timeout : 
__atomic_wait_status::timeout;
+  }
+
+template
+  __atomic_wait_status
+  __platform_wait_until(__platform_wait_t* __addr, __platform_wait_t __val,
+   const chrono::time_point<_Clock, _Duration>& 
__atime)
+  {
+   if constexpr (is_same_v<__platform_wait_clock_t, _Clock>)


This case is impossible, since the other overload would be selected
if the clock is the __platform_wait_clock_t (unless the caller says
__platform_wait_until<__platform_wait_until> to explicitly call this
overload, but users can't call this function, and we won't do that).



Which overload?



I must have misread __platform_wait_until_impl above as
__platform_wait_until. Ignore this comment, sorry!




Re: [PATCH v2 3/16]middle-end Add basic SLP pattern matching scaffolding.

2020-09-29 Thread Richard Biener
On Tue, 29 Sep 2020, Richard Sandiford wrote:

> Richard Biener  writes:
> >> > > @@ -2192,6 +2378,17 @@ vect_analyze_slp_instance (vec_info *vinfo,
> >> > >   _size, bst_map);
> >> > >if (node != NULL)
> >> > >  {
> >> > > +  /* Temporarily allow add_stmt calls again.  */
> >> > > +  vinfo->stmt_vec_info_ro = false;
> >> > > +
> >> > > +  /* See if any patterns can be found in the constructed SLP tree
> >> > > +before we do any analysis on it.  */
> >> > > +  vect_match_slp_patterns (node, vinfo, group_size, _nunits,
> >> > > +  matches, , _size,
> >> > > + bst_map);
> >> > > +
> >> > > +  /* After this no more add_stmt calls are allowed.  */
> >> > > +  vinfo->stmt_vec_info_ro = true;
> >> > > +
> >> > >
> >> > > I think this is a bit early to match patterns - I'd defer it to the
> >> > > point where all entries into the same SLP subgraph are analyzed, thus
> >> > > somewhere at the end of vect_analyze_slp loop over all instances and
> >> > > match patterns?  That way phases are more clearly separated.
> >> > 
> >> > That would probably work, my only worry is that the SLP analysis itself 
> >> > may
> >> > fail and bail out at
> >> > 
> >> >/* If the loads and stores can be handled with load/store-lane
> >> >   instructions do not generate this SLP instance.  */
> >> >if (is_a  (vinfo)
> >> >&& loads_permuted
> >> >&& dr && vect_store_lanes_supported (vectype, group_size,
> >> > false))
> >
> > Ah, that piece of code.  Yeah, I'm repeatedly running into it as well - 
> > it's a bad hack that stands in the way all the time :/
> 
> At one point I was wondering about trying to drop the above, vectorise with
> and without SLP, and then compare their costs, like for VECT_COMPARE_COSTS.
> But that seemed like a dead end with the move to doing everything on the
> SLP representation.

Yeah ... though even moving everything to the SLP representation will
retain the issue since there it will be N group-size 1 SLP instances
vs. 1 group-size N SLP instance.

> > I guess we should try moving this upward like to
> > vect_analyze_loop_2 right before
> >
> >   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  
> > */
> >   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
> >   if (!ok)
> > return ok;
> >
> > and there check whether all grouped loads and stores can be handled
> > with store-/load-lanes (and there are no SLP reduction chains?) in
> > which case do not try to attempt SLP at all.  Because the testcases
> > this check was supposed to change were all-load/store-lane or
> > all SLP so the mixed case is probably not worth special casing.
> >
> > Since load-/store-lanes is an arm speciality I tried to only touch
> > this fragile part with a ten-foot pole ;)  CCing Richard, if he
> > acks the above I can produce a patch.
> 
> Yeah, sounds good to me.  Probably also worth checking whether the
> likely_max iteration count is high enough to support group_size
> vectors, if we have enough information to guess that.
> 
> We could also get the gen* machinery to emit a macro that is true if at
> least one load/store-lane pattern is defined, so that we can skip the
> code for non-Arm targets.  I can do that as a follow-up.

I've had a second look and one complication is that we only elide the
SLP node if any of the loads are permuted.  So if all loads/stores
are unpermuted but load/store-lanes would work we'd keep the SLP node.

Of course without actually building the SLP node we don't know whether
the loads will be permuted or not ...

But surely the current place for the check will cause some testcases
to become hybrid vectorizations which is likely undesirable.

So we could move the check after all SLP discovery is completed
and throw it all away if we can and should use load/store-lanes?
But that might then not solve Tamars issue.

Richard.


Re: [PATCH v2 3/16]middle-end Add basic SLP pattern matching scaffolding.

2020-09-29 Thread Richard Sandiford
Richard Biener  writes:
>> > > @@ -2192,6 +2378,17 @@ vect_analyze_slp_instance (vec_info *vinfo,
>> > >   _size, bst_map);
>> > >if (node != NULL)
>> > >  {
>> > > +  /* Temporarily allow add_stmt calls again.  */
>> > > +  vinfo->stmt_vec_info_ro = false;
>> > > +
>> > > +  /* See if any patterns can be found in the constructed SLP tree
>> > > +before we do any analysis on it.  */
>> > > +  vect_match_slp_patterns (node, vinfo, group_size, _nunits,
>> > > +  matches, , _size,
>> > > + bst_map);
>> > > +
>> > > +  /* After this no more add_stmt calls are allowed.  */
>> > > +  vinfo->stmt_vec_info_ro = true;
>> > > +
>> > >
>> > > I think this is a bit early to match patterns - I'd defer it to the
>> > > point where all entries into the same SLP subgraph are analyzed, thus
>> > > somewhere at the end of vect_analyze_slp loop over all instances and
>> > > match patterns?  That way phases are more clearly separated.
>> > 
>> > That would probably work, my only worry is that the SLP analysis itself may
>> > fail and bail out at
>> > 
>> >  /* If the loads and stores can be handled with load/store-lane
>> > instructions do not generate this SLP instance.  */
>> >  if (is_a  (vinfo)
>> >  && loads_permuted
>> >  && dr && vect_store_lanes_supported (vectype, group_size,
>> > false))
>
> Ah, that piece of code.  Yeah, I'm repeatedly running into it as well - 
> it's a bad hack that stands in the way all the time :/

At one point I was wondering about trying to drop the above, vectorise with
and without SLP, and then compare their costs, like for VECT_COMPARE_COSTS.
But that seemed like a dead end with the move to doing everything on the
SLP representation.

> I guess we should try moving this upward like to
> vect_analyze_loop_2 right before
>
>   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  
> */
>   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
>   if (!ok)
> return ok;
>
> and there check whether all grouped loads and stores can be handled
> with store-/load-lanes (and there are no SLP reduction chains?) in
> which case do not try to attempt SLP at all.  Because the testcases
> this check was supposed to change were all-load/store-lane or
> all SLP so the mixed case is probably not worth special casing.
>
> Since load-/store-lanes is an arm speciality I tried to only touch
> this fragile part with a ten-foot pole ;)  CCing Richard, if he
> acks the above I can produce a patch.

Yeah, sounds good to me.  Probably also worth checking whether the
likely_max iteration count is high enough to support group_size
vectors, if we have enough information to guess that.

We could also get the gen* machinery to emit a macro that is true if at
least one load/store-lane pattern is defined, so that we can skip the
code for non-Arm targets.  I can do that as a follow-up.

Thanks,
Richard


RE: [PATCH][GCC 10] arm: Add support for Neoverse V1 CPU

2020-09-29 Thread Kyrylo Tkachov


> -Original Message-
> From: Alex Coplan 
> Sent: 29 September 2020 09:59
> To: gcc-patches@gcc.gnu.org
> Cc: ni...@redhat.com; Richard Earnshaw ;
> Ramana Radhakrishnan ; Kyrylo
> Tkachov 
> Subject: [PATCH][GCC 10] arm: Add support for Neoverse V1 CPU
> 
> Hello,
> 
> This patch backports the AArch32 support for Arm's Neoverse V1 CPU to
> GCC 10.
> 
> Testing:
>  * Bootstrapped and regtested on arm-none-linux-gnueabihf.
> 
> OK for GCC 10 branch?

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Alex
> 
> ---
> 
> gcc/ChangeLog:
> 
>   * config/arm/arm-cpus.in (neoverse-v1): New.
>   * config/arm/arm-tables.opt: Regenerate.
>   * config/arm/arm-tune.md: Regenerate.
>   * doc/invoke.texi: Document AArch32 support for Neoverse V1.


Re: [PATCH] RISC-V: Define __riscv_cmodel_medany for PIC mode.

2020-09-29 Thread Kito Cheng via Gcc-patches
Thanks, committed with more comments in code :)

On Tue, Sep 29, 2020 at 3:35 AM Jim Wilson  wrote:
>
> On Thu, Sep 24, 2020 at 10:46 PM Kito Cheng  wrote:
> >
> >  - According to the conclusion in the RISC-V C API document, we decide to deprecate
> >the __riscv_cmodel_pic macro
> >
> >  - __riscv_cmodel_pic is deprecated and will removed in next GCC
> >release.
>
> Looks good to me.  By the way, you can self approve patches like this.
>
> Optionally, you might add a comment to the code to point out that
> __riscv_cmodel_pic is deprecated.  That makes it a little easier to
> understand the code.
>
> Jim


[PATCH][GCC 10] arm: Add support for Neoverse V1 CPU

2020-09-29 Thread Alex Coplan
Hello,

This patch backports the AArch32 support for Arm's Neoverse V1 CPU to
GCC 10.

Testing:
 * Bootstrapped and regtested on arm-none-linux-gnueabihf.

OK for GCC 10 branch?

Thanks,
Alex

---

gcc/ChangeLog:

* config/arm/arm-cpus.in (neoverse-v1): New.
* config/arm/arm-tables.opt: Regenerate.
* config/arm/arm-tune.md: Regenerate.
* doc/invoke.texi: Document AArch32 support for Neoverse V1.
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index 728be500b80..b1fe48eb087 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -1478,6 +1478,16 @@ begin cpu cortex-a76.cortex-a55
  costs cortex_a57
 end cpu cortex-a76.cortex-a55
 
+# Armv8.4 A-profile Architecture Processors
+begin cpu neoverse-v1
+  cname neoversev1
+  tune for cortex-a57
+  tune flags LDSCHED
+  architecture armv8.4-a+fp16+bf16+i8mm
+  option crypto add FP_ARMv8 CRYPTO
+  costs cortex_a57
+end cpu neoverse-v1
+
 # V8 M-profile implementations.
 begin cpu cortex-m23
  cname cortexm23
diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt
index ce356611861..1a7c3191784 100644
--- a/gcc/config/arm/arm-tables.opt
+++ b/gcc/config/arm/arm-tables.opt
@@ -249,6 +249,9 @@ Enum(processor_type) String(cortex-a75.cortex-a55) Value( 
TARGET_CPU_cortexa75co
 EnumValue
 Enum(processor_type) String(cortex-a76.cortex-a55) Value( 
TARGET_CPU_cortexa76cortexa55)
 
+EnumValue
+Enum(processor_type) String(neoverse-v1) Value( TARGET_CPU_neoversev1)
+
 EnumValue
 Enum(processor_type) String(cortex-m23) Value( TARGET_CPU_cortexm23)
 
diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md
index 8ea9435c0c9..3874f42a26b 100644
--- a/gcc/config/arm/arm-tune.md
+++ b/gcc/config/arm/arm-tune.md
@@ -46,6 +46,6 @@ (define_attr "tune"
cortexa73cortexa53,cortexa55,cortexa75,
cortexa76,cortexa76ae,cortexa77,
neoversen1,cortexa75cortexa55,cortexa76cortexa55,
-   cortexm23,cortexm33,cortexm35p,
-   cortexm55,cortexr52"
+   neoversev1,cortexm23,cortexm33,
+   cortexm35p,cortexm55,cortexr52"
(const (symbol_ref "((enum attr_tune) arm_tune)")))
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5b408150084..0eb5b6bb135 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -18824,9 +18824,9 @@ Permissible names are: @samp{arm7tdmi}, 
@samp{arm7tdmi-s}, @samp{arm710t},
 @samp{cortex-m35p}, @samp{cortex-m55},
 @samp{cortex-m1.small-multiply}, @samp{cortex-m0.small-multiply},
 @samp{cortex-m0plus.small-multiply}, @samp{exynos-m1}, @samp{marvell-pj4},
-@samp{neoverse-n1}, @samp{xscale}, @samp{iwmmxt}, @samp{iwmmxt2},
-@samp{ep9312}, @samp{fa526}, @samp{fa626}, @samp{fa606te}, @samp{fa626te},
-@samp{fmp626}, @samp{fa726te}, @samp{xgene1}.
+@samp{neoverse-n1}, @samp{neoverse-v1}, @samp{xscale}, @samp{iwmmxt},
+@samp{iwmmxt2}, @samp{ep9312}, @samp{fa526}, @samp{fa626}, @samp{fa606te},
+@samp{fa626te}, @samp{fmp626}, @samp{fa726te}, @samp{xgene1}.
 
 Additionally, this option can specify that GCC should tune the performance
 of the code for a big.LITTLE system.  Permissible names are:


  1   2   >