[PATCH] PR gcc/84923 - gcc.dg/attr-weakref-1.c failed on aarch64

2018-04-12 Thread vladimir . mezentsev
From: Vladimir Mezentsev 

When weakref_targets is not empty a target cannot be removed from weak_decls.
A small example is below when 'wv12' is removed from the weak list on aarch64:
  static vtype Wv12 __attribute__((weakref ("wv12")));
  extern vtype wv12 __attribute__((weak));

Bootstrapped on aarch64-unknown-linux-gnu including (c,c++ and go).
Tested on aarch64-linux-gnu.
No regression. The attr-weakref-1.c test passed.

ChangeLog:
2018-04-12  Vladimir Mezentsev  

PR gcc/84923
* varasm.c (weak_finish): clean up weak_decls
---
 gcc/varasm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gcc/varasm.c b/gcc/varasm.c
index d24bac4..2a70234 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -5683,8 +5683,7 @@ weak_finish (void)
   nor multiple .weak directives for the latter.  */
for (p = &weak_decls; (t2 = *p) ; )
  {
-   if (TREE_VALUE (t2) == alias_decl
-   || target == DECL_ASSEMBLER_NAME (TREE_VALUE (t2)))
+   if (TREE_VALUE (t2) == alias_decl)
  *p = TREE_CHAIN (t2);
else
  p = &TREE_CHAIN (t2);
-- 
1.8.3.1



[PATCH, rs6000] (PR83402) Fix _mm_slli_epi{32,64} for shift values 16 through 31 and negative

2018-04-12 Thread Paul Clarke
The powerpc versions of _mm_slli_epi32 and _mm_slli_epi64 in emmintrin.h
do not properly handle shift values between 16 and 31, inclusive.
These were setting up the shift with vec_splat_s32, which only accepts
*5 bit signed* shift values, or a range of -16 to 15.  Values above 15
produced an error:

  error: argument 1 must be a 5-bit signed literal

Fix is to effectively reduce the range for which vec_splat_s32 is used
to < 16 and use vec_splats otherwise.

Also, _mm_slli_epi{16,32,64}, when given a negative shift value,
should always return a vector of {0}.

2018-04-12  Paul A. Clarke  

gcc/config

PR target/83402
* rs6000/emmintrin.h (_mm_slli_epi{16,32,64}):
Ensure that vec_splat_s32 is only called with 0 < shift < 16.
Ensure negative shifts result in {0}.

gcc/testsuite/gcc.target/powerpc

PR target/83402
* gcc.target/powerpc/sse2-psllw-1.c: Refactor and add tests for
several positive and negative values.
* gcc.target/powerpc/sse2-pslld-1.c: Same.
* gcc.target/powerpc/sse2-psllq-1.c: Same.

Index: gcc/config/rs6000/emmintrin.h
===
--- gcc/config/rs6000/emmintrin.h   (revision 259016)
+++ gcc/config/rs6000/emmintrin.h   (working copy)
@@ -1488,7 +1488,7 @@ _mm_slli_epi16 (__m128i __A, int __B)
   __v8hu lshift;
   __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
 
-  if (__B < 16)
+  if (__B > 0 && __B < 16)
 {
   if (__builtin_constant_p(__B))
  lshift = (__v8hu) vec_splat_s16(__B);
@@ -1507,12 +1507,12 @@ _mm_slli_epi32 (__m128i __A, int __B)
   __v4su lshift;
   __v4si result = { 0, 0, 0, 0 };
 
-  if (__B < 32)
+  if (__B > 0 && __B < 32)
 {
-  if (__builtin_constant_p(__B))
-   lshift = (__v4su) vec_splat_s32(__B);
+  if (__builtin_constant_p(__B) && __B < 16)
+lshift = (__v4su) vec_splat_s32(__B);
   else
-   lshift = vec_splats ((unsigned int) __B);
+lshift = vec_splats ((unsigned int) __B);
 
   result = vec_vslw ((__v4si) __A, lshift);
 }
@@ -1527,17 +1527,12 @@ _mm_slli_epi64 (__m128i __A, int __B)
   __v2du lshift;
   __v2di result = { 0, 0 };
 
-  if (__B < 64)
+  if (__B > 0 && __B < 64)
 {
-  if (__builtin_constant_p(__B))
-   {
- if (__B < 32)
- lshift = (__v2du) vec_splat_s32(__B);
-   else
- lshift = (__v2du) vec_splats((unsigned long long)__B);
-   }
+  if (__builtin_constant_p(__B) && __B < 16)
+   lshift = (__v2du) vec_splat_s32(__B);
   else
- lshift = (__v2du) vec_splats ((unsigned int) __B);
+   lshift = (__v2du) vec_splats ((unsigned int) __B);
 
   result = vec_vsld ((__v2di) __A, lshift);
 }
Index: gcc/testsuite/gcc.target/powerpc/sse2-pslld-1.c
===
--- gcc/testsuite/gcc.target/powerpc/sse2-pslld-1.c (revision 259016)
+++ gcc/testsuite/gcc.target/powerpc/sse2-pslld-1.c (working copy)
@@ -13,32 +13,50 @@
 #define TEST sse2_test_pslld_1
 #endif
 
-#define N 0xf
-
 #include <emmintrin.h>
 
-static __m128i
-__attribute__((noinline, unused))
-test (__m128i s1)
-{
-  return _mm_slli_epi32 (s1, N); 
-}
+#define TEST_FUNC(id, N) \
+  static __m128i \
+  __attribute__((noinline, unused)) \
+  test##id (__m128i s1) \
+  { \
+return _mm_slli_epi32 (s1, N);  \
+  }
 
+TEST_FUNC(0, 0)
+TEST_FUNC(15, 15)
+TEST_FUNC(16, 16)
+TEST_FUNC(31, 31)
+TEST_FUNC(neg1, -1)
+TEST_FUNC(neg16, -16)
+TEST_FUNC(neg32, -32)
+TEST_FUNC(neg64, -64)
+TEST_FUNC(neg128, -128)
+
+#define TEST_CODE(id, N) \
+  { \
+int e[4] = {0}; \
+union128i_d u, s; \
+int i; \
+s.x = _mm_set_epi32 (1, -2, 3, 4); \
+u.x = test##id (s.x); \
+if (N > 0 && N < 32) \
+  for (i = 0; i < 4; i++) \
+e[i] = s.a[i] << (N * (N > 0)); \
+if (check_union128i_d (u, e)) \
+  abort (); \
+  }
+
 static void
 TEST (void)
 {
-  union128i_d u, s;
-  int e[4] = {0};
-  int i;
- 
-  s.x = _mm_set_epi32 (1, -2, 3, 4);
-
-  u.x = test (s.x);
-
-  if (N < 32)
-for (i = 0; i < 4; i++)
-  e[i] = s.a[i] << N; 
-
-  if (check_union128i_d (u, e))
-abort (); 
+  TEST_CODE(0, 0);
+  TEST_CODE(15, 15);
+  TEST_CODE(16, 16);
+  TEST_CODE(31, 31);
+  TEST_CODE(neg1, -1);
+  TEST_CODE(neg16, -16);
+  TEST_CODE(neg32, -32);
+  TEST_CODE(neg64, -64);
+  TEST_CODE(neg128, -128);
 }
Index: gcc/testsuite/gcc.target/powerpc/sse2-psllq-1.c
===
--- gcc/testsuite/gcc.target/powerpc/sse2-psllq-1.c (revision 259016)
+++ gcc/testsuite/gcc.target/powerpc/sse2-psllq-1.c (working copy)
@@ -13,36 +13,56 @@
 #define TEST sse2_test_psllq_1
 #endif
 
-#define N 60
-
 #include <emmintrin.h>
 
 #ifdef _ARCH_PWR8
-static __m128i
-__attribute__((noinline, unused))
-test (__m128i s1)
-{
-  return _mm_slli_epi64 (s1, N); 
-}
+#define TEST_FUNC(id, N) \
+  static __m128i \
+  __attribute__((noinline, unused)) \
+  test##id (

Re: [PATCH] avoid duplicate warning for strcmp with a nonstring (PR 85359)

2018-04-12 Thread Martin Sebor

Attached is a minor update that avoids additional duplicate
warnings exposed by more extensive testing (for PR 85369).

On 04/12/2018 02:52 PM, Martin Sebor wrote:

The attached patch makes a small tweak to avoid issuing a duplicate
warning for calls to strcmp with a nonstring argument.  The most
onerous part of this was figuring out how to test for the absence
of duplicate warnings.  The "hack" I used (dg-regexp) is in place
until a more straightforward solution becomes available.  (David
Malcolm has something planned for GCC 9.)

Martin


PR middle-end/85359 - duplicate -Wstringop-overflow for a strcmp call with a nonstring pointer

gcc/ChangeLog:

	PR middle-end/85359
	* builtins.c (expand_builtin_strcpy): Call maybe_warn_nonstring_arg
	only when expansion succeeds.
	(expand_builtin_strcmp): Same.
	(expand_builtin_strncmp): Same.

gcc/testsuite/ChangeLog:

	PR middle-end/85359
	* gcc.dg/attr-nonstring.c: New test.
Index: gcc/builtins.c
===
--- gcc/builtins.c	(revision 259298)
+++ gcc/builtins.c	(working copy)
@@ -3777,7 +3777,17 @@ expand_builtin_strcpy (tree exp, rtx target)
 		src, destsize);
 }
 
-  return expand_builtin_strcpy_args (dest, src, target);
+  if (rtx ret = expand_builtin_strcpy_args (dest, src, target))
+{
+  /* Check to see if the argument was declared attribute nonstring
+	 and if so, issue a warning since at this point it's not known
+	 to be nul-terminated.  */
+  tree fndecl = get_callee_fndecl (exp);
+  maybe_warn_nonstring_arg (fndecl, exp);
+  return ret;
+}
+
+  return NULL_RTX;
 }
 
 /* Helper function to do the actual work for expand_builtin_strcpy.  The
@@ -4570,14 +4580,14 @@ expand_builtin_strcmp (tree exp, ATTRIBUTE_UNUSED
 	}
 }
 
-  /* Check to see if the argument was declared attribute nonstring
- and if so, issue a warning since at this point it's not known
- to be nul-terminated.  */
   tree fndecl = get_callee_fndecl (exp);
-  maybe_warn_nonstring_arg (fndecl, exp);
-
   if (result)
 {
+  /* Check to see if the argument was declared attribute nonstring
+	 and if so, issue a warning since at this point it's not known
+	 to be nul-terminated.  */
+  maybe_warn_nonstring_arg (fndecl, exp);
+
   /* Return the value in the proper mode for this function.  */
   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
   if (GET_MODE (result) == mode)
@@ -4674,14 +4684,14 @@ expand_builtin_strncmp (tree exp, ATTRIBUTE_UNUSED
 	 arg2_rtx, TREE_TYPE (len), arg3_rtx,
 	 MIN (arg1_align, arg2_align));
 
-  /* Check to see if the argument was declared attribute nonstring
- and if so, issue a warning since at this point it's not known
- to be nul-terminated.  */
   tree fndecl = get_callee_fndecl (exp);
-  maybe_warn_nonstring_arg (fndecl, exp);
-
   if (result)
 {
+  /* Check to see if the argument was declared attribute nonstring
+	 and if so, issue a warning since at this point it's not known
+	 to be nul-terminated.  */
+  maybe_warn_nonstring_arg (fndecl, exp);
+
   /* Return the value in the proper mode for this function.  */
   mode = TYPE_MODE (TREE_TYPE (exp));
   if (GET_MODE (result) == mode)

===
--- gcc/testsuite/gcc.dg/attr-nonstring.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/attr-nonstring.c	(working copy)
@@ -0,0 +1,123 @@
+/* PR middle-end/85359 - duplicate -Wstringop-overflow for a strcmp call
+   with a nonstring pointer
+   { dg-do compile }
+   { dg-options "-O2 -Wall" } */
+
+typedef __SIZE_TYPE__   size_t;
+typedef __builtin_va_list   va_list;
+
+int printf (const char*, ...);
+int puts (const char*);
+int puts_unlocked (const char*);
+int sprintf (char*, const char*, ...);
+int snprintf (char*, size_t, const char*, ...);
+int vsprintf (char*, const char*, va_list);
+int vsnprintf (char*, size_t, const char*, va_list);
+
+int strcmp (const char*, const char*);
+int strncmp (const char*, const char*, size_t);
+
+char* stpcpy (char*, const char*);
+char* stpncpy (char*, const char*, size_t);
+
+char* strcat (char*, const char*);
+char* strncat (char*, const char*, size_t);
+
+char* strcpy (char*, const char*);
+char* strncpy (char*, const char*, size_t);
+
+char* strchr (const char*, int);
+char* strrchr (const char*, int);
+char* strstr (const char*, const char*);
+char* strdup (const char*);
+size_t strlen (const char*);
+size_t strnlen (const char*, size_t);
+char* strndup (const char*, size_t);
+
+#define NONSTRING __attribute__ ((nonstring))
+
+extern char ns5[5] NONSTRING;
+
+int strcmp_nonstring_1 (NONSTRING const char *a, const char *b)
+{
+  /* dg-warning matches one or more instances of the warning so it's
+ no good on its own.  Use dg-regexp instead to verify that just
+ one instance of the warning is issued.  See gcc.dg/pr64223-1
+ for a different approach.  */
+  return strcmp (a, b);  /* { dg-regexp "\[^\n\

[PATCH] issue nonstring warning for strcpy even on s390 (PR 85369)

2018-04-12 Thread Martin Sebor

PR 85369 notes that the c-c++-common/attr-nonstring-3.c fails
on IBM Z (and other similar targets) whose back-end provides
the movstr expander.  The failure is caused by an expected
warning failing to trigger because the strcpy call is expanded
early and the checker never runs.

The attached patch adjusts the code to make sure the warning
is not bypassed on these targets.

I've verified the patch with an s390-linux cross-compiler and
with a full x86_64-linux native build and regression run.

Martin
PR middle-end/85369 - no -Wstringop-overflow for a strcpy / stpcpy call with a nonstring pointer when providing movstr pattern

gcc/ChangeLog:

	PR middle-end/85369
	* builtins.c (expand_builtin_stpcpy_1): New function.
	(expand_builtin_stpcpy): Call it, and call maybe_warn_nonstring_arg
	only if the former succeeds.

diff --git a/gcc/builtins.c b/gcc/builtins.c
index b751a4b..f681488 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3808,7 +3808,7 @@ expand_builtin_strcpy_args (tree dest, tree src, rtx target)
mode MODE if that's convenient).  */
 
 static rtx
-expand_builtin_stpcpy (tree exp, rtx target, machine_mode mode)
+expand_builtin_stpcpy_1 (tree exp, rtx target, machine_mode mode)
 {
   tree dst, src;
   location_t loc = EXPR_LOCATION (exp);
@@ -3885,6 +3885,25 @@ expand_builtin_stpcpy (tree exp, rtx target, machine_mode mode)
 }
 }
 
+/* Expand a call EXP to the stpcpy builtin and diagnose uses of nonstring
+   arguments while being careful to avoid duplicate warnings (which could
+   be issued if the expander were to expand the call, resulting in it
+   being emitted in expand_call().  */
+
+static rtx
+expand_builtin_stpcpy (tree exp, rtx target, machine_mode mode)
+{
+  if (rtx ret = expand_builtin_stpcpy_1 (exp, target, mode))
+{
+  /* The call has been successfully expanded.  Check for nonstring
+	 arguments and issue warnings as appropriate.  */
+  maybe_warn_nonstring_arg (get_callee_fndecl (exp), exp);
+  return ret;
+}
+
+  return NULL_RTX;
+}
+
 /* Check a call EXP to the stpncpy built-in for validity.
Return NULL_RTX on both success and failure.  */
 


[committed] Prevent erroneous "macro had not yet been defined" messages (PR c++/85385)

2018-04-12 Thread David Malcolm
PR c++/85385 reports an issue where we emit bogus "macro had not yet been
defined" notes when a macro is mis-used:

  $ cat test.c
  #define MACRO(X,Y)

  void test ()
  {
MACRO(42);
  }

  $ ./xg++ -B. -c test.c
  test.c:5:11: error: macro "MACRO" requires 2 arguments, but only 1 given
 MACRO(42);
 ^
  test.c: In function ‘void test()’:
  test.c:5:3: error: ‘MACRO’ was not declared in this scope
 MACRO(42);
 ^
  test.c:5:3: note:
  test.c:1: note: it was later defined here
   #define MACRO(X,Y)

The macro *had* been defined, it was merely misused.

This patch fixes the issue by only issuing the note if the use location
is before the definition location (using linemap_location_before_p).

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu;
adds 39 PASS results to g++.sum.

Committed to trunk as r259360.

gcc/cp/ChangeLog:
PR c++/85385
* name-lookup.c (macro_use_before_def::maybe_make): New function,
checking that the use is indeed before the definition.
(macro_use_before_def::macro_use_before_def): Make private.
(macro_use_before_def::~macro_use_before_def): Make private.  Move
check for UNKNOWN_LOCATION to macro_use_before_def::maybe_make.
(lookup_name_fuzzy): Call macro_use_before_def::maybe_make rather
than using new directly.

gcc/testsuite/ChangeLog:
PR c++/85385
* g++.dg/diagnostic/macro-arg-count.C: New test.
---
 gcc/cp/name-lookup.c  | 39 -
 gcc/testsuite/g++.dg/diagnostic/macro-arg-count.C | 51 +++
 2 files changed, 80 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/diagnostic/macro-arg-count.C

diff --git a/gcc/cp/name-lookup.c b/gcc/cp/name-lookup.c
index b923107..d2e5acb 100644
--- a/gcc/cp/name-lookup.c
+++ b/gcc/cp/name-lookup.c
@@ -5888,6 +5888,27 @@ consider_binding_level (tree name, best_match  &bm,
 class macro_use_before_def : public deferred_diagnostic
 {
  public:
+  /* Factory function.  Return a new macro_use_before_def instance if
+ appropriate, or return NULL. */
+  static macro_use_before_def *
+  maybe_make (location_t use_loc, cpp_hashnode *macro)
+  {
+source_location def_loc = cpp_macro_definition_location (macro);
+if (def_loc == UNKNOWN_LOCATION)
+  return NULL;
+
+/* We only want to issue a note if the macro was used *before* it was
+   defined.
+   We don't want to issue a note for cases where a macro was incorrectly
+   used, leaving it unexpanded (e.g. by using the wrong argument
+   count).  */
+if (!linemap_location_before_p (line_table, use_loc, def_loc))
+  return NULL;
+
+return new macro_use_before_def (use_loc, macro);
+  }
+
+ private:
   /* Ctor.  LOC is the location of the usage.  MACRO is the
  macro that was used.  */
   macro_use_before_def (location_t loc, cpp_hashnode *macro)
@@ -5901,13 +5922,10 @@ class macro_use_before_def : public deferred_diagnostic
 if (is_suppressed_p ())
   return;
 
-source_location def_loc = cpp_macro_definition_location (m_macro);
-if (def_loc != UNKNOWN_LOCATION)
-  {
-   inform (get_location (), "the macro %qs had not yet been defined",
-   (const char *)m_macro->ident.str);
-   inform (def_loc, "it was later defined here");
-  }
+inform (get_location (), "the macro %qs had not yet been defined",
+   (const char *)m_macro->ident.str);
+inform (cpp_macro_definition_location (m_macro),
+   "it was later defined here");
   }
 
  private:
@@ -5990,12 +6008,13 @@ lookup_name_fuzzy (tree name, enum 
lookup_name_fuzzy_kind kind, location_t loc)
 bm.consider ((const char *)best_macro->ident.str);
   else if (bmm.get_best_distance () == 0)
 {
-  /* If we have an exact match for a macro name, then the
-macro has been used before it was defined.  */
+  /* If we have an exact match for a macro name, then either the
+macro was used with the wrong argument count, or the macro
+has been used before it was defined.  */
   cpp_hashnode *macro = bmm.blithely_get_best_candidate ();
   if (macro && (macro->flags & NODE_BUILTIN) == 0)
return name_hint (NULL,
- new macro_use_before_def (loc, macro));
+ macro_use_before_def::maybe_make (loc, macro));
 }
 
   /* Try the "starts_decl_specifier_p" keywords to detect
diff --git a/gcc/testsuite/g++.dg/diagnostic/macro-arg-count.C 
b/gcc/testsuite/g++.dg/diagnostic/macro-arg-count.C
new file mode 100644
index 000..12b2dbd
--- /dev/null
+++ b/gcc/testsuite/g++.dg/diagnostic/macro-arg-count.C
@@ -0,0 +1,51 @@
+// { dg-options "-fdiagnostics-show-caret" }
+
+#define MACRO_1(X,Y)
+void test_1 ()
+{
+  MACRO_1(42); // { dg-line "use_of_MACRO_1" }
+  // { dg-error "macro \"MACRO_1\" requires 2 arguments, but only 1 given" "" 
{ target *-*-* } use_of_MACRO_1 }
+  /* { dg-begin

Re: [PATCH] Don't mark IFUNC resolver as only called directly

2018-04-12 Thread H.J. Lu
On Thu, Apr 12, 2018 at 6:39 AM, H.J. Lu  wrote:
> On Thu, Apr 12, 2018 at 5:17 AM, Jan Hubicka  wrote:
>>> On Thu, Apr 12, 2018 at 1:29 PM, H.J. Lu  wrote:
>>> > Since IFUNC resolver is called indirectly, don't mark IFUNC resolver as
>>> > only called directly.
>>> >
>>> > OK for trunk?
>>> >
>>> >
>>> > H.J.
>>> > ---
>>> > gcc/
>>> >
>>> > PR target/85345
>>> > * cgraph.h: Include "stringpool.h" and "attribs.h".
>>> > (cgraph_node::only_called_directly_or_aliased_p): Return false
>>> > for IFUNC resolver.
>>> >
>>> > gcc/testsuite/
>>> >
>>> > PR target/85345
>>> > * gcc.target/i386/pr85345.c: New test.
>>> > ---
>>> >  gcc/cgraph.h|  5 +++-
>>> >  gcc/testsuite/gcc.target/i386/pr85345.c | 44 
>>> > +
>>> >  2 files changed, 48 insertions(+), 1 deletion(-)
>>> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr85345.c
>>> >
>>> > diff --git a/gcc/cgraph.h b/gcc/cgraph.h
>>> > index d1ef8408497..9e195824fcc 100644
>>> > --- a/gcc/cgraph.h
>>> > +++ b/gcc/cgraph.h
>>> > @@ -24,6 +24,8 @@ along with GCC; see the file COPYING3.  If not see
>>> >  #include "profile-count.h"
>>> >  #include "ipa-ref.h"
>>> >  #include "plugin-api.h"
>>> > +#include "stringpool.h"
>>> > +#include "attribs.h"
>>> >
>>> >  class ipa_opt_pass_d;
>>> >  typedef ipa_opt_pass_d *ipa_opt_pass;
>>> > @@ -2894,7 +2896,8 @@ cgraph_node::only_called_directly_or_aliased_p 
>>> > (void)
>>> >   && !DECL_STATIC_CONSTRUCTOR (decl)
>>> >   && !DECL_STATIC_DESTRUCTOR (decl)
>>> >   && !used_from_object_file_p ()
>>> > - && !externally_visible);
>>> > + && !externally_visible
>>> > + && !lookup_attribute ("ifunc", DECL_ATTRIBUTES (decl)));
>>>
>>> How's it handled for our own generated resolver functions?  That is,
>>> isn't there sth cheaper than doing a lookup_attribute here?  I see
>>> that neither make_dispatcher_decl nor ix86_get_function_versions_dispatcher
>>> adds the 'ifunc' attribute (though they are TREE_PUBLIC there).
>>
>> Is there any drawback of setting force_output flag?
>> Honza
>
> Setting force_output may prevent some optimizations.  Can we add a bit
> for IFUNC resolver?
>

Here is the patch to add ifunc_resolver to cgraph_node. Tested on x86-64
and i686.  Any comments?

Thanks.

-- 
H.J.
From 283a3282d018a40ab550a137a5a2770ce63f4a40 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Wed, 11 Apr 2018 12:31:21 -0700
Subject: [PATCH] Don't mark IFUNC resolver as only called directly

Since IFUNC resolver is called indirectly, don't mark IFUNC resolver as
only called directly.  This patch adds ifunc_resolver to cgraph_node,
sets ifunc_resolver for ifunc attribute and checks ifunc_resolver
instead of looking up ifunc attribute.

gcc/

	PR target/85345
	* cgraph.c (cgraph_node::create): Set ifunc_resolver for ifunc
	attribute.
	(cgraph_node::create_alias): Likewise.
	(cgraph_node::get_availability): Check ifunc_resolver instead
	of looking up ifunc attribute.
	* cgraphunit.c (maybe_diag_incompatible_alias): Likewise.
	* symtab.c (symtab_node::binds_to_current_def_p): Likewise.
	* varasm.c (do_assemble_alias): Likewise.
	(assemble_alias): Likewise.
	(default_binds_local_p_3): Likewise.
	* cgraph.h (cgraph_node): Add ifunc_resolver.
	(cgraph_node::only_called_directly_or_aliased_p): Return false
	for IFUNC resolver.
	* lto-cgraph.c (input_node): Set ifunc_resolver for ifunc
	attribute.

gcc/testsuite/

	PR target/85345
	* gcc.target/i386/pr85345.c: New test.
---
 gcc/cgraph.c|  7 +-
 gcc/cgraph.h|  4 +++
 gcc/cgraphunit.c|  2 +-
 gcc/lto-cgraph.c|  2 ++
 gcc/symtab.c|  4 +--
 gcc/testsuite/gcc.target/i386/pr85345.c | 44 +
 gcc/varasm.c|  8 +++---
 7 files changed, 64 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr85345.c

diff --git a/gcc/cgraph.c b/gcc/cgraph.c
index 9a7d54d7cee..9f3a2929f6b 100644
--- a/gcc/cgraph.c
+++ b/gcc/cgraph.c
@@ -517,6 +517,9 @@ cgraph_node::create (tree decl)
 	g->have_offload = true;
 }
 
+  if (lookup_attribute ("ifunc", DECL_ATTRIBUTES (decl)))
+node->ifunc_resolver = true;
+
   node->register_symbol ();
 
   if (DECL_CONTEXT (decl) && TREE_CODE (DECL_CONTEXT (decl)) == FUNCTION_DECL)
@@ -575,6 +578,8 @@ cgraph_node::create_alias (tree alias, tree target)
   alias_node->alias = true;
   if (lookup_attribute ("weakref", DECL_ATTRIBUTES (alias)) != NULL)
 alias_node->transparent_alias = alias_node->weakref = true;
+  if (lookup_attribute ("ifunc", DECL_ATTRIBUTES (alias)))
+alias_node->ifunc_resolver = true;
   return alias_node;
 }
 
@@ -2299,7 +2304,7 @@ cgraph_node::get_availability (symtab_node *ref)
 avail = AVAIL_AVAILABLE;
   else if (transparent_alias)
 ultimate_alias_target (&avail, ref);
-  else if

Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657, take 2)

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 07:37:22PM +0200, Jakub Jelinek wrote:
> On Thu, Apr 12, 2018 at 05:29:35PM +, Wilco Dijkstra wrote:
> > > Depending on what you mean old, I see e.g. in 2010 power7 mempcpy got 
> > > added,
> > > in 2013 other power versions, in 2016 s390*, etc.  Doing a decent mempcpy
> > > isn't hard if you have asm version of memcpy and one spare register.
> > 
> > More mempcpy implementations have been added in recent years indeed, but 
> > almost all
> > add an extra copy of the memcpy code rather than using a single combined 
> > implementation.
> > That means it is still better to call memcpy (which is frequently used and 
> > thus likely in L1/L2)
> > rather than mempcpy (which is more likely to be cold and thus not cached).
> 
> That really depends, usually when some app uses mempcpy, it uses it very
> heavily.  And all the proposed patches do is honor what the user asked, if
> you use memcpy () + n, we aren't transforming that into mempcpy behind the
> user's back.
> 
> Anyway, here is what I think Richard was asking for, that I'm currently
> bootstrapping/regtesting.  It can be easily combined with Martin's target
> hook if needed, or do it only for
> endp == 1 && target != const0_rtx && CALL_EXPR_TAILCALL (exp)
> etc.
> 
> 2018-04-12  Martin Liska  
>   Jakub Jelinek  
> 
>   PR middle-end/81657
>   * expr.h (enum block_op_methods): Add BLOCK_OP_NO_LIBCALL_RET.
>   * expr.c (emit_block_move_hints): Handle BLOCK_OP_NO_LIBCALL_RET.
>   * builtins.c (expand_builtin_memory_copy_args): Use
>   BLOCK_OP_NO_LIBCALL_RET method for mempcpy with non-ignored target,
>   handle dest_addr == pc_rtx.
> 
>   * gcc.dg/string-opt-1.c: Remove bogus comment.  Expect a mempcpy
>   call.

Successfully bootstrapped/regtested on x86_64-linux and i686-linux.

Jakub


Re: [PATCH] configure.ac: honor --with-gcc-major-version in gcc-driver-name.h (PR jit/85384)

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 04:51:21PM -0400, David Malcolm wrote:
> Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
> 
> OK for trunk?
> 
> config/ChangeLog:
>   PR jit/85384
>   * acx.m4 (GCC_BASE_VER): Remove \$\$ from sed expression.
> 
> gcc/ChangeLog:
>   PR jit/85384
>   * configure.ac (gcc-driver-name.h): Honor --with-gcc-major-version
>   by using gcc_base_ver to generate a gcc_driver_version, and use
>   it when generating GCC_DRIVER_NAME.
>   * configure: Regenerate.

I'd prefer not touching acx.m4 and instead just:
gcc_driver_version=$gcc_BASEVER
if test x$with_gcc_major_version_only = xyes ; then
  gcc_driver_version=`echo $gcc_BASEVER | sed -e 's/^\([0-9]*\).*$/\1/'`
fi
in configure.ac; after all, it is something that is done elsewhere in
configure.ac:
#define GCCPLUGIN_VERSION_MAJOR   `echo $gcc_BASEVER | sed -e 
's/^\([0-9]*\).*$/\1/'`
#define GCCPLUGIN_VERSION_MINOR   `echo $gcc_BASEVER | sed -e 
's/^[0-9]*\.\([0-9]*\).*$/\1/'`
#define GCCPLUGIN_VERSION_PATCHLEVEL   `echo $gcc_BASEVER | sed -e 
's/^[0-9]*\.[0-9]*\.\([0-9]*\)$/\1/'`
and the $ is used in all similar sed patterns.

> --- a/gcc/configure.ac
> +++ b/gcc/configure.ac
> @@ -6499,8 +6499,10 @@ AC_DEFINE_UNQUOTED(DIAGNOSTICS_COLOR_DEFAULT, 
> $DIAGNOSTICS_COLOR_DEFAULT,
>  
>  # Generate gcc-driver-name.h containing GCC_DRIVER_NAME for the benefit
>  # of jit/jit-playback.c.
> +gcc_driver_version=`eval "${get_gcc_base_ver} $srcdir/BASE-VER"`
> +echo "gcc_driver_version: ${gcc_driver_version}"
>  cat > gcc-driver-name.h <<EOF
> -#define GCC_DRIVER_NAME "${target_noncanonical}-gcc-${gcc_BASEVER}${exeext}"
> +#define GCC_DRIVER_NAME 
> "${target_noncanonical}-gcc-${gcc_driver_version}${exeext}"
>  EOF
>  
>  # Check whether --enable-default-pie was given.

Jakub


[PATCH] Fix CSE CLZ/CTZ handling (PR rtl-optimization/85376)

2018-04-12 Thread Jakub Jelinek
Hi!

The following testcase is miscompiled, because due to various disabled
optimization passes we end up with a dead bsf instruction (CTZ) of a
register known to be zero.
fold_rtx uses simplify_unary_operation, which has in this case:
case CTZ:
  if (wi::ne_p (op0, 0))
int_value = wi::ctz (op0);
  else if (! CTZ_DEFINED_VALUE_AT_ZERO (imode, int_value))
int_value = GET_MODE_PRECISION (imode);
  result = wi::shwi (int_value, result_mode);
  break;
x86_64 is a target where CTZ_DEFINED_VALUE_AT_ZERO is false, the instruction
keeps previous value of the destination register, so something pretty
random.  As it is undefined, simplifying it to something random is fine,
except when used the way CSE uses it, by remembering that the value
(const_int 32) is stored in the destination register and optimizing later
code that has (set some_reg (const_int 32)) to that destination register.
Because that destination register contains an indeterminate value, we can't
expect it will be exactly 32.

The following patch lets us punt in these cases.  Bootstrapped/regtested on
x86_64-linux and i686-linux, ok for trunk?

Another option would be to tweak simplify-rtx.c and instead of doing
  else if (! CTZ_DEFINED_VALUE_AT_ZERO (imode, int_value))
int_value = GET_MODE_PRECISION (imode);
do
  else if (! CTZ_DEFINED_VALUE_AT_ZERO (imode, int_value))
return NULL_RTX;
and similarly for CLZ, haven't tested what would break if anything;
we've been doing something like that since r62453 when the
C?Z_DEFINED_VALUE_AT_ZERO macros have been introduced, and before that
actually the same, just unconditionally assumed the value is undefined at 0.

2018-04-12  Jakub Jelinek  

PR rtl-optimization/85376
* cse.c (fold_rtx): For CLZ and CTZ don't try to simplify if
the source is known to be zero and CLZ/CTZ is not defined at zero
for the target.

* gcc.dg/pr85376.c: New test.

--- gcc/cse.c.jj2018-02-12 23:24:47.350482694 +0100
+++ gcc/cse.c   2018-04-12 17:49:32.157664289 +0200
@@ -3322,6 +3322,19 @@ fold_rtx (rtx x, rtx_insn *insn)
&& mode_arg0 == VOIDmode)
  break;
 
+   /* Avoid recording a constant value for CLZ or CTZ if the argument is
+  known to be zero when the operation is undefined for zero on the
+  target.  See PR85376.  */
+   if ((code == CLZ || code == CTZ)
+   && ((const_arg0 ? const_arg0 : folded_arg0) == CONST0_RTX (mode)))
+ {
+   int dummy;
+   scalar_mode imode = GET_MODE_INNER (mode);
+   if ((code == CLZ && !CLZ_DEFINED_VALUE_AT_ZERO (imode, dummy))
+   || (code == CTZ && !CTZ_DEFINED_VALUE_AT_ZERO (imode, dummy)))
+ break;
+ }
+
new_rtx = simplify_unary_operation (code, mode,
const_arg0 ? const_arg0 : 
folded_arg0,
mode_arg0);
--- gcc/testsuite/gcc.dg/pr85376.c.jj   2018-04-12 17:44:41.506370642 +0200
+++ gcc/testsuite/gcc.dg/pr85376.c  2018-04-12 17:45:11.669401115 +0200
@@ -0,0 +1,32 @@
+/* PR rtl-optimization/85376 */
+/* { dg-do run { target int128 } } */
+/* { dg-options "-Og -fno-dce -fgcse -fno-tree-ccp -fno-tree-copy-prop 
-Wno-psabi" } */
+
+typedef unsigned int U __attribute__ ((vector_size (64)));
+typedef unsigned __int128 V __attribute__ ((vector_size (64)));
+unsigned int e, i, l;
+unsigned char f;
+U g, h, k, j;
+
+static inline V
+foo (unsigned char n, unsigned short o, unsigned int p, U q, U r, U s)
+{
+  unsigned int t;
+  o <<= 5;
+  q[7] >>= __builtin_add_overflow (0xfff0, __builtin_ffs (n), &s[5]);
+  t = __builtin_ffs (g[7]);
+  e *= __builtin_sub_overflow (o, t, &f);
+  return f + (V) g + (V) h + (V) q + i + (V) j + (V) s + (V) k + l;
+}
+
+int
+main ()
+{
+  if (__SIZEOF_INT128__ != 16 || __SIZEOF_INT__ != 4 || __CHAR_BIT__ != 8)
+return 0;
+  V x = foo (0, 1, 5, (U) { }, (U) { }, (U) { });
+  for (unsigned i = 0; i < 4; i++)
+if ((unsigned int) x[i] != 0x20)
+  __builtin_abort ();
+  return 0;
+}

Jakub


[PATCH] Fix -fsanitize=address VLA instrumentation (PR sanitizer/85230)

2018-04-12 Thread Jakub Jelinek
Hi!

As mentioned in the PR, we need to unpoison the red zones when leaving a
scope with VLA variable(s); this is done through __asan_allocas_unpoison
call, unfortunately it is called after the __builtin_stack_restore which
restores the stack pointer; now, if an interrupt comes in between the stack
restore and the __asan_allocas_unpoison call, the interrupt handler might
have some stack bytes marked as red zones in the shadow memory and might
diagnose sanitizing error even when there is none in the original program.

The following patch ought to fix this by swapping the two calls, so we first
unpoison and only after it is unpoisoned in shadow memory release the stack.
The second argument to the __asan_allocas_unpoison call is meant to
be virtual_dynamic_stack_rtx after the __builtin_stack_restore, i.e. the new
stack_pointer_rtx value + STACK_DYNAMIC_OFFSET (current_function_decl).
As the STACK_DYNAMIC_OFFSET value isn't known until the vregs pass, the code
used a hack where it ignored the second argument and replaced it by
virtual_dynamic_stack_rtx.  With the asan.c change below this doesn't work
anymore, because virtual_dynamic_stack_rtx aka stack_pointer_rtx +
STACK_DYNAMIC_OFFSET (current_function_decl) before the
__builtin_stack_restore is a different value.  The patch instead uses the
argument passed to the __asan_allocas_unpoison at GIMPLE time, which is the
same as passed to __builtin_stack_restore; this is the new stack_pointer_rtx
value after __builtin_stack_restore.  And, because we don't want that value,
but that + STACK_DYNAMIC_OFFSET (current_function_decl), we compute
arg1 + (virtual_dynamic_stack_rtx - stack_pointer_rtx) and let CSE/combiner
optimize it into arg1 (on targets like x86_64 where STACK_DYNAMIC_OFFSET can
be even 0 when not accumulating outgoing args or when that size is 0) or
arg1 + some_constant.

Bootstrapped on
{x86_64,i686,powerpc64,powerpc64le,aarch64,s390x,armv7hl}-linux, regtested
on {x86_64,i686,powerpc64,powerpc64le}-linux so far, but on the power* ones
on virtual address space size that isn't really supported (likely
https://github.com/google/sanitizers/issues/933#issuecomment-380058705
issue, so while nothing regresses there, pretty much all asan tests fail
there before and after the patch); also tested successfully with
asan.exp=alloca* on gcc110 and gcc112 on compile farm where it doesn't
suffer from the VA issue.  Ok if testing passes also on aarch64, s390x
and armv7hl?

2018-04-12  Jakub Jelinek  

PR sanitizer/85230
* asan.c (handle_builtin_stack_restore): Adjust comment.  Emit
__asan_allocas_unpoison call and last_alloca_addr = new_sp before
__builtin_stack_restore rather than after it.
* builtins.c (expand_asan_emit_allocas_unpoison): Pass
arg1 + (virtual_dynamic_stack_rtx - stack_pointer_rtx) as second
argument instead of virtual_dynamic_stack_rtx.

--- gcc/asan.c.jj   2018-01-09 21:53:38.821577722 +0100
+++ gcc/asan.c  2018-04-12 13:22:59.166095523 +0200
@@ -554,14 +554,14 @@ get_last_alloca_addr ()
   return last_alloca_addr;
 }
 
-/* Insert __asan_allocas_unpoison (top, bottom) call after
+/* Insert __asan_allocas_unpoison (top, bottom) call before
__builtin_stack_restore (new_sp) call.
The pseudocode of this routine should look like this:
- __builtin_stack_restore (new_sp);
  top = last_alloca_addr;
  bot = new_sp;
  __asan_allocas_unpoison (top, bot);
  last_alloca_addr = new_sp;
+ __builtin_stack_restore (new_sp);
In general, we can't use new_sp as bot parameter because on some
architectures SP has non zero offset from dynamic stack area.  Moreover, on
some architectures this offset (STACK_DYNAMIC_OFFSET) becomes known for each
@@ -570,9 +570,8 @@ get_last_alloca_addr ()
http://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi.html#DYNAM-STACK.
To overcome the issue we use following trick: pass new_sp as a second
parameter to __asan_allocas_unpoison and rewrite it during expansion with
-   virtual_dynamic_stack_rtx later in expand_asan_emit_allocas_unpoison
-   function.
-*/
+   new_sp + (virtual_dynamic_stack_rtx - sp) later in
+   expand_asan_emit_allocas_unpoison function.  */
 
 static void
 handle_builtin_stack_restore (gcall *call, gimple_stmt_iterator *iter)
@@ -584,9 +583,9 @@ handle_builtin_stack_restore (gcall *cal
   tree restored_stack = gimple_call_arg (call, 0);
   tree fn = builtin_decl_implicit (BUILT_IN_ASAN_ALLOCAS_UNPOISON);
   gimple *g = gimple_build_call (fn, 2, last_alloca, restored_stack);
-  gsi_insert_after (iter, g, GSI_NEW_STMT);
+  gsi_insert_before (iter, g, GSI_SAME_STMT);
   g = gimple_build_assign (last_alloca, restored_stack);
-  gsi_insert_after (iter, g, GSI_NEW_STMT);
+  gsi_insert_before (iter, g, GSI_SAME_STMT);
 }
 
 /* Deploy and poison redzones around __builtin_alloca call.  To do this, we
--- gcc/builtins.c.jj   2018-04-04 21:33:20.530639395 +0200
+++ gcc/builtins.c  2018-04-12 13:35:34.32839

Re: [patch, fortran] Remove parallell annotation from DO CONCURRENT

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 11:14:45PM +0200, Thomas Koenig wrote:
> 2018-04-12  Thomas Koenig  
> 
> PR fortran/83064
> PR testsuite/85346
> * trans-stmt.c (gfc_trans_forall_loop): Use annot_expr_ivdep_kind
> for annotation and remove dependence on -ftree-parallelize-loops.
> 
> 2018-04-12  Thomas Koenig  
> 
> PR fortran/83064
> PR testsuite/85346
> * gfortran.dg/do_concurrent_5.f90: Dynamically allocate main work
> array and move test to libgomp/testsuite/libgomp.fortran.
> * gfortran.dg/do_concurrent_6.f90: New test.
> 
> 2018-04-12  Thomas Koenig  
> 
> PR fortran/83064
> PR testsuite/85346
> * testsuite/libgomp.fortran: Move modified test from gfortran.dg
> to here.

Please use full filename here, like:
* testsuite/libgomp.fortran/do_concurrent_5.f90: New test, moved
from gfortran.dg.  Make edof array allocatable.

Ok with that change.

> Index: trans-stmt.c
> ===
> --- trans-stmt.c  (Revision 259326)
> +++ trans-stmt.c  (Arbeitskopie)
> @@ -3643,12 +3643,12 @@ gfc_trans_forall_loop (forall_info *forall_tmp, tr
>cond = fold_build2_loc (input_location, LE_EXPR, logical_type_node,
> count, build_int_cst (TREE_TYPE (count), 0));
>  
> -  /* PR 83064 means that we cannot use the annotation if the
> -  autoparallelizer is active.  */
> -  if (forall_tmp->do_concurrent && ! flag_tree_parallelize_loops)
> +  /* PR 83064 means that we cannot use annot_expr_parallel_kind until
> +   the autoparallelizer can handle this.  */
> +  if (forall_tmp->do_concurrent)
>   cond = build3 (ANNOTATE_EXPR, TREE_TYPE (cond), cond,
>  build_int_cst (integer_type_node,
> -   annot_expr_parallel_kind),
> +   annot_expr_ivdep_kind),
>  integer_zero_node);
>  
>tmp = build1_v (GOTO_EXPR, exit_label);

> ! { dg-do  run }
> ! PR 83064 - this used to give wrong results.
> ! { dg-additional-options "-O1 -ftree-parallelize-loops=2" }
> ! Original test case by Christian Felter
> 
> program main
> use, intrinsic :: iso_fortran_env
> implicit none
> 
> integer, parameter :: nsplit = 4
> integer(int64), parameter :: ne = 2**20
> integer(int64) :: stride, low(nsplit), high(nsplit), i
> integer(int64), dimension(:), allocatable :: edof
> real(real64), dimension(nsplit) :: pi
> 
> allocate (edof(ne))
> edof(1::4) = 1
> edof(2::4) = 2
> edof(3::4) = 3
> edof(4::4) = 4
> 
> stride = ceiling(real(ne)/nsplit)
> do i = 1, nsplit
> high(i) = stride*i
> end do
> do i = 2, nsplit
> low(i) = high(i-1) + 1
> end do
> low(1) = 1
> high(nsplit) = ne
> 
> pi = 0
> do concurrent (i = 1:nsplit)
> pi(i) = sum(compute( low(i), high(i) ))
> end do
> if (abs (sum(pi) - atan(1.0d0)) > 1e-5) STOP 1
> 
> contains
> 
> pure function compute( low, high ) result( ttt )
> integer(int64), intent(in) :: low, high
> real(real64), dimension(nsplit) :: ttt
> integer(int64) :: j, k
> 
> ttt = 0
> 
> ! Unrolled loop
> ! do j = low, high, 4
> ! k = 1
> ! ttt(k) = ttt(k) + (-1)**(j+1) / real( 2*j-1 )   
>  
> ! k = 2
> ! ttt(k) = ttt(k) + (-1)**(j+2) / real( 2*j+1 )   
>  
> ! k = 3
> ! ttt(k) = ttt(k) + (-1)**(j+3) / real( 2*j+3 )   
>  
> ! k = 4
> ! ttt(k) = ttt(k) + (-1)**(j+4) / real( 2*j+5 )   
>  
> ! end do
> 
> ! Loop with modulo operation
> ! do j = low, high
> ! k = mod( j, nsplit ) + 1
> ! ttt(k) = ttt(k) + (-1)**(j+1) / real( 2*j-1 )   
>  
> ! end do
> 
> ! Loop with subscripting via host association
> do j = low, high
> k = edof(j)
> ttt(k) = ttt(k) + (-1.0_real64)**(j+1) / real( 2*j-1 )
> 
> end do
> end function
> 
> end program main

> ! { dg-do compile }
> ! { dg-additional-options "-fdump-tree-original" }
> 
> program main
>   real, dimension(100) :: a,b
>   call random_number(a)
>   do concurrent (i=1:100)
>  b(i) = a(i)*a(i)
>   end do
>   print *,sum(a)
> end program main
> 
> ! { dg-final { scan-tree-dump-times "ivdep" 1 "original" } }


Jakub


Re: [patch, fortran] Remove parallell annotation from DO CONCURRENT

2018-04-12 Thread Thomas Koenig

Well, here's a variation which actually passes regression-test.

Seems I implicitly believed that the implicit save on main program
variables actually works... well, it turns out that it doesn't,
which is now PR85364.

OK for trunk?

Thomas

2018-04-12  Thomas Koenig  

PR fortran/83064
PR testsuite/85346
* trans-stmt.c (gfc_trans_forall_loop): Use annot_expr_ivdep_kind
for annotation and remove dependence on -ftree-parallelize-loops.

2018-04-12  Thomas Koenig  

PR fortran/83064
PR testsuite/85346
* gfortran.dg/do_concurrent_5.f90: Dynamically allocate main work
array and move test to libgomp/testsuite/libgomp.fortran.
* gfortran.dg/do_concurrent_6.f90: New test.

2018-04-12  Thomas Koenig  

PR fortran/83064
PR testsuite/85346
* testsuite/libgomp.fortran: Move modified test from gfortran.dg
to here.
Index: trans-stmt.c
===
--- trans-stmt.c	(Revision 259326)
+++ trans-stmt.c	(Arbeitskopie)
@@ -3643,12 +3643,12 @@ gfc_trans_forall_loop (forall_info *forall_tmp, tr
   cond = fold_build2_loc (input_location, LE_EXPR, logical_type_node,
 			  count, build_int_cst (TREE_TYPE (count), 0));
 
-  /* PR 83064 means that we cannot use the annotation if the
-	 autoparallelizer is active.  */
-  if (forall_tmp->do_concurrent && ! flag_tree_parallelize_loops)
+  /* PR 83064 means that we cannot use annot_expr_parallel_kind until
+   the autoparallelizer can handle this.  */
+  if (forall_tmp->do_concurrent)
 	cond = build3 (ANNOTATE_EXPR, TREE_TYPE (cond), cond,
 		   build_int_cst (integer_type_node,
-  annot_expr_parallel_kind),
+  annot_expr_ivdep_kind),
 		   integer_zero_node);
 
   tmp = build1_v (GOTO_EXPR, exit_label);
! { dg-do  run }
! PR 83064 - this used to give wrong results.
! { dg-additional-options "-O1 -ftree-parallelize-loops=2" }
! Original test case by Christian Felter

program main
use, intrinsic :: iso_fortran_env
implicit none

integer, parameter :: nsplit = 4
integer(int64), parameter :: ne = 2**20
integer(int64) :: stride, low(nsplit), high(nsplit), i
integer(int64), dimension(:), allocatable :: edof
real(real64), dimension(nsplit) :: pi

allocate (edof(ne))
edof(1::4) = 1
edof(2::4) = 2
edof(3::4) = 3
edof(4::4) = 4

stride = ceiling(real(ne)/nsplit)
do i = 1, nsplit
high(i) = stride*i
end do
do i = 2, nsplit
low(i) = high(i-1) + 1
end do
low(1) = 1
high(nsplit) = ne

pi = 0
do concurrent (i = 1:nsplit)
pi(i) = sum(compute( low(i), high(i) ))
end do
if (abs (sum(pi) - atan(1.0d0)) > 1e-5) STOP 1

contains

pure function compute( low, high ) result( ttt )
integer(int64), intent(in) :: low, high
real(real64), dimension(nsplit) :: ttt
integer(int64) :: j, k

ttt = 0

! Unrolled loop
! do j = low, high, 4
! k = 1
! ttt(k) = ttt(k) + (-1)**(j+1) / real( 2*j-1 )
! k = 2
! ttt(k) = ttt(k) + (-1)**(j+2) / real( 2*j+1 )
! k = 3
! ttt(k) = ttt(k) + (-1)**(j+3) / real( 2*j+3 )
! k = 4
! ttt(k) = ttt(k) + (-1)**(j+4) / real( 2*j+5 )
! end do

! Loop with modulo operation
! do j = low, high
! k = mod( j, nsplit ) + 1
! ttt(k) = ttt(k) + (-1)**(j+1) / real( 2*j-1 )
! end do

! Loop with subscripting via host association
do j = low, high
k = edof(j)
ttt(k) = ttt(k) + (-1.0_real64)**(j+1) / real( 2*j-1 )
end do
end function

end program main
! { dg-do compile }
! { dg-additional-options "-fdump-tree-original" }

program main
  real, dimension(100) :: a,b
  call random_number(a)
  do concurrent (i=1:100)
 b(i) = a(i)*a(i)
  end do
  print *,sum(a)
end program main

! { dg-final { scan-tree-dump-times "ivdep" 1 "original" } }


Re: [PATCH] PR libstdc++/85222 allow catching iostream errors as gcc4-compatible ios::failure

2018-04-12 Thread Jonathan Wakely
This fixes some comments with misspelled files and classes.
Committed to trunk and gcc-7-branch.

It occurred to me that the name of the new __ios_failure type is
visible in the verbose terminate handler messages:

terminate called after throwing an instance of 'std::__ios_failure'
  what():  basic_filebuf::underflow error reading the file: Is a directory
Aborted (core dumped)

And that there's no need for this type to use a reserved name. Users
can't refer to it, or define macros that affect it (because it's never
exposed in headers).

So we could call it something else, like std::ios_failure rather than
std::__ios_failure.

Anybody got a preference they want to argue for?
commit a28bcba2a812d4eac6da8ce86907b670361a09a6
Author: Jonathan Wakely 
Date:   Thu Apr 12 21:28:38 2018 +0100

Fix comments that misspell names of files and classes

* src/c++11/Makefile.am: Fix comment.
* src/c++11/Makefile.in: Regenerate.
* src/c++11/cxx11-ios_failure.cc: Fix comment.
* src/c++98/ios_failure.cc: Likewise.

diff --git a/libstdc++-v3/src/c++11/Makefile.am 
b/libstdc++-v3/src/c++11/Makefile.am
index 6f49f0d55d3..8d524b67232 100644
--- a/libstdc++-v3/src/c++11/Makefile.am
+++ b/libstdc++-v3/src/c++11/Makefile.am
@@ -127,7 +127,7 @@ hashtable_c++0x.o: hashtable_c++0x.cc
$(CXXCOMPILE) -fimplicit-templates -c $<
 
 if ENABLE_DUAL_ABI
-# Rewrite the type info for __dual_abi_ios_failure.
+# Rewrite the type info for __ios_failure.
 rewrite_ios_failure_typeinfo = sed -e '/^_ZTISt13__ios_failure:$$/{' \
-e 'n' \
-e 
's/_ZTVN10__cxxabiv120__si_class_type_infoE/_ZTVSt19__iosfail_type_info/' \
diff --git a/libstdc++-v3/src/c++11/cxx11-ios_failure.cc 
b/libstdc++-v3/src/c++11/cxx11-ios_failure.cc
index 847b5946234..b1e4bfb2b44 100644
--- a/libstdc++-v3/src/c++11/cxx11-ios_failure.cc
+++ b/libstdc++-v3/src/c++11/cxx11-ios_failure.cc
@@ -140,7 +140,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   // __ios_failure can be upcast to the type in a catch handler.
   bool
   __iosfail_type_info::__do_upcast(const __class_type_info *dst_type,
-   void **obj_ptr) const
+  void **obj_ptr) const
   {
 // If the handler is for the gcc4-compatible ios::failure type then
 // catch the object stored in __ios_failure::buf instead of
@@ -150,7 +150,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*obj_ptr = static_cast<__ios_failure*>(*obj_ptr)->buf;
return true;
   }
-// Otherwise proceeed as normal to see if the handler matches.
+// Otherwise proceed as normal to see if the handler matches.
 return __class_type_info::__do_upcast(dst_type, obj_ptr);
   }
 #else // ! __cpp_rtti
diff --git a/libstdc++-v3/src/c++98/ios_failure.cc 
b/libstdc++-v3/src/c++98/ios_failure.cc
index a2fc5593e15..49d24f49620 100644
--- a/libstdc++-v3/src/c++98/ios_failure.cc
+++ b/libstdc++-v3/src/c++98/ios_failure.cc
@@ -57,7 +57,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 #if _GLIBCXX_USE_DUAL_ABI
   // When the dual ABI is enabled __throw_ios_failure() is defined in
-  // src/c++11/ios_failure.cc
+  // src/c++11/cxx11-ios_failure.cc
 #if __cpp_rtti
   // If RTTI is enabled the exception type thrown will use these functions to
   // construct/destroy a gcc4-compatible ios::failure object in a buffer,


[PATCH] avoid duplicate warning for strcmp with a nonstring (PR 85359)

2018-04-12 Thread Martin Sebor

The attached patch makes a small tweak to avoid issuing a duplicate
warning for calls to strcmp with a nonstring argument.  The most
onerous part of this was figuring out how to test for the absence
of duplicate warnings.  The "hack" I used (dg-regexp) is in place
until a more straightforward solution becomes available.  (David
Malcolm has something planned for GCC 9.)

Martin
PR middle-end/85359 - duplicate -Wstringop-overflow for a strcmp call with a nonstring pointer

gcc/ChangeLog:

	PR middle-end/85359
	* builtins.c (expand_builtin_strcmp): Take care to avoid issuing
	a duplicate warning.

gcc/testsuite/ChangeLog:

	PR middle-end/85359
	* gcc.dg/attr-nonstring.c: New test.
Index: gcc/builtins.c
===
--- gcc/builtins.c	(revision 259298)
+++ gcc/builtins.c	(working copy)
@@ -4570,14 +4570,15 @@ expand_builtin_strcmp (tree exp, ATTRIBUTE_UNUSED
 	}
 }
 
-  /* Check to see if the argument was declared attribute nonstring
- and if so, issue a warning since at this point it's not known
- to be nul-terminated.  */
   tree fndecl = get_callee_fndecl (exp);
-  maybe_warn_nonstring_arg (fndecl, exp);
-
   if (result)
 {
+  /* Check to see if the argument was declared attribute nonstring
+	 and if so, issue a warning since at this point it's not known
+	 to be nul-terminated.  Avoid doing this when RESULT is false
+	 and let expand_call() do it.  */
+  maybe_warn_nonstring_arg (fndecl, exp);
+
   /* Return the value in the proper mode for this function.  */
   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
   if (GET_MODE (result) == mode)
Index: gcc/testsuite/gcc.dg/attr-nonstring.c
===
--- gcc/testsuite/gcc.dg/attr-nonstring.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/attr-nonstring.c	(working copy)
@@ -0,0 +1,58 @@
+/* PR middle-end/85359 - duplicate -Wstringop-overflow for a strcmp call
+   with a nonstring pointer
+   { dg-do compile }
+   { dg-options "-O2 -Wall" } */
+
+extern char* strchr (const char*, int);
+extern char* strrchr (const char*, int);
+extern char* stpcpy (char*, const char*);
+extern char* strcpy (char*, const char*);
+extern int strcmp (const char*, const char*);
+extern char* strstr (const char*, const char*);
+
+#define NONSTRING __attribute__ ((nonstring))
+
+int strcmp_nonstring_1 (NONSTRING const char *a, const char *b)
+{
+  /* dg-warning matches one or more instances of the warning so it's
+ no good on its own.  Use dg-regexp instead to verify that just
+ one instance of the warning is issued.  See gcc.dg/pr64223-1
+ for a different approach.  */
+  return strcmp (a, b);  /* { dg-regexp "\[^\n\r\]+: warning: .strcmp. argument 1 declared attribute .nonstring. \\\[-Wstringop-overflow=]" } */
+}
+
+int strcmp_nonstring_2 (const char *a, NONSTRING const char *b)
+{
+  return strcmp (a, b);  /* { dg-regexp "\[^\n\r\]+: warning: .strcmp. argument 2 declared attribute .nonstring. \\\[-Wstringop-overflow=]" } */
+}
+
+
+char* stpcpy_nonstring (char *a, NONSTRING const char *b)
+{
+  return stpcpy (a, b);  /* { dg-regexp "\[^\n\r\]+: warning: .stpcpy. argument 2 declared attribute .nonstring. \\\[-Wstringop-overflow=]" } */
+}
+
+char* strchr_nonstring (NONSTRING const char *s, int c)
+{
+  return strchr (s, c);  /* { dg-regexp "\[^\n\r\]+: warning: .strchr. argument 1 declared attribute .nonstring. \\\[-Wstringop-overflow=]" } */
+}
+
+char* strrchr_nonstring (NONSTRING const char *s, int c)
+{
+  return strrchr (s, c);  /* { dg-regexp "\[^\n\r\]+: warning: .strrchr. argument 1 declared attribute .nonstring. \\\[-Wstringop-overflow=]" } */
+}
+
+char* strcpy_nonstring (char *a, NONSTRING const char *b)
+{
+  return strcpy (a, b);  /* { dg-regexp "\[^\n\r\]+: warning: .strcpy. argument 2 declared attribute .nonstring. \\\[-Wstringop-overflow=]" } */
+}
+
+char* strstr_nonstring_1 (NONSTRING const char *a, const char *b)
+{
+  return strstr (a, b);  /* { dg-regexp "\[^\n\r\]+: warning: .strstr. argument 1 declared attribute .nonstring. \\\[-Wstringop-overflow=]" } */
+}
+
+char* strstr_nonstring_2 (const char *a, NONSTRING const char *b)
+{
+  return strstr (a, b);  /* { dg-regexp "\[^\n\r\]+: warning: .strstr. argument 2 declared attribute .nonstring. \\\[-Wstringop-overflow=]" } */
+}


[PATCH] configure.ac: honor --with-gcc-major-version in gcc-driver-name.h (PR jit/85384)

2018-04-12 Thread David Malcolm
This patch updates gcc/configure.ac to use gcc_base_ver.

I had to drop the \$\$ from the sed expression to get it to work
within the configure script; I'm not entirely sure what their purpose
is.  Without them, it's still matching on the first group of numeric
characters in BASE-VER.

Tested with and without --with-gcc-major-version; in each case,
gcc-driver-name.h is correctly determined.

Fixes the linker issue reported downstream in
  https://bugzilla.redhat.com/show_bug.cgi?id=1566178
and fixes the driver not found issue with:
  gcc_jit_context_set_bool_use_external_driver (ctxt, 1);

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.

OK for trunk?

config/ChangeLog:
PR jit/85384
* acx.m4 (GCC_BASE_VER): Remove \$\$ from sed expression.

gcc/ChangeLog:
PR jit/85384
* configure.ac (gcc-driver-name.h): Honor --with-gcc-major-version
by using gcc_base_ver to generate a gcc_driver_version, and use
it when generating GCC_DRIVER_NAME.
* configure: Regenerate.
---
 config/acx.m4| 2 +-
 gcc/configure.ac | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/config/acx.m4 b/config/acx.m4
index aa1d34b..87c1b5e 100644
--- a/config/acx.m4
+++ b/config/acx.m4
@@ -246,7 +246,7 @@ AC_DEFUN([GCC_BASE_VER],
   [AS_HELP_STRING([--with-gcc-major-version-only], [use only GCC major number 
in filesystem paths])],
   [if test x$with_gcc_major_version_only = xyes ; then
 changequote(,)dnl
-get_gcc_base_ver="sed -e 's/^\([0-9]*\).*\$\$/\1/'"
+get_gcc_base_ver="sed -e 's/^\([0-9]*\).*/\1/'"
 changequote([,])dnl
   fi
   ])
diff --git a/gcc/configure.ac b/gcc/configure.ac
index 67e1682..b066cc6 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -6499,8 +6499,10 @@ AC_DEFINE_UNQUOTED(DIAGNOSTICS_COLOR_DEFAULT, 
$DIAGNOSTICS_COLOR_DEFAULT,
 
 # Generate gcc-driver-name.h containing GCC_DRIVER_NAME for the benefit
 # of jit/jit-playback.c.
+gcc_driver_version=`eval "${get_gcc_base_ver} $srcdir/BASE-VER"`
+echo "gcc_driver_version: ${gcc_driver_version}"
 cat > gcc-driver-name.h <

Re: [PATCH] Handle empty infinite loops in OpenACC for PR84955

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 11:39:43AM -0700, Cesar Philippidis wrote:
> Strange. I didn't observe any regressions when I tested it. But, then
> again, I was testing against revision
> 
> r259092 | jason | 2018-04-04 09:42:55 -0700 (Wed, 04 Apr 2018) | 4 lines
> 
> which is over a week old. I'll revert that patch for now, and revisit
> this issue in stage1.

You should have kept the omp-expand.c chunk, that is correct and shouldn't
cause issues.

Jakub


C++ PATCH for c++/85356, C++17 ICE with pointer to member function in template

2018-04-12 Thread Jason Merrill
We weren't instantiating exception-specifications when a template
referred to them, but that won't fly in the C++17 world where they're
part of the type, so we need to resolve them to do overload resolution
for non-dependent expressions.

The change to check_redeclaration_exception_specification is necessary
because type_dependent_expression_p (fn) will fail for a dependent
new_decl, because it doesn't have DECL_TEMPLATE_INFO yet.

Tested x86_64-pc-linux-gnu, applying to trunk.
commit e68e003bf7c837312bab52de2195ef4707150a3a
Author: Jason Merrill 
Date:   Thu Apr 12 07:45:03 2018 -0400

PR c++/85356 - ICE with pointer to member function.

* pt.c (maybe_instantiate_noexcept): Do instantiate in templates if
flag_noexcept_type.  Build the new spec within the function context.
* except.c (build_noexcept_spec): Do get constant value in templates
if flag_noexcept_type.
* decl.c (check_redeclaration_exception_specification): Don't
instantiate noexcept on a dependent declaration.

diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 44a152bd195..9f1a171ead7 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -1232,8 +1232,11 @@ check_redeclaration_exception_specification (tree new_decl,
   && UNEVALUATED_NOEXCEPT_SPEC_P (old_exceptions))
 return;
 
-  maybe_instantiate_noexcept (new_decl);
-  maybe_instantiate_noexcept (old_decl);
+  if (!type_dependent_expression_p (old_decl))
+{
+  maybe_instantiate_noexcept (new_decl);
+  maybe_instantiate_noexcept (old_decl);
+}
   new_exceptions = TYPE_RAISES_EXCEPTIONS (TREE_TYPE (new_decl));
   old_exceptions = TYPE_RAISES_EXCEPTIONS (TREE_TYPE (old_decl));
 
diff --git a/gcc/cp/except.c b/gcc/cp/except.c
index 0b46698b974..6dab6d6bd96 100644
--- a/gcc/cp/except.c
+++ b/gcc/cp/except.c
@@ -1194,11 +1194,14 @@ build_noexcept_spec (tree expr, int complain)
 {
   /* This isn't part of the signature, so don't bother trying to evaluate
  it until instantiation.  */
-  if (!processing_template_decl && TREE_CODE (expr) != DEFERRED_NOEXCEPT)
+  if (TREE_CODE (expr) != DEFERRED_NOEXCEPT
+  && (!processing_template_decl
+	  || (flag_noexcept_type && !value_dependent_expression_p (expr
 {
   expr = perform_implicit_conversion_flags (boolean_type_node, expr,
 		complain,
 		LOOKUP_NORMAL);
+  expr = instantiate_non_dependent_expr (expr);
   expr = cxx_constant_value (expr);
 }
   if (TREE_CODE (expr) == INTEGER_CST)
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index 76e546cdeaa..da8a5264d33 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -23234,7 +23234,8 @@ maybe_instantiate_noexcept (tree fn, tsubst_flags_t complain)
   tree fntype, spec, noex, clone;
 
   /* Don't instantiate a noexcept-specification from template context.  */
-  if (processing_template_decl)
+  if (processing_template_decl
+  && (!flag_noexcept_type || type_dependent_expression_p (fn)))
 return true;
 
   if (DECL_CLONED_FUNCTION_P (fn))
@@ -23273,10 +23274,10 @@ maybe_instantiate_noexcept (tree fn, tsubst_flags_t complain)
 	tf_warning_or_error, fn,
 	/*function_p=*/false,
 	/*integral_constant_expression_p=*/true);
+	  spec = build_noexcept_spec (noex, tf_warning_or_error);
 	  pop_deferring_access_checks ();
 	  pop_access_scope (fn);
 	  pop_tinst_level ();
-	  spec = build_noexcept_spec (noex, tf_warning_or_error);
 	  if (spec == error_mark_node)
 	spec = noexcept_false_spec;
 	}
diff --git a/gcc/testsuite/g++.dg/template/mem_func_ptr2.C b/gcc/testsuite/g++.dg/template/mem_func_ptr2.C
new file mode 100644
index 000..9ceabd3642b
--- /dev/null
+++ b/gcc/testsuite/g++.dg/template/mem_func_ptr2.C
@@ -0,0 +1,13 @@
+// PR c++/85356
+
+struct A
+{
+  A& operator=(int);
+};
+
+void foo(A&(A::*)(int));
+
+template void bar()
+{
+  foo(&A::operator=);
+}


Re: C++ PATCH for c++/85258, ICE with invalid range-based for-loop

2018-04-12 Thread Jason Merrill
OK.

On Thu, Apr 12, 2018 at 1:47 PM, Marek Polacek  wrote:
> This is a crash on invalid which started when we changed 
> decl_maybe_constant_var_p
> to say true for references.  Then in tsubst_copy we take this branch:
>   if (decl_maybe_constant_var_p (r))
> {
>   /* We can't call cp_finish_decl, so handle the
>  initializer by hand.  */
>   tree init = tsubst_init (DECL_INITIAL (t), r, args,
>complain, in_decl);
> but tsubst_init can return NULL_TREE, which potential_constant_expression
> knows how to handle, but reduced_constant_expression_p didn't.  So the
> following patch will fix the ICE.
>
> Bootstrapped/regtested on x86_64-linux, ok for trunk?
>
> 2018-04-12  Marek Polacek  
>
> PR c++/85258
> * constexpr.c (reduced_constant_expression_p): Return false for null
> trees.
>
> * g++.dg/parse/error61.C: New test.
>
> diff --git gcc/cp/constexpr.c gcc/cp/constexpr.c
> index 75f56df4465..82f14baaefd 100644
> --- gcc/cp/constexpr.c
> +++ gcc/cp/constexpr.c
> @@ -1773,6 +1773,9 @@ cxx_eval_call_expression (const constexpr_ctx *ctx, 
> tree t,
>  bool
>  reduced_constant_expression_p (tree t)
>  {
> +  if (t == NULL_TREE)
> +return false;
> +
>switch (TREE_CODE (t))
>  {
>  case PTRMEM_CST:
> @@ -1794,9 +1797,8 @@ reduced_constant_expression_p (tree t)
> field = NULL_TREE;
>FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (t), i, idx, val)
> {
> - if (!val)
> -   /* We're in the middle of initializing this element.  */
> -   return false;
> + /* If VAL is null, we're in the middle of initializing this
> +element.  */
>   if (!reduced_constant_expression_p (val))
> return false;
>   if (field)
> diff --git gcc/testsuite/g++.dg/parse/error61.C 
> gcc/testsuite/g++.dg/parse/error61.C
> index e69de29bb2d..199e1aa721c 100644
> --- gcc/testsuite/g++.dg/parse/error61.C
> +++ gcc/testsuite/g++.dg/parse/error61.C
> @@ -0,0 +1,14 @@
> +// PR c++/85258
> +// { dg-do compile { target c++11 } }
> +
> +template<int> void foo()
> +{
> +  int x[8];
> +  for (int& i, j : x) // { dg-error "multiple" }
> +i = 0; // { dg-error "local variable" }
> +}
> +
> +void bar()
> +{
> +  foo<0>();
> +}
>
> Marek


[PATCH] rs6000: Fix an ICE with -mno-direct-move (PR85291)

2018-04-12 Thread Segher Boessenkool
This fixes an ICE with -mno-direct-move.

Tested etc.; committing.


Segher


2018-04-12  Segher Boessenkool  

* config/rs6000/rs6000.md (fix_trunc<mode>si2): Use legacy code if
asked to not generate direct moves.
(fix_trunc<mode>si2_stfiwx): Similar.
(fix_trunc<mode>si2_internal): Similar.

---
 gcc/config/rs6000/rs6000.md | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 25ac0b8..de652fa 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -5591,7 +5591,7 @@ (define_expand "fix_truncsi2"
(fix:SI (match_operand:SFDF 1 "gpc_reg_operand")))]
   "TARGET_HARD_FLOAT && "
 {
-  if (!TARGET_P8_VECTOR)
+  if (!(TARGET_P8_VECTOR && TARGET_DIRECT_MOVE))
 {
   rtx src = force_reg (mode, operands[1]);
 
@@ -5618,7 +5618,7 @@ (define_insn_and_split "fix_truncsi2_stfiwx"
   "TARGET_HARD_FLOAT && TARGET_DOUBLE_FLOAT
&& (mode != SFmode || TARGET_SINGLE_FLOAT)
&& TARGET_STFIWX && can_create_pseudo_p ()
-   && !TARGET_P8_VECTOR"
+   && !(TARGET_P8_VECTOR && TARGET_DIRECT_MOVE)"
   "#"
   ""
   [(pc)]
@@ -5659,7 +5659,8 @@ (define_insn_and_split "fix_truncsi2_internal"
(fix:SI (match_operand:SFDF 1 "gpc_reg_operand" "d,")))
(clobber (match_operand:DI 2 "gpc_reg_operand" "=1,d"))
(clobber (match_operand:DI 3 "offsettable_mem_operand" "=o,o"))]
-  "TARGET_HARD_FLOAT && TARGET_DOUBLE_FLOAT && !TARGET_P8_VECTOR"
+  "TARGET_HARD_FLOAT && TARGET_DOUBLE_FLOAT
+   && !(TARGET_P8_VECTOR && TARGET_DIRECT_MOVE)"
   "#"
   ""
   [(pc)]
-- 
1.8.3.1



Re: [PATCH] libgcc/CET: Skip signal frames when unwinding shadow stack

2018-04-12 Thread H.J. Lu
On Wed, Apr 11, 2018 at 3:37 AM, H.J. Lu  wrote:
> When -fcf-protection -mcet is used, I got
>
> FAIL: g++.dg/eh/sighandle.C
>
> (gdb) bt
>  #0  _Unwind_RaiseException (exc=exc@entry=0x416ed0)
> at /export/gnu/import/git/sources/gcc/libgcc/unwind.inc:140
>  #1  0x77d9936b in __cxxabiv1::__cxa_throw (obj=,
> tinfo=0x403dd0 , dest=0x0)
> at 
> /export/gnu/import/git/sources/gcc/libstdc++-v3/libsupc++/eh_throw.cc:90
>  #2  0x00401255 in sighandler (signo=11, si=0x7fffd6f8,
> uc=0x7fffd5c0)
> at 
> /export/gnu/import/git/sources/gcc/gcc/testsuite/g++.dg/eh/sighandle.C:9
>  #3    Signal frame which isn't on shadow stack
>  #4  dosegv ()
> at 
> /export/gnu/import/git/sources/gcc/gcc/testsuite/g++.dg/eh/sighandle.C:14
>  #5  0x004012e3 in main ()
> at 
> /export/gnu/import/git/sources/gcc/gcc/testsuite/g++.dg/eh/sighandle.C:30
> (gdb) p frames
> $6 = 5
> (gdb)
>
> frame count should be 4, not 5.  This patch skips signal frames when
> unwinding shadow stack.
>
> Tested on i686 and x86-64.  OK for trunk?
>
> H.J.
> 
> PR libgcc/85334
> * unwind-generic.h (_Unwind_Frames_Increment): New.
> * config/i386/shadow-stack-unwind.h (_Unwind_Frames_Increment):
> Likewise.
> * unwind.inc (_Unwind_RaiseException_Phase2): Increment frame
> count with _Unwind_Frames_Increment.
> (_Unwind_ForcedUnwind_Phase2): Likewise.
> ---
>  libgcc/config/i386/shadow-stack-unwind.h | 5 +
>  libgcc/unwind-generic.h  | 3 +++
>  libgcc/unwind.inc| 6 --
>  3 files changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/libgcc/config/i386/shadow-stack-unwind.h 
> b/libgcc/config/i386/shadow-stack-unwind.h
> index 40f48df2aec..a32f3e74b52 100644
> --- a/libgcc/config/i386/shadow-stack-unwind.h
> +++ b/libgcc/config/i386/shadow-stack-unwind.h
> @@ -49,3 +49,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
> If not, see
> }   \
>  }  \
>  while (0)
> +
> +/* Increment frame count.  Skip signal frames.  */
> +#undef _Unwind_Frames_Increment
> +#define _Unwind_Frames_Increment(context, frames) \
> +  if (!_Unwind_IsSignalFrame (context)) frames++
> diff --git a/libgcc/unwind-generic.h b/libgcc/unwind-generic.h
> index b5e3568e1bc..639c96f438e 100644
> --- a/libgcc/unwind-generic.h
> +++ b/libgcc/unwind-generic.h
> @@ -291,4 +291,7 @@ EXCEPTION_DISPOSITION _GCC_specific_handler 
> (PEXCEPTION_RECORD, void *,
>  /* Additional actions to unwind number of stack frames.  */
>  #define _Unwind_Frames_Extra(frames)
>
> +/* Increment frame count.  */
> +#define _Unwind_Frames_Increment(context, frames) frames++
> +
>  #endif /* unwind.h */
> diff --git a/libgcc/unwind.inc b/libgcc/unwind.inc
> index 68c08964d30..b49f8797009 100644
> --- a/libgcc/unwind.inc
> +++ b/libgcc/unwind.inc
> @@ -72,8 +72,9 @@ _Unwind_RaiseException_Phase2(struct _Unwind_Exception *exc,
>/* Don't let us unwind past the handler context.  */
>gcc_assert (!match_handler);
>
> +  _Unwind_Frames_Increment (context, frames);
> +
>uw_update_context (context, &fs);
> -  frames++;
>  }
>
>*frames_p = frames;
> @@ -187,10 +188,11 @@ _Unwind_ForcedUnwind_Phase2 (struct _Unwind_Exception 
> *exc,
> return _URC_FATAL_PHASE2_ERROR;
> }
>
> +  _Unwind_Frames_Increment (context, frames);
> +
>/* Update cur_context to describe the same frame as fs, and discard
>  the previous context if necessary.  */
>uw_advance_context (context, &fs);
> -  frames++;
>  }
>
>*frames_p = frames;
> --
> 2.14.3
>

I need to increment frame count after uw_advance_context which will set
the signal frame bit.

OK for trunk?

-- 
H.J.
From 6ced07f8318d2c1faf616395b630c32c32e332f3 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Tue, 10 Apr 2018 20:46:04 -0700
Subject: [PATCH] libgcc/CET: Skip signal frames when unwinding shadow stack

When -fcf-protection -mcet is used, I got

FAIL: g++.dg/eh/sighandle.C

(gdb) bt
 #0  _Unwind_RaiseException (exc=exc@entry=0x416ed0)
at /export/gnu/import/git/sources/gcc/libgcc/unwind.inc:140
 #1  0x77d9936b in __cxxabiv1::__cxa_throw (obj=,
tinfo=0x403dd0 , dest=0x0)
at /export/gnu/import/git/sources/gcc/libstdc++-v3/libsupc++/eh_throw.cc:90
 #2  0x00401255 in sighandler (signo=11, si=0x7fffd6f8,
uc=0x7fffd5c0)
at /export/gnu/import/git/sources/gcc/gcc/testsuite/g++.dg/eh/sighandle.C:9
 #3    Signal frame which isn't on shadow stack
 #4  dosegv ()
at /export/gnu/import/git/sources/gcc/gcc/testsuite/g++.dg/eh/sighandle.C:14
 #5  0x004012e3 in main ()
at /export/gnu/import/git/sources/gcc/gcc/testsuite/g++.dg/eh/sighandle.C:30
(gdb) p frames
$6 = 5
(gdb)

frame count should be 4, not 5.  This patch skips signal frames when
unwinding shadow stack.

gcc/tes

Re: [wwwdocs] [COMMITTED] ARC gcc8 changes entry

2018-04-12 Thread Bernhard Reutner-Fischer
On 11 April 2018 13:05:52 CEST, Claudiu Zissulescu 
 wrote:
>Hi,
>
>Please find the ARC's gcc8 changes entry section as committed to
>wwwdocs.

s/qualifer/qualifier/

thanks,


Re: [PATCH] Handle empty infinite loops in OpenACC for PR84955

2018-04-12 Thread Cesar Philippidis
On 04/12/2018 11:27 AM, H.J. Lu wrote:
> On Wed, Apr 11, 2018 at 12:30 PM, Cesar Philippidis
>  wrote:
>> On 04/09/2018 04:31 AM, Richard Biener wrote:
>>> On Fri, 6 Apr 2018, Jakub Jelinek wrote:
>>>
 On Fri, Apr 06, 2018 at 06:48:52AM -0700, Cesar Philippidis wrote:
> 2018-04-06  Cesar Philippidis  
>
> PR middle-end/84955
>
> gcc/
> * cfgloop.c (flow_loops_find): Add assert.
> * omp-expand.c (expand_oacc_for): Add dummy false branch for
> tiled basic blocks without omp continue statements.
> * tree-cfg.c (execute_fixup_cfg): Handle calls to internal
> functions like regular functions.
>
> libgomp/
> * testsuite/libgomp.oacc-c-c++-common/pr84955.c: New test.
> * testsuite/libgomp.oacc-fortran/pr84955.f90: New test.

 I'd like to defer the cfgloop.c and tree-cfg.c changes to Richard, just 
 want to
 mention that:

> --- a/gcc/tree-cfg.c
> +++ b/gcc/tree-cfg.c
> @@ -9586,10 +9586,7 @@ execute_fixup_cfg (void)
>for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
> {
>   gimple *stmt = gsi_stmt (gsi);
> - tree decl = is_gimple_call (stmt)
> - ? gimple_call_fndecl (stmt)
> - : NULL;
> - if (decl)
> + if (is_gimple_call (stmt))

 This change doesn't affect just internal functions, but also all indirect
 calls through function pointers with const, pure or noreturn attributes.
>>>
>>> I think the change is desirable nevertheless.  The question is if we
>>> want to do it at this point in time.
>>>
>>> The description of the problem sounds more like LTO writing out
>>> loops without previously fixing up state.  So sth like the following
>>> which I'd prefer at this stage (the above hunk is ok for stage1 then).
>>
>> OK, I'll save that hunk for stage 1.
>>
>>> Index: gcc/lto-streamer-out.c
>>> ===
>>> --- gcc/lto-streamer-out.c  (revision 259227)
>>> +++ gcc/lto-streamer-out.c  (working copy)
>>> @@ -2084,6 +2151,9 @@ output_function (struct cgraph_node *nod
>>>/* Set current_function_decl and cfun.  */
>>>push_cfun (fn);
>>>
>>> +  /* Fixup loops if required to match discovery done in the reader.  */
>>> +  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
>>> +
>>>/* Make string 0 be a NULL string.  */
>>>streamer_write_char_stream (ob->string_stream, 0);
>>>
>>> @@ -2176,12 +2246,13 @@ output_function (struct cgraph_node *nod
>>>streamer_write_record_start (ob, LTO_null);
>>>
>>>output_cfg (ob, fn);
>>> -
>>> -  pop_cfun ();
>>> }
>>>else
>>>  streamer_write_uhwi (ob, 0);
>>>
>>> +  loop_optimizer_finalize ();
>>> +  pop_cfun ();
>>> +
>>>/* Create a section to hold the pickled output of this function.   */
>>>produce_asm (ob, function);
>>
>> That worked. Is this patch OK for trunk, GCC 6 and GCC 7?
> 
> This caused:
> 
> https://gcc.gnu.org/ml/gcc-regression/2018-04/msg00099.html
> 
> FAIL: g++.dg/ipa/pr46984.C  -std=gnu++11 (internal compiler error)
> FAIL: g++.dg/ipa/pr46984.C  -std=gnu++11 (test for excess errors)
> FAIL: g++.dg/ipa/pr46984.C  -std=gnu++14 (internal compiler error)
> FAIL: g++.dg/ipa/pr46984.C  -std=gnu++14 (test for excess errors)
> FAIL: g++.dg/ipa/pr46984.C  -std=gnu++98 (internal compiler error)
> FAIL: g++.dg/ipa/pr46984.C  -std=gnu++98 (test for excess errors)
> FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O0 -flto
> -flto-partition=1to1 -fno-use-linker-plugin  (internal compiler error)
> FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O0 -flto
> -flto-partition=none -fuse-linker-plugin (internal compiler error)
> FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O0 -flto
> -fuse-linker-plugin -fno-fat-lto-objects  (internal compiler error)
> FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O2 -flto
> -flto-partition=1to1 -fno-use-linker-plugin  (internal compiler error)
> FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O2 -flto
> -flto-partition=none -fuse-linker-plugin -fno-fat-lto-objects
> (internal compiler error)
> FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O2 -flto
> -fuse-linker-plugin (internal compiler error)
> FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O0 -flto
> -flto-partition=1to1 -fno-use-linker-plugin  (internal compiler error)
> FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O0 -flto
> -flto-partition=none -fuse-linker-plugin (internal compiler error)
> FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O0 -flto
> -fuse-linker-plugin -fno-fat-lto-objects  (internal compiler error)
> FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O2 -flto
> -flto-partition=1to1 -fno-use-linker-plugin  (internal compiler error)
> FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o asse

Re: [PATCH] Make redirection only for target_clones: V2 (PR ipa/85329).

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 03:46:26PM +0200, Jan Hubicka wrote:
> If you make C++ inline and get the idea to use target cloning attribute on 
> this,
> this will likely lead to link error if you compile multiple files because you
> turn comdat to non-comdat.
> 
> For comdats this would effectively need to become C++ abi extension and we 
> would
> need to define comdat sections for these.  Perhaps easiest way is to simply
> reject the attribute on comdats and probably also extern functions?

I'm not really sure we can do that, various packages in the wild are already
using this.
What is the problem with comdats and multi-versioning?
The question is what comdat groups we should use for the comdat resolver and
the versioned functions, shall the ifunc symbol be the original mangling of
the method (or other comdat) and the other entrypoints just be .local
non-weak symbols inside of the same section?

Jakub


Re: [PATCH] Make redirection only for target_clones: V2 (PR ipa/85329).

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 07:53:35PM +0200, Jakub Jelinek wrote:
> On Thu, Apr 12, 2018 at 03:46:26PM +0200, Jan Hubicka wrote:
> > If you make C++ inline and get the idea to use target cloning attribute on 
> > this,
> > this will likely lead to link error if you compile multiple files because 
> > you
> > turn comdat to non-comdat.
> > 
> > For comdats this would effectively need to become C++ abi extension and we 
> > would
> > need to define comdat sections for these.  Perhaps easiest way is to simply
> > reject the attribute on comdats and probably also extern functions?
> 
> I'm not really sure we can do that, various packages in the wild are already
> using this.
> What is the problem with comdats and multi-versioning?
> The question is what comdat groups we should use for the comdat resolver and
> the versioned functions, shall the ifunc symbol be the original mangling of
> the method (or other comdat) and the other entrypoints just be .local
> non-weak symbols inside of the same section?

Ah, but we emit the resolver only if we see a use of it.  That sounds quite
broken, resolver in each TU that uses it?  Better to have one at each
definition...

Jakub


Re: [PATCH] Handle empty infinite loops in OpenACC for PR84955

2018-04-12 Thread H.J. Lu
On Wed, Apr 11, 2018 at 12:30 PM, Cesar Philippidis
 wrote:
> On 04/09/2018 04:31 AM, Richard Biener wrote:
>> On Fri, 6 Apr 2018, Jakub Jelinek wrote:
>>
>>> On Fri, Apr 06, 2018 at 06:48:52AM -0700, Cesar Philippidis wrote:
 2018-04-06  Cesar Philippidis  

 PR middle-end/84955

 gcc/
 * cfgloop.c (flow_loops_find): Add assert.
 * omp-expand.c (expand_oacc_for): Add dummy false branch for
 tiled basic blocks without omp continue statements.
 * tree-cfg.c (execute_fixup_cfg): Handle calls to internal
 functions like regular functions.

 libgomp/
 * testsuite/libgomp.oacc-c-c++-common/pr84955.c: New test.
 * testsuite/libgomp.oacc-fortran/pr84955.f90: New test.
>>>
>>> I'd like to defer the cfgloop.c and tree-cfg.c changes to Richard, just 
>>> want to
>>> mention that:
>>>
 --- a/gcc/tree-cfg.c
 +++ b/gcc/tree-cfg.c
 @@ -9586,10 +9586,7 @@ execute_fixup_cfg (void)
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
 {
   gimple *stmt = gsi_stmt (gsi);
 - tree decl = is_gimple_call (stmt)
 - ? gimple_call_fndecl (stmt)
 - : NULL;
 - if (decl)
 + if (is_gimple_call (stmt))
>>>
>>> This change doesn't affect just internal functions, but also all indirect
>>> calls through function pointers with const, pure or noreturn attributes.
>>
>> I think the change is desirable nevertheless.  The question is if we
>> want to do it at this point in time.
>>
>> The description of the problem sounds more like LTO writing out
>> loops without previously fixing up state.  So sth like the following
>> which I'd prefer at this stage (the above hunk is ok for stage1 then).
>
> OK, I'll save that hunk for stage 1.
>
>> Index: gcc/lto-streamer-out.c
>> ===
>> --- gcc/lto-streamer-out.c  (revision 259227)
>> +++ gcc/lto-streamer-out.c  (working copy)
>> @@ -2084,6 +2151,9 @@ output_function (struct cgraph_node *nod
>>/* Set current_function_decl and cfun.  */
>>push_cfun (fn);
>>
>> +  /* Fixup loops if required to match discovery done in the reader.  */
>> +  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
>> +
>>/* Make string 0 be a NULL string.  */
>>streamer_write_char_stream (ob->string_stream, 0);
>>
>> @@ -2176,12 +2246,13 @@ output_function (struct cgraph_node *nod
>>streamer_write_record_start (ob, LTO_null);
>>
>>output_cfg (ob, fn);
>> -
>> -  pop_cfun ();
>> }
>>else
>>  streamer_write_uhwi (ob, 0);
>>
>> +  loop_optimizer_finalize ();
>> +  pop_cfun ();
>> +
>>/* Create a section to hold the pickled output of this function.   */
>>produce_asm (ob, function);
>
> That worked. Is this patch OK for trunk, GCC 6 and GCC 7?

This caused:

https://gcc.gnu.org/ml/gcc-regression/2018-04/msg00099.html

FAIL: g++.dg/ipa/pr46984.C  -std=gnu++11 (internal compiler error)
FAIL: g++.dg/ipa/pr46984.C  -std=gnu++11 (test for excess errors)
FAIL: g++.dg/ipa/pr46984.C  -std=gnu++14 (internal compiler error)
FAIL: g++.dg/ipa/pr46984.C  -std=gnu++14 (test for excess errors)
FAIL: g++.dg/ipa/pr46984.C  -std=gnu++98 (internal compiler error)
FAIL: g++.dg/ipa/pr46984.C  -std=gnu++98 (test for excess errors)
FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O0 -flto
-flto-partition=1to1 -fno-use-linker-plugin  (internal compiler error)
FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O0 -flto
-flto-partition=none -fuse-linker-plugin (internal compiler error)
FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O0 -flto
-fuse-linker-plugin -fno-fat-lto-objects  (internal compiler error)
FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O2 -flto
-flto-partition=1to1 -fno-use-linker-plugin  (internal compiler error)
FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O2 -flto
-flto-partition=none -fuse-linker-plugin -fno-fat-lto-objects
(internal compiler error)
FAIL: g++.dg/lto/20081217-1 cp_lto_20081217-1_0.o assemble, -O2 -flto
-fuse-linker-plugin (internal compiler error)
FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O0 -flto
-flto-partition=1to1 -fno-use-linker-plugin  (internal compiler error)
FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O0 -flto
-flto-partition=none -fuse-linker-plugin (internal compiler error)
FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O0 -flto
-fuse-linker-plugin -fno-fat-lto-objects  (internal compiler error)
FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O2 -flto
-flto-partition=1to1 -fno-use-linker-plugin  (internal compiler error)
FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O2 -flto
-flto-partition=none -fuse-linker-plugin -fno-fat-lto-objects
(internal compiler error)
FAIL: g++.dg/lto/20081217-2 cp_lto_20081217-2_0.o assemble, -O2 -flto
-fuse-linker-

Re: Patch ping^3

2018-04-12 Thread Jeff Law
On 04/12/2018 02:41 AM, Richard Biener wrote:
> On Thu, 12 Apr 2018, Jakub Jelinek wrote:
> 
>> Hi!
>>
>> I'd like to ping the
>>
>> http://gcc.gnu.org/ml/gcc-patches/2018-03/msg01244.html
>>   - PR83157 - improve debug info for x86 setcc peepholes
>>
>> patch.  Thanks.
> 
> OK for stage1 and backporting after it soaked there for a while.
> I'm too unfamiliar with the code to approve it at this point
> (esp. concerned about the cselib part affecting others than
> var-tracking in unexpected ways).
I think it's reasonable to include now.

Jeff


C++ PATCH for c++/85258, ICE with invalid range-based for-loop

2018-04-12 Thread Marek Polacek
This is a crash on invalid which started when we changed 
decl_maybe_constant_var_p
to say true for references.  Then in tsubst_copy we take this branch:
  if (decl_maybe_constant_var_p (r))
{
  /* We can't call cp_finish_decl, so handle the
 initializer by hand.  */
  tree init = tsubst_init (DECL_INITIAL (t), r, args,
   complain, in_decl);
but tsubst_init can return NULL_TREE, which potential_constant_expression
knows how to handle, but reduced_constant_expression_p didn't.  So the
following patch will fix the ICE.

Bootstrapped/regtested on x86_64-linux, ok for trunk?

2018-04-12  Marek Polacek  

PR c++/85258
* constexpr.c (reduced_constant_expression_p): Return false for null
trees.

* g++.dg/parse/error61.C: New test.

diff --git gcc/cp/constexpr.c gcc/cp/constexpr.c
index 75f56df4465..82f14baaefd 100644
--- gcc/cp/constexpr.c
+++ gcc/cp/constexpr.c
@@ -1773,6 +1773,9 @@ cxx_eval_call_expression (const constexpr_ctx *ctx, tree 
t,
 bool
 reduced_constant_expression_p (tree t)
 {
+  if (t == NULL_TREE)
+return false;
+
   switch (TREE_CODE (t))
 {
 case PTRMEM_CST:
@@ -1794,9 +1797,8 @@ reduced_constant_expression_p (tree t)
field = NULL_TREE;
   FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (t), i, idx, val)
{
- if (!val)
-   /* We're in the middle of initializing this element.  */
-   return false;
+ /* If VAL is null, we're in the middle of initializing this
+element.  */
  if (!reduced_constant_expression_p (val))
return false;
  if (field)
diff --git gcc/testsuite/g++.dg/parse/error61.C 
gcc/testsuite/g++.dg/parse/error61.C
index e69de29bb2d..199e1aa721c 100644
--- gcc/testsuite/g++.dg/parse/error61.C
+++ gcc/testsuite/g++.dg/parse/error61.C
@@ -0,0 +1,14 @@
+// PR c++/85258
+// { dg-do compile { target c++11 } }
+
+template<int> void foo()
+{
+  int x[8];
+  for (int& i, j : x) // { dg-error "multiple" }
+i = 0; // { dg-error "local variable" }
+}
+
+void bar()
+{
+  foo<0>();
+}

Marek


[PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657, take 2)

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 05:29:35PM +, Wilco Dijkstra wrote:
> > Depending on what you mean old, I see e.g. in 2010 power7 mempcpy got added,
> > in 2013 other power versions, in 2016 s390*, etc.  Doing a decent mempcpy
> > isn't hard if you have asm version of memcpy and one spare register.
> 
> More mempcpy implementations have been added in recent years indeed, but 
> almost all
> add an extra copy of the memcpy code rather than using a single combined 
> implementation.
> That means it is still better to call memcpy (which is frequently used and 
> thus likely in L1/L2)
> rather than mempcpy (which is more likely to be cold and thus not cached).

That really depends, usually when some app uses mempcpy, it uses it very
heavily.  And all the proposed patches do is honor what the user asked, if
you use memcpy () + n, we aren't transforming that into mempcpy behind the
user's back.

Anyway, here is what I think Richard was asking for, that I'm currently
bootstrapping/regtesting.  It can be easily combined with Martin's target
hook if needed, or do it only for
endp == 1 && target != const0_rtx && CALL_EXPR_TAILCALL (exp)
etc.

2018-04-12  Martin Liska  
Jakub Jelinek  

PR middle-end/81657
* expr.h (enum block_op_methods): Add BLOCK_OP_NO_LIBCALL_RET.
* expr.c (emit_block_move_hints): Handle BLOCK_OP_NO_LIBCALL_RET.
* builtins.c (expand_builtin_memory_copy_args): Use
BLOCK_OP_NO_LIBCALL_RET method for mempcpy with non-ignored target,
handle dest_addr == pc_rtx.

* gcc.dg/string-opt-1.c: Remove bogus comment.  Expect a mempcpy
call.

--- gcc/expr.h.jj   2018-01-12 11:35:51.424222835 +0100
+++ gcc/expr.h  2018-04-12 18:38:07.377464114 +0200
@@ -100,7 +100,11 @@ enum block_op_methods
   BLOCK_OP_NO_LIBCALL,
   BLOCK_OP_CALL_PARM,
   /* Like BLOCK_OP_NORMAL, but the libcall can be tail call optimized.  */
-  BLOCK_OP_TAILCALL
+  BLOCK_OP_TAILCALL,
+  /* Like BLOCK_OP_NO_LIBCALL, but instead of emitting a libcall return
+ pc_rtx to indicate nothing has been emitted and let the caller handle
+ it.  */
+  BLOCK_OP_NO_LIBCALL_RET
 };
 
 typedef rtx (*by_pieces_constfn) (void *, HOST_WIDE_INT, scalar_int_mode);
--- gcc/expr.c.jj   2018-04-06 19:19:14.954130838 +0200
+++ gcc/expr.c  2018-04-12 18:39:58.866536619 +0200
@@ -1565,7 +1565,7 @@ emit_block_move_hints (rtx x, rtx y, rtx
   unsigned HOST_WIDE_INT max_size,
   unsigned HOST_WIDE_INT probable_max_size)
 {
-  bool may_use_call;
+  int may_use_call;
   rtx retval = 0;
   unsigned int align;
 
@@ -1577,7 +1577,7 @@ emit_block_move_hints (rtx x, rtx y, rtx
 {
 case BLOCK_OP_NORMAL:
 case BLOCK_OP_TAILCALL:
-  may_use_call = true;
+  may_use_call = 1;
   break;
 
 case BLOCK_OP_CALL_PARM:
@@ -1589,7 +1589,11 @@ emit_block_move_hints (rtx x, rtx y, rtx
   break;
 
 case BLOCK_OP_NO_LIBCALL:
-  may_use_call = false;
+  may_use_call = 0;
+  break;
+
+case BLOCK_OP_NO_LIBCALL_RET:
+  may_use_call = -1;
   break;
 
 default:
@@ -1625,6 +1629,9 @@ emit_block_move_hints (rtx x, rtx y, rtx
   && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))
   && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (y)))
 {
+  if (may_use_call < 0)
+   return pc_rtx;
+
   /* Since x and y are passed to a libcall, mark the corresponding
 tree EXPR as addressable.  */
   tree y_expr = MEM_EXPR (y);
--- gcc/builtins.c.jj   2018-04-12 13:35:34.328395156 +0200
+++ gcc/builtins.c  2018-04-12 18:42:01.846616598 +0200
@@ -3650,12 +3650,16 @@ expand_builtin_memory_copy_args (tree de
   set_mem_align (src_mem, src_align);
 
   /* Copy word part most expediently.  */
-  dest_addr = emit_block_move_hints (dest_mem, src_mem, len_rtx,
-CALL_EXPR_TAILCALL (exp)
-&& (endp == 0 || target == const0_rtx)
-? BLOCK_OP_TAILCALL : BLOCK_OP_NORMAL,
+  enum block_op_methods method = BLOCK_OP_NORMAL;
+  if (CALL_EXPR_TAILCALL (exp) && (endp == 0 || target == const0_rtx))
+method = BLOCK_OP_TAILCALL;
+  if (endp == 1 && target != const0_rtx)
+method = BLOCK_OP_NO_LIBCALL_RET;
+  dest_addr = emit_block_move_hints (dest_mem, src_mem, len_rtx, method,
 expected_align, expected_size,
 min_size, max_size, probable_max_size);
+  if (dest_addr == pc_rtx)
+return NULL_RTX;
 
   if (dest_addr == 0)
 {
--- gcc/testsuite/gcc.dg/string-opt-1.c.jj  2017-08-01 19:23:09.923512205 
+0200
+++ gcc/testsuite/gcc.dg/string-opt-1.c 2018-04-12 18:57:10.940217129 +0200
@@ -1,4 +1,3 @@
-/* Ensure mempcpy is "optimized" into memcpy followed by addition.  */
 /* { dg-do compile } */
 /* { dg-options "-O2" } */
 
@@ -48,5 +47,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-assembler-not "\<mempcpy\>" } } */
+/* { dg-final { 

Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 04:30:07PM +, Wilco Dijkstra wrote:
> Jakub Jelinek wrote:
> > On Thu, Apr 12, 2018 at 03:53:13PM +, Wilco Dijkstra wrote:
> 
> >> The tailcall issue is just a distraction. Historically the handling of 
> >> mempcpy 
> >> has been horribly inefficient in both GCC and GLIBC for practically all 
> >> targets.
> >> This is why it was decided to defer to memcpy.
> >
> > I guess we need to agree to disagree.  But we have a P1 PR that we need to
> > resolve and it is one of the last 6 blockers we have.  I'm not suggesting to
> > revert PR70140, just let use mempcpy libcall if it is what the user wrote 
> > and
> > we aren't expanding it inline.
> 
> Frankly I don't see why it is a P1 regression. Do you have a benchmark that

That is how regression priorities are defined.

> >> So generally it's a good idea to change mempcpy into memcpy by default. 
> >> It's
> >> not slower than calling mempcpy even if you have a fast implementation, 
> >> it's faster
> >> if you use an up to date GLIBC which calls memcpy, and it's significantly 
> >> better
> >> when using an old GLIBC.
> >
> > mempcpy is quite good on many targets even in old GLIBCs.
> 
> Only true if with "many" you mean x86, x86_64 and IIRC sparc.

Depending on what you mean by old, I see e.g. in 2010 power7 mempcpy got added,
in 2013 other power versions, in 2016 s390*, etc.  Doing a decent mempcpy
isn't hard if you have asm version of memcpy and one spare register.

Jakub


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 05:17:29PM +0200, Richard Biener wrote:
> >For -Os that is easily measurable regression, for -O2 it depends on the
> >relative speed of memcpy vs. mempcpy and whether one or both of them
> >are in
> >I-cache or not.
> 
> Well, then simply unconditionally not generate a libcall from the move 
> expander? 

We need to generate libcall for many callers and in fact, we don't have a
mode nor a way to tell the caller that we haven't emitted anything.

What we could do is add another enumerator to enum block_op_methods that
would be like BLOCK_OP_NO_LIBCALL, but would not use emit_block_move_via_loop
if neither move_by_pieces nor emit_block_move_via_movmem can be used, and say
instead return const0_rtx or pc_rtx or some way to tell the caller that
it hasn't emitted anything and in expand_builtin_memory_copy_args
pass for endp == 1 && target != const0_rtx that new BLOCK_OP_NO_LIBCALL_LOOP
to emit_block_move_hints and return 0 if dest_addr is const0_rtx (or pc_rtx
or whatever is chosen).

Jakub


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Wilco Dijkstra
Jakub Jelinek wrote:
>On Thu, Apr 12, 2018 at 04:30:07PM +, Wilco Dijkstra wrote:
>> Jakub Jelinek wrote:

>> Frankly I don't see why it is a P1 regression. Do you have a benchmark that
>
>That is how regression priorities are defined.

How can one justify considering this a release blocker without hard numbers?
If this is a 1% regression on a large body of code it would be very serious, if 
0.01% - 
not so much.

>> >> So generally it's a good idea to change mempcpy into memcpy by default. 
>> >> It's
>> >> not slower than calling mempcpy even if you have a fast implementation, 
>> >> it's faster
>> >> if you use an up to date GLIBC which calls memcpy, and it's significantly 
>> >> better
>> >> when using an old GLIBC.
>> >
>> > mempcpy is quite good on many targets even in old GLIBCs.
>> 
>> Only true if with "many" you mean x86, x86_64 and IIRC sparc.
>
> Depending on what you mean old, I see e.g. in 2010 power7 mempcpy got added,
> in 2013 other power versions, in 2016 s390*, etc.  Doing a decent mempcpy
> isn't hard if you have asm version of memcpy and one spare register.

More mempcpy implementations have been added in recent years indeed, but almost 
all
add an extra copy of the memcpy code rather than using a single combined 
implementation.
That means it is still better to call memcpy (which is frequently used and thus 
likely in L1/L2)
rather than mempcpy (which is more likely to be cold and thus not cached).

Wilco

Re: Fix PR target/85238

2018-04-12 Thread Richard Biener
On April 12, 2018 4:18:47 PM GMT+02:00, Eric Botcazou  
wrote:
>This makes -g work again in LTO mode for Windows targets by kludging
>around 
>the missing support for copying PE-COFF debug sections in the simple
>object 
>module of libiberty, thus effectively disabling early debug in LTO
>mode.
>The patch also contains a fixlet for a related oversight in the LTO
>wrapper.
>
>Bootstrapped and tested on x86-64/Windows, approved by Richard B. in
>the audit 
>trail and applied on the mainline.

Thanks Eric for fixing this. 

Richard. 

>
>2018-04-12  Eric Botcazou  
>
>   PR target/85238
>   * lto-wrapper.c (debug_objcopy): Open the files in binary mode.
>   * dwarf2out.c (dwarf2out_early_finish): Do not generate assembly in
>LTO
>   mode for PE-COFF targets.
>   * config/i386/i386-protos.h (i386_pe_asm_lto_start): Declare.
>   (i386_pe_asm_lto_end): Likewise.
>   * config/i386/cygming.h (TARGET_ASM_LTO_START): Define.
>   (TARGET_ASM_LTO_END): Likewise.
>   * config/i386/winnt.c (saved_debug_info_level): New static variable.
>   (i386_pe_asm_lto_start): New function.
>   (i386_pe_asm_lto_end): Likewise.



[PATCH][OBVIOUS] PR85347: New testcase vec-ldl-1.c FAILs on powerpc64-linux

2018-04-12 Thread Kelvin Nilsen
This new test case required a dejagnu qualifier to restrict its
execution on big-endian platforms.

The patch bootstrapped and tested without regressions.  Was committed as
obvious.


gcc/testsuite/ChangeLog:

2018-04-12  Kelvin Nilsen  

    PR target/85347
    * gcc.target/powerpc/vec-ldl-1.c: Change dejagnu directives to
    specify -mvsx on gcc command line.

Index: gcc/testsuite/gcc.target/powerpc/vec-ldl-1.c
===
--- gcc/testsuite/gcc.target/powerpc/vec-ldl-1.c    (revision 259318)
+++ gcc/testsuite/gcc.target/powerpc/vec-ldl-1.c    (working copy)
@@ -1,6 +1,6 @@
 /* { dg-do run { target powerpc*-*-* } } */
-/* { dg-require-effective-target vmx_hw } */
-/* { dg-options "-maltivec -O0 -Wall" } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-mvsx -O0 -Wall" } */
 
 #include 
 #include 



Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 03:53:13PM +, Wilco Dijkstra wrote:
> Jakub Jelinek wrote:
> > On Thu, Apr 12, 2018 at 03:52:09PM +0200, Richard Biener wrote:
> >> Not sure if I missed some important part of the discussion but
> >> for the testcase we want to preserve the tailcall, right?  So
> >> it would be enough to set avoid_libcall to
> >> endp != 0 && CALL_EXPR_TAILCALL (exp) (and thus also handle
> >> stpcpy)?
> 
> The tailcall issue is just a distraction. Historically the handling of 
> mempcpy  
> has been horribly inefficient in both GCC and GLIBC for practically all 
> targets.
> This is why it was decided to defer to memcpy.

I guess we need to agree to disagree.  But we have a P1 PR that we need to
resolve and it is one of the last 6 blockers we have.  I'm not suggesting to
revert PR70140, just let us use mempcpy libcall if it is what the user wrote and
we aren't expanding it inline.

> So generally it's a good idea to change mempcpy into memcpy by default. It's

No.

> not slower than calling mempcpy even if you have a fast implementation, it's 
> faster
> if you use an up to date GLIBC which calls memcpy, and it's significantly 
> better
> when using an old GLIBC.

mempcpy is quite good on many targets even in old GLIBCs.

Jakub


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Wilco Dijkstra
Jakub Jelinek wrote:
> On Thu, Apr 12, 2018 at 03:53:13PM +, Wilco Dijkstra wrote:

>> The tailcall issue is just a distraction. Historically the handling of 
>> mempcpy 
>> has been horribly inefficient in both GCC and GLIBC for practically all 
>> targets.
>> This is why it was decided to defer to memcpy.
>
> I guess we need to agree to disagree.  But we have a P1 PR that we need to
> resolve and it is one of the last 6 blockers we have.  I'm not suggesting to
> revert PR70140, just let use mempcpy libcall if it is what the user wrote and
> we aren't expanding it inline.

Frankly I don't see why it is a P1 regression. Do you have a benchmark that
regresses significantly (a few percent, not by a few bytes)? I already showed
the AArch64 results for GLIBC, do you have x86 results that prove things are
much worse?

>> So generally it's a good idea to change mempcpy into memcpy by default. It's
>> not slower than calling mempcpy even if you have a fast implementation, it's 
>> faster
>> if you use an up to date GLIBC which calls memcpy, and it's significantly 
>> better
>> when using an old GLIBC.
>
> mempcpy is quite good on many targets even in old GLIBCs.

Only true if with "many" you mean x86, x86_64 and IIRC sparc.

Wilco


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread H.J. Lu
On Thu, Apr 12, 2018 at 8:53 AM, Wilco Dijkstra  wrote:

> So generally it's a good idea to change mempcpy into memcpy by default. It's
> not slower than calling mempcpy even if you have a fast implementation, it's 
> faster
> if you use an up to date GLIBC which calls memcpy, and it's significantly 
> better
> when using an old GLIBC.
>

It is a BAD idea for x86.   We don't want to turn mempcpy into memcpy on
x86.  PERIOD.

-- 
H.J.


Re: [PATCH] PR libstdc++/85222 allow catching iostream errors as gcc4-compatible ios::failure

2018-04-12 Thread Jonathan Wakely
On 10 April 2018 at 00:36, Jonathan Wakely wrote:
> Define a new exception type derived from std::ios::failure[abi:cxx11]
> which also aggregates an object of the gcc4-compatible ios::failure
> type. Make __throw_ios_failure throw this new type for iostream errors
> that raise exceptions. Provide custom type info for the new type so that
> it can be caught by handlers for the gcc4-compatible ios::failure type
> as well as handlers for ios::failure[abi:cxx11] and its bases.
>
> PR libstdc++/85222
> * src/c++11/Makefile.am [ENABLE_DUAL_ABI]: Add special rules for
> cxx11-ios_failure.cc to rewrite type info for __ios_failure.
> * src/c++11/Makefile.in: Regenerate.
> * src/c++11/cxx11-ios_failure.cc (__ios_failure, __iosfail_type_info):
> New types.
> [_GLIBCXX_USE_DUAL_ABI] (__throw_ios_failure): Define here.
> * src/c++11/ios.cc (__throw_ios_failure): Remove definition.
> * src/c++98/ios_failure.cc (__construct_ios_failure)
> (__destroy_ios_failure, is_ios_failure_handler): New functions.
> [!_GLIBCXX_USE_DUAL_ABI] (__throw_ios_failure): Define here.
> * testsuite/27_io/ios_base/failure/dual_abi.cc: New.
> * testsuite/27_io/basic_ios/copyfmt/char/1.cc: Revert changes to
> handler types, to always catch std::ios_base::failure.
> * testsuite/27_io/basic_ios/exceptions/char/1.cc: Likewise.
> * testsuite/27_io/basic_istream/extractors_arithmetic/char/
> exceptions_failbit.cc: Likewise.
> * testsuite/27_io/basic_istream/extractors_arithmetic/wchar_t/
> exceptions_failbit.cc: Likewise.
> * testsuite/27_io/basic_istream/extractors_other/char/
> exceptions_null.cc: Likewise.
> * testsuite/27_io/basic_istream/extractors_other/wchar_t/
> exceptions_null.cc: Likewise.
> * testsuite/27_io/basic_istream/sentry/char/12297.cc: Likewise.
> * testsuite/27_io/basic_istream/sentry/wchar_t/12297.cc: Likewise.
> * testsuite/27_io/basic_ostream/inserters_other/char/
> exceptions_null.cc: Likewise.
> * testsuite/27_io/basic_ostream/inserters_other/wchar_t/
> exceptions_null.cc: Likewise.
> * testsuite/27_io/ios_base/storage/2.cc: Likewise.
>
> Tested x86_64-linux and powerpc64-linux, with the default config, and
> --disable-libstdcxx-dual-abi, and
> --with-default-libstdcxx-abi=gcc4-compatible. I intend to commit this
> to trunk and gcc-7-branch soon.

This removes the #define for _GLIBCXX_USE_CXX11_ABI from the top of
src/c++11/ios.cc, because __throw_ios_failure is no longer defined
there.

Tested as before, committed to trunk.
commit 845d8dc521d0958b625f2bc691b284e221009929
Author: Jonathan Wakely 
Date:   Wed Apr 11 13:47:31 2018 +0100

Remove #define made redundant by r259281

The definition of __throw_ios_failure is no longer in this file, so
setting the macro here is unnecessary.

* src/c++11/ios.cc: Remove redundant macro definition.

diff --git a/libstdc++-v3/src/c++11/ios.cc b/libstdc++-v3/src/c++11/ios.cc
index e928c594149..82063e4b2f5 100644
--- a/libstdc++-v3/src/c++11/ios.cc
+++ b/libstdc++-v3/src/c++11/ios.cc
@@ -26,10 +26,6 @@
 // ISO C++ 14882: 27.4  Iostreams base classes
 //
 
-// Determines the version of ios_base::failure thrown by __throw_ios_failure.
-// If !_GLIBCXX_USE_DUAL_ABI this will get undefined automatically.
-#define _GLIBCXX_USE_CXX11_ABI 1
-
 #include <ios>
 #include <limits>
 


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Richard Biener
On April 12, 2018 5:38:44 PM GMT+02:00, Jakub Jelinek  wrote:
>On Thu, Apr 12, 2018 at 05:17:29PM +0200, Richard Biener wrote:
>> >For -Os that is easily measurable regression, for -O2 it depends on
>the
>> >relative speed of memcpy vs. mempcpy and whether one or both of them
>> >are in
>> >I-cache or not.
>> 
>> Well, then simply unconditionally not generate a libcall from the
>move expander? 
>
>We need to generate libcall for many callers and in fact, we don't have
>a
>mode nor a way to tell the caller that we haven't emitted anything.
>
>What we could do is add another enumerator to enum block_op_methods
>that
>would be like BLOCK_OP_NO_LIBCALL, but would not use
>emit_block_move_via_loop
>if neither move_by_pieces nor emit_block_move_via_movmem can be used, and say
>instead return const0_rtx or pc_rtx or some way to tell the caller that
>it hasn't emitted anything and in expand_builtin_memory_copy_args
>pass for endp == 1 && target != const0_rtx that new
>BLOCK_OP_NO_LIBCALL_LOOP
>to emit_block_move_hints and return 0 if dest_addr is const0_rtx (or
>pc_rtx
>or whatever is chosen).

Yes. Emit the "original" call whenever inline expansion fails. 

Richard. 

>   Jakub



Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Wilco Dijkstra
Jakub Jelinek wrote:
> On Thu, Apr 12, 2018 at 03:52:09PM +0200, Richard Biener wrote:
>> Not sure if I missed some important part of the discussion but
>> for the testcase we want to preserve the tailcall, right?  So
>> it would be enough to set avoid_libcall to
>> endp != 0 && CALL_EXPR_TAILCALL (exp) (and thus also handle
>> stpcpy)?

The tailcall issue is just a distraction. Historically the handling of mempcpy  
has been horribly inefficient in both GCC and GLIBC for practically all targets.
This is why it was decided to defer to memcpy.

For example small constant mempcpy was not expanded inline like memcpy
until PR70140 was fixed. Except for a few targets which have added an
optimized mempcpy, the default mempcpy implementation in almost all
released GLIBCs is much slower than memcpy (due to using a badly written
C implementation).

Recent GLIBCs now call the optimized memcpy - this is better but still adds
extra call/return overheads. So to improve that the GLIBC headers have an
inline that changes any call to mempcpy into memcpy (this is the default but
can be disabled on a per-target basis).

Obviously it is best to do this optimization in GCC, which is what we finally do
in GCC8. Inlining mempcpy means you sometimes miss a tailcall, but this is
not common - in all of GLIBC the inlining on AArch64 adds 166 extra instructions
and 12 callee-save registers. This is a small codesize cost to avoid the 
overhead
of calling the generic C version.

> My preference would be to have non-lame mempcpy etc. on all targets, but the
> aarch64 folks disagree.

The question is who is going to write the 30+ mempcpy implementations for all
those targets which don't have one? And who says doing this is actually going 
to 
improve performance? Having mempcpy+memcpy typically means more Icache
misses in code that uses both.

So generally it's a good idea to change mempcpy into memcpy by default. It's
not slower than calling mempcpy even if you have a fast implementation, it's 
faster
if you use an up to date GLIBC which calls memcpy, and it's significantly better
when using an old GLIBC.

Wilco


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Richard Biener
On April 12, 2018 4:31:12 PM GMT+02:00, Jakub Jelinek  wrote:
>On Thu, Apr 12, 2018 at 04:19:38PM +0200, Richard Biener wrote:
>> Well, but that wouldn't be a fix for a regression and IMHO there's
>> no reason for a really lame mempcpy.  If targets disagree well,
>
>It is a regression as well, in the past we've emitted mempcpy when user
>wrote mempcpy, now we don't.
>
>E.g.
>extern void *mempcpy (void *, const void *, __SIZE_TYPE__);
>void bar (void *, void *, void *);
>
>void
>foo (void *x, void *y, void *z, void *w, __SIZE_TYPE__ n)
>{
>  bar (mempcpy (x, w, n), mempcpy (y, w, n), mempcpy (z, w, n));
>}
>
>is on x86_64-linux -O2 in 7.x using the 3 mempcpy calls and 90 bytes in
>foo, while
>on the trunk uses 3 memcpy calls and 96 bytes in foo.
>
>For -Os that is easily measurable regression, for -O2 it depends on the
>relative speed of memcpy vs. mempcpy and whether one or both of them
>are in
>I-cache or not.

Well, then simply unconditionally not generate a libcall from the move 
expander? 

>
>> then they get what they deserve.
>> 
>> I don't see any aarch64 specific mempcpy in glibc btw so hopefully
>> the default non-stupid one kicks in (it exactly looks like my C
>> version)
>
>   Jakub



Re: [PATCH] sel-sched: run cleanup_cfg just before loop_optimizer_init (PR 84659)

2018-04-12 Thread Andrey Belevantsev
On 12.04.2018 0:55, Alexander Monakov wrote:
> As noted in PR 85354, we cannot simply invoke cfg_cleanup after dominators are
> computed, because they may become invalid but neither freed nor recomputed, so
> this trips checking in flow_loops_find.
> 
> We can move cleanup_cfg earlier (and run it for all sel-sched invocations, not
> only when pipelining).

OK.  Sorry, I should have noticed that before, and our ia64 tester also
misses libraries required for graphite.

Best,
Andrey

> 
> Bootstrapped/regtested on x86_64 and ppc64 (my previous testing missed this
> issue: the testcase requires graphite, but libisl wasn't present).
> 
>   PR rtl-optimization/85354
>   * sel-sched-ir.c (sel_init_pipelining): Move cfg_cleanup call...
>   * sel-sched.c (sel_global_init): ... here.
> 
> diff --git a/gcc/sel-sched-ir.c b/gcc/sel-sched-ir.c
> index 50a7daafba6..ee970522890 100644
> --- a/gcc/sel-sched-ir.c
> +++ b/gcc/sel-sched-ir.c
> @@ -30,7 +30,6 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgrtl.h"
>  #include "cfganal.h"
>  #include "cfgbuild.h"
> -#include "cfgcleanup.h"
>  #include "insn-config.h"
>  #include "insn-attr.h"
>  #include "recog.h"
> @@ -6122,9 +6121,6 @@ make_regions_from_loop_nest (struct loop *loop)
>  void
>  sel_init_pipelining (void)
>  {
> -  /* Remove empty blocks: their presence can break assumptions elsewhere,
> - e.g. the logic to invoke update_liveness_on_insn in sel_region_init.  */
> -  cleanup_cfg (0);
>/* Collect loop information to be used in outer loops pipelining.  */
>loop_optimizer_init (LOOPS_HAVE_PREHEADERS
> | LOOPS_HAVE_FALLTHRU_PREHEADERS
> diff --git a/gcc/sel-sched.c b/gcc/sel-sched.c
> index cd29df35666..59762964c6e 100644
> --- a/gcc/sel-sched.c
> +++ b/gcc/sel-sched.c
> @@ -28,6 +28,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tm_p.h"
>  #include "regs.h"
>  #include "cfgbuild.h"
> +#include "cfgcleanup.h"
>  #include "insn-config.h"
>  #include "insn-attr.h"
>  #include "params.h"
> @@ -7661,6 +7662,10 @@ sel_sched_region (int rgn)
>  static void
>  sel_global_init (void)
>  {
> +  /* Remove empty blocks: their presence can break assumptions elsewhere,
> + e.g. the logic to invoke update_liveness_on_insn in sel_region_init.  */
> +  cleanup_cfg (0);
> +
>calculate_dominance_info (CDI_DOMINATORS);
>alloc_sched_pools ();
> 



Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 04:19:38PM +0200, Richard Biener wrote:
> Well, but that wouldn't be a fix for a regression and IMHO there's
> no reason for a really lame mempcpy.  If targets disagree well,

It is a regression as well, in the past we've emitted mempcpy when user
wrote mempcpy, now we don't.

E.g.
extern void *mempcpy (void *, const void *, __SIZE_TYPE__);
void bar (void *, void *, void *);

void
foo (void *x, void *y, void *z, void *w, __SIZE_TYPE__ n)
{
  bar (mempcpy (x, w, n), mempcpy (y, w, n), mempcpy (z, w, n));
}

is on x86_64-linux -O2 in 7.x using the 3 mempcpy calls and 90 bytes in foo, 
while
on the trunk uses 3 memcpy calls and 96 bytes in foo.

For -Os that is easily measurable regression, for -O2 it depends on the
relative speed of memcpy vs. mempcpy and whether one or both of them are in
I-cache or not.

> then they get what they deserve.
> 
> I don't see any aarch64 specific mempcpy in glibc btw so hopefully
> the default non-stupid one kicks in (it exactly looks like my C
> version)

Jakub


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 03:52:09PM +0200, Richard Biener wrote:
> Not sure if I missed some important part of the discussion but
> for the testcase we want to preserve the tailcall, right?  So
> it would be enough to set avoid_libcall to
> endp != 0 && CALL_EXPR_TAILCALL (exp) (and thus also handle
> stpcpy)?

For the testcase yes.  There the question is if some targets have so lame
mempcpy that using a tailcall to mempcpy is slower over avoiding the
tailcall (and on aarch64 it looked like maintainer's choice to have lame
mempcpy and hope the compiler will avoid it at all costs).  On the other
side, that change has been forced over to all targets, even when they don't
have lame mempcpy.
So, the tailcall is one issue, and we can either use mempcpy if endp
and CALL_EXPR_TAILCALL, or only do that if -Os.

And another issue is mempcpy uses in other contexts, here again I think x86
has good enough mempcpy that if I use
foo (mempcpy (x, y, z)) then it is better to use mempcpy over memcpy call,
but not so on targets with lame mempcpy.

My preference would be to have non-lame mempcpy etc. on all targets, but the
aarch64 folks disagree.

So, wonder e.g. about Martin's patch, which would use mempcpy if endp and
either FAST_SPEED for mempcpy (regardless of the context), or not
SLOW_SPEED and CALL_EXPR_TAILCALL.  That way, targets could signal they have
so lame mempcpy that they never want to use it (return SLOW_SPEED), or ask
for it to be used every time it makes sense from caller POV, and have the
default something in between (only use it in tail calls).

Jakub


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Richard Biener
On Thu, 12 Apr 2018, Jakub Jelinek wrote:

> On Thu, Apr 12, 2018 at 03:52:09PM +0200, Richard Biener wrote:
> > Not sure if I missed some important part of the discussion but
> > for the testcase we want to preserve the tailcall, right?  So
> > it would be enough to set avoid_libcall to
> > endp != 0 && CALL_EXPR_TAILCALL (exp) (and thus also handle
> > stpcpy)?
> 
> For the testcase yes.  There the question is if some targets have so lame
> mempcpy that using a tailcall to mempcpy is slower over avoiding the
> tailcall (and on aarch64 it looked like maintainer's choice to have lame
> mempcpy and hope the compiler will avoid it at all costs).  On the other
> side, that change has been forced over to all targets, even when they don't
> have lame mempcpy.
> So, the tailcall is one issue, and we can either use mempcpy if endp
> and CALL_EXPR_TAILCALL, or only do that if -Os.
> 
> And another issue is mempcpy uses in other contexts, here again I think x86
> has good enough mempcpy that if I use
> foo (mempcpy (x, y, z)) then it is better to use mempcpy over memcpy call,
> but not so on targets with lame mempcpy.
> 
> My preference would be to have non-lame mempcpy etc. on all targets, but the
> aarch64 folks disagree.
> 
> So, wonder e.g. about Martin's patch, which would use mempcpy if endp and
> either FAST_SPEED for mempcpy (regardless of the context), or not
> SLOW_SPEED and CALL_EXPR_TAILCALL.  That way, targets could signal they have
> so lame mempcpy that they never want to use it (return SLOW_SPEED), or ask
> for it to be used every time it makes sense from caller POV, and have the
> default something in between (only use it in tail calls).

Well, but that wouldn't be a fix for a regression and IMHO there's
no reason for a really lame mempcpy.  If targets disagree well,
then they get what they deserve.

I don't see any aarch64 specific mempcpy in glibc btw so hopefully
the default non-stupid one kicks in (it exactly looks like my C
version)

Richard.

>   Jakub
> 
> 

-- 
Richard Biener 
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 
21284 (AG Nuernberg)


Fix PR target/85238

2018-04-12 Thread Eric Botcazou
This makes -g work again in LTO mode for Windows targets by kludging around 
the missing support for copying PE-COFF debug sections in the simple object 
module of libiberty, thus effectively disabling early debug in LTO mode.
The patch also contains a fixlet for a related oversight in the LTO wrapper.

Bootstrapped and tested on x86-64/Windows, approved by Richard B. in the audit 
trail and applied on the mainline.


2018-04-12  Eric Botcazou  

PR target/85238
* lto-wrapper.c (debug_objcopy): Open the files in binary mode.
* dwarf2out.c (dwarf2out_early_finish): Do not generate assembly in LTO
mode for PE-COFF targets.
* config/i386/i386-protos.h (i386_pe_asm_lto_start): Declare.
(i386_pe_asm_lto_end): Likewise.
* config/i386/cygming.h (TARGET_ASM_LTO_START): Define.
(TARGET_ASM_LTO_END): Likewise.
* config/i386/winnt.c (saved_debug_info_level): New static variable.
(i386_pe_asm_lto_start): New function.
(i386_pe_asm_lto_end): Likewise.

-- 
Eric Botcazou
Index: config/i386/cygming.h
===
--- config/i386/cygming.h	(revision 259205)
+++ config/i386/cygming.h	(working copy)
@@ -356,6 +356,12 @@ do {		\
 #undef TARGET_ASM_FILE_END
 #define TARGET_ASM_FILE_END i386_pe_file_end
 
+/* Kludge because of missing PE-COFF support for early LTO debug.  */
+#undef  TARGET_ASM_LTO_START
+#define TARGET_ASM_LTO_START i386_pe_asm_lto_start
+#undef  TARGET_ASM_LTO_END
+#define TARGET_ASM_LTO_END i386_pe_asm_lto_end
+
 #undef ASM_COMMENT_START
 #define ASM_COMMENT_START " #"
 
Index: config/i386/i386-protos.h
===
--- config/i386/i386-protos.h	(revision 259205)
+++ config/i386/i386-protos.h	(working copy)
@@ -254,6 +254,8 @@ extern void i386_pe_asm_output_aligned_d
 		HOST_WIDE_INT,
 		HOST_WIDE_INT);
 extern void i386_pe_file_end (void);
+extern void i386_pe_asm_lto_start (void);
+extern void i386_pe_asm_lto_end (void);
 extern void i386_pe_start_function (FILE *, const char *, tree);
 extern void i386_pe_end_function (FILE *, const char *, tree);
 extern void i386_pe_end_cold_function (FILE *, const char *, tree);
Index: config/i386/winnt.c
===
--- config/i386/winnt.c	(revision 259205)
+++ config/i386/winnt.c	(working copy)
@@ -808,6 +808,23 @@ i386_pe_file_end (void)
 }
 }
 
+/* Kludge because of missing PE-COFF support for early LTO debug.  */
+
+static enum debug_info_levels saved_debug_info_level;
+
+void
+i386_pe_asm_lto_start (void)
+{
+  saved_debug_info_level = debug_info_level;
+  debug_info_level = DINFO_LEVEL_NONE;
+}
+
+void
+i386_pe_asm_lto_end (void)
+{
+  debug_info_level = saved_debug_info_level;
+}
+
 
 /* x64 Structured Exception Handling unwind info.  */
 
Index: dwarf2out.c
===
--- dwarf2out.c	(revision 259205)
+++ dwarf2out.c	(working copy)
@@ -31807,7 +31807,11 @@ dwarf2out_early_finish (const char *file
   early_dwarf_finished = true;
 
   /* Do not generate DWARF assembler now when not producing LTO bytecode.  */
-  if (!flag_generate_lto && !flag_generate_offload)
+  if ((!flag_generate_lto && !flag_generate_offload)
+  /* FIXME: Disable debug info generation for PE-COFF targets since the
+	 copy_lto_debug_sections operation of the simple object support in
+	 libiberty is not implemented for them yet.  */
+  || TARGET_PECOFF)
 return;
 
   /* Now as we are going to output for LTO initialize sections and labels
Index: lto-wrapper.c
===
--- lto-wrapper.c	(revision 259205)
+++ lto-wrapper.c	(working copy)
@@ -983,7 +983,7 @@ debug_objcopy (const char *infile)
   infile = fname;
   inoff = (off_t) loffset;
 }
-  int infd = open (infile, O_RDONLY);
+  int infd = open (infile, O_RDONLY | O_BINARY);
   if (infd == -1)
 return NULL;
   simple_object_read *inobj = simple_object_start_read (infd, inoff,


[og7, nvptx, committed] Fix propagation of branch cond in vw-neutered code

2018-04-12 Thread Tom de Vries

Hi,

Currently, when we enable -mlong-vector-in-workers in gemm.f90, we get:
...
  {
.reg.u32%tidy;
.reg.u64%t_bcast;
.reg.u64%y64;
mov.u32 %tidy, %tid.y;
cvt.u64.u32 %y64, %tidy;
add.u64 %y64, %y64, 1;
cvta.shared.u64 %t_bcast, __oacc_bcast;
mad.lo.u64  %r166, %y64, 104, %t_bcast;
  }

  @ %r179 bra.uni $L28;
  @ %r174 bra $L29;
  ...
  setp.le.s32 %r114,%r113,0;
  selp.u32 %r182,1,0,%r114;
  st.u32 [%r166],%r182;
 $L29:
 $L28:

  bar.sync %r167,128;

  ld.u32 %r183,[%r166];
  setp.ne.u32 %r114,%r183,0;

  bar.sync %r167,128;

  @ %r114 bra.uni $L1
...

The branch condition %r114 is computed in a W0V0 region, and then 
broadcast to a WAVA region. The broadcast is done using a partition of 
the broadcast buffer at %r166, but this is a worker-specific buffer.


So since the writing of the buffer is done in worker 0 only, the read in 
workers other than 0 is reading uninitialized memory.


This patch fixes this by using the generic broadcast buffer in this 
case, rather than a worker-specific one.


Built x86_64 with nvptx accelerator and tested libgomp.

Committed to og7.

Thanks,
- Tom
[nvptx] Fix propagation of branch cond in vw-neutered code

2018-04-12  Tom de Vries  

	PR target/85246
	* config/nvptx/nvptx.c (nvptx_single): Don't use partitioning when
	propagating branch condition calculated in vector-worker-neutered code.

	* testsuite/libgomp.oacc-fortran/gemm.f90: Use
	-foffload=-mlong-vector-in-workers.

---
 gcc/config/nvptx/nvptx.c| 3 ++-
 libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 547022e..9d011eb 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4306,13 +4306,14 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  broadcast_data_t data;
 	  unsigned size = GET_MODE_SIZE (SImode);
 	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
+	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
 	  rtx barrier = GEN_INT (0);
 	  int threads = 0;
 
 	  data.base = oacc_bcast_sym;
 	  data.ptr = 0;
 
-	  bool use_partitioning_p = (vector
+	  bool use_partitioning_p = (vector && !worker
  && nvptx_mach_max_workers () > 1
  && cfun->machine->bcast_partition);
 	  if (use_partitioning_p)
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90 b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
index ad67dce..744d21e 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/gemm.f90
@@ -1,6 +1,7 @@
 ! Exercise three levels of parallelism using SGEMM from BLAS.
 
 ! { dg-additional-options "-fopenacc-dim=-:-:128" }
+! { dg-additional-options "-foffload=-mlong-vector-in-workers" }
 
 ! Implicitly set vector_length to 128 using -fopenacc-dim.
 subroutine openacc_sgemm (m, n, k, alpha, a, b, beta, c)


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Richard Biener
On Thu, 12 Apr 2018, Martin Liška wrote:

> Hi.
> 
> I'm reminding review request from Richi for generic part
> and Uros/Honza for target part.

Not sure if I missed some important part of the discussion but
for the testcase we want to preserve the tailcall, right?  So
it would be enough to set avoid_libcall to
endp != 0 && CALL_EXPR_TAILCALL (exp) (and thus also handle
stpcpy)?

I'm not sure I like the interfacing of that to emit_block_move_hints
very much.  I'd have used sth like BLOCK_OP_ABORT_ON_LIBCALL
and extend the interface in a way to return what kind of method it
chose rather than just a bool.

Not sure what gcc.dg/20050503-1.c did on non-x86 targets - the
test runs on all archs but only x86 is ever tested for a result.

So - I think tail-calling is preferred, and somehow in the PR
the discussion wandered off to whether there's fast implementations
or not - but the testcase looks for a tailcall where the source
was a tailcall, that should be authoritative for the "default"
decision when the hook isn't implemented or doesn't cover the case.

IMO target libraries have to be quite stupid if they have anything
slower than

void *mempcpy (void *dest, const void *src, size_t n)
{
  return memcpy (dest, src, n) + n;
}

which should be not (very much) slower than a non-tailcall memcpy call.

So -- remove the hook and instead use CALL_EXPR_TAILCALL (exp) instead
of its result.

Thanks,
Richard.

Re: [PATCH] Make redirection only for target_clones: V2 (PR ipa/85329).

2018-04-12 Thread Jan Hubicka
> 2018-04-12  Martin Liska  
> 
>   PR ipa/85329
>   * multiple_target.c (create_dispatcher_calls): Set apostrophes
>   for target_clone error message.
>   (separate_attrs): Add new argument and check for an empty
>   string.
>   (expand_target_clones): Handle it.
>   (ipa_target_clone): Make redirection just for target_clones
>   functions.
> 
> gcc/testsuite/ChangeLog:
> 
> 2018-04-12  Martin Liska  
> 
>   PR ipa/85329
>   * g++.dg/ext/pr85329.C: New test.
>   * gcc.target/i386/mvc12.c: New test.
> @@ -413,7 +426,11 @@ expand_target_clones (struct cgraph_node *node, bool 
> definition)
>tree attributes = make_attribute ("target", "default",
>   DECL_ATTRIBUTES (node->decl));
>DECL_ATTRIBUTES (node->decl) = attributes;
> +  DECL_COMDAT (node->decl) = 0;
> +  DECL_WEAK (node->decl) = 0;
> +  DECL_ARTIFICIAL (node->decl) = 1;
>node->local.local = false;
> +  node->set_comdat_group (NULL);

If you make C++ inline and get the idea to use target cloning attribute on this,
this will likely lead to link error if you compile multiple files because you
turn comdat to non-comdat.

For comdats this would effectively need to become C++ abi extension and we would
need to define comdat sections for these.  Perhaps easiest way is to simply
reject the attribute on comdats and probably also extern functions?

Otherwise patch looks OK.
Honza


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Jan Hubicka
> Hi.
> 
> I'm reminding review request from Richi for generic part
> and Uros/Honza for target part.

OK for i386 bits.
Honza
> 
> Thanks,
> Martin


[og7, nvptx] Simplify logic in nvptx_single

2018-04-12 Thread Tom de Vries

Hi,

this patch simplifies the logic in nvptx_single.

Built x86_64 with nvptx accelerator and tested libgomp.

Thanks,
- Tom
[nvptx] Simplify logic in nvptx_single

2018-04-12  Tom de Vries  

	* config/nvptx/nvptx.c (nvptx_single): Simplify init of vector variable.
	Add and use variable use_partitioning_p.

---
 gcc/config/nvptx/nvptx.c | 28 +++-
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 3c48c14..547022e 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4305,22 +4305,24 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	 we should never have worker mode only. */
 	  broadcast_data_t data;
 	  unsigned size = GET_MODE_SIZE (SImode);
-	  bool vector = true;
+	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
 	  rtx barrier = GEN_INT (0);
 	  int threads = 0;
 
-	  if (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask)
-	vector = false;
-
 	  data.base = oacc_bcast_sym;
 	  data.ptr = 0;
 
-	  if (vector
-	  && nvptx_mach_max_workers () > 1
-	  && cfun->machine->bcast_partition)
-	data.base = cfun->machine->bcast_partition;
-
+	  bool use_partitioning_p = (vector
+ && nvptx_mach_max_workers () > 1
+ && cfun->machine->bcast_partition);
+	  if (use_partitioning_p)
+	{
+	  data.base = cfun->machine->bcast_partition;
+	  barrier = cfun->machine->sync_bar;
+	  threads = nvptx_mach_vector_length ();
+	}
 	  gcc_assert (data.base != NULL);
+	  gcc_assert (barrier);
 
 	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
 	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
@@ -4335,14 +4337,6 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 		vector),
 			before);
 
-	  if (vector
-	  && nvptx_mach_max_workers () > 1
-	  && cfun->machine->sync_bar)
-	{
-	  barrier = cfun->machine->sync_bar;
-	  threads = nvptx_mach_vector_length ();
-	}
-
 	  /* Barrier so other workers can see the write.  */
 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
 	  data.offset = 0;


Re: [PATCH] Don't mark IFUNC resolver as only called directly

2018-04-12 Thread H.J. Lu
On Thu, Apr 12, 2018 at 5:17 AM, Jan Hubicka  wrote:
>> On Thu, Apr 12, 2018 at 1:29 PM, H.J. Lu  wrote:
>> > Since IFUNC resolver is called indirectly, don't mark IFUNC resolver as
>> > only called directly.
>> >
>> > OK for trunk?
>> >
>> >
>> > H.J.
>> > ---
>> > gcc/
>> >
>> > PR target/85345
>> > * cgraph.h: Include "stringpool.h" and "attribs.h".
>> > (cgraph_node::only_called_directly_or_aliased_p): Return false
>> > for IFUNC resolver.
>> >
>> > gcc/testsuite/
>> >
>> > PR target/85345
>> > * gcc.target/i386/pr85345.c: New test.
>> > ---
>> >  gcc/cgraph.h|  5 +++-
>> >  gcc/testsuite/gcc.target/i386/pr85345.c | 44 
>> > +
>> >  2 files changed, 48 insertions(+), 1 deletion(-)
>> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr85345.c
>> >
>> > diff --git a/gcc/cgraph.h b/gcc/cgraph.h
>> > index d1ef8408497..9e195824fcc 100644
>> > --- a/gcc/cgraph.h
>> > +++ b/gcc/cgraph.h
>> > @@ -24,6 +24,8 @@ along with GCC; see the file COPYING3.  If not see
>> >  #include "profile-count.h"
>> >  #include "ipa-ref.h"
>> >  #include "plugin-api.h"
>> > +#include "stringpool.h"
>> > +#include "attribs.h"
>> >
>> >  class ipa_opt_pass_d;
>> >  typedef ipa_opt_pass_d *ipa_opt_pass;
>> > @@ -2894,7 +2896,8 @@ cgraph_node::only_called_directly_or_aliased_p (void)
>> >   && !DECL_STATIC_CONSTRUCTOR (decl)
>> >   && !DECL_STATIC_DESTRUCTOR (decl)
>> >   && !used_from_object_file_p ()
>> > - && !externally_visible);
>> > + && !externally_visible
>> > + && !lookup_attribute ("ifunc", DECL_ATTRIBUTES (decl)));
>>
>> How's it handled for our own generated resolver functions?  That is,
>> isn't there sth cheaper than doing a lookup_attribute here?  I see
>> that neither make_dispatcher_decl nor ix86_get_function_versions_dispatcher
>> adds the 'ifunc' attribute (though they are TREE_PUBLIC there).
>
> Is there any drawback of setting force_output flag?
> Honza

Setting force_output may prevent some optimizations.  Can we add a bit
for IFUNC resolver?

-- 
H.J.


Re: [PATCH] Don't mark IFUNC resolver as only called directly

2018-04-12 Thread H.J. Lu
On Thu, Apr 12, 2018 at 5:13 AM, Richard Biener
 wrote:
> On Thu, Apr 12, 2018 at 1:29 PM, H.J. Lu  wrote:
>> Since IFUNC resolver is called indirectly, don't mark IFUNC resolver as
>> only called directly.
>>
>> OK for trunk?
>>
>>
>> H.J.
>> ---
>> gcc/
>>
>> PR target/85345
>> * cgraph.h: Include "stringpool.h" and "attribs.h".
>> (cgraph_node::only_called_directly_or_aliased_p): Return false
>> for IFUNC resolver.
>>
>> gcc/testsuite/
>>
>> PR target/85345
>> * gcc.target/i386/pr85345.c: New test.
>> ---
>>  gcc/cgraph.h|  5 +++-
>>  gcc/testsuite/gcc.target/i386/pr85345.c | 44 
>> +
>>  2 files changed, 48 insertions(+), 1 deletion(-)
>>  create mode 100644 gcc/testsuite/gcc.target/i386/pr85345.c
>>
>> diff --git a/gcc/cgraph.h b/gcc/cgraph.h
>> index d1ef8408497..9e195824fcc 100644
>> --- a/gcc/cgraph.h
>> +++ b/gcc/cgraph.h
>> @@ -24,6 +24,8 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "profile-count.h"
>>  #include "ipa-ref.h"
>>  #include "plugin-api.h"
>> +#include "stringpool.h"
>> +#include "attribs.h"
>>
>>  class ipa_opt_pass_d;
>>  typedef ipa_opt_pass_d *ipa_opt_pass;
>> @@ -2894,7 +2896,8 @@ cgraph_node::only_called_directly_or_aliased_p (void)
>>   && !DECL_STATIC_CONSTRUCTOR (decl)
>>   && !DECL_STATIC_DESTRUCTOR (decl)
>>   && !used_from_object_file_p ()
>> - && !externally_visible);
>> + && !externally_visible
>> + && !lookup_attribute ("ifunc", DECL_ATTRIBUTES (decl)));
>
> How's it handled for our own generated resolver functions?  That is,
> isn't there sth cheaper than doing a lookup_attribute here?  I see
> that neither make_dispatcher_decl nor ix86_get_function_versions_dispatcher
> adds the 'ifunc' attribute (though they are TREE_PUBLIC there).
>

ext/mv*.C tests failed to compile:

error: '-fcf-protection=full' requires Intel CET support. Use -mcet or
both of -mibt and -mshstk options to enable CET

with -fcf-protection -mcet.   So it is unsupported.

-- 
H.J.


Re: [PATCH] Make redirection only for target_clones: V2 (PR ipa/85329).

2018-04-12 Thread Martin Liška
Forgot to add the patch.

Martin
>From fb1bbf142af6668eeb1bdfeec96920de2f0edb21 Mon Sep 17 00:00:00 2001
From: marxin 
Date: Thu, 12 Apr 2018 12:15:17 +0200
Subject: [PATCH] Make redirection only for target_clones: V2 (PR ipa/85329).

gcc/ChangeLog:

2018-04-12  Martin Liska  

	PR ipa/85329
	* multiple_target.c (create_dispatcher_calls): Set apostrophes
	for target_clone error message.
	(separate_attrs): Add new argument and check for an empty
	string.
	(expand_target_clones): Handle it.
	(ipa_target_clone): Make redirection just for target_clones
	functions.

gcc/testsuite/ChangeLog:

2018-04-12  Martin Liska  

	PR ipa/85329
	* g++.dg/ext/pr85329.C: New test.
	* gcc.target/i386/mvc12.c: New test.
---
 gcc/multiple_target.c | 43 ---
 gcc/testsuite/g++.dg/ext/pr85329.C| 19 
 gcc/testsuite/gcc.target/i386/mvc12.c | 11 +
 3 files changed, 60 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/ext/pr85329.C
 create mode 100644 gcc/testsuite/gcc.target/i386/mvc12.c

diff --git a/gcc/multiple_target.c b/gcc/multiple_target.c
index b006a5ab6ec..2357e458ec8 100644
--- a/gcc/multiple_target.c
+++ b/gcc/multiple_target.c
@@ -88,7 +88,7 @@ create_dispatcher_calls (struct cgraph_node *node)
   if (!idecl)
 {
   error_at (DECL_SOURCE_LOCATION (node->decl),
-		"default target_clones attribute was not set");
+		"default %<target_clones%> attribute was not set");
   return;
 }
 
@@ -216,26 +216,30 @@ get_attr_str (tree arglist, char *attr_str)
 }
 
 /* Return number of attributes separated by comma and put them into ARGS.
-   If there is no DEFAULT attribute return -1.  */
+   If there is no DEFAULT attribute return -1.  If there is an empty
+   string in attribute return -2.  */
 
 static int
-separate_attrs (char *attr_str, char **attrs)
+separate_attrs (char *attr_str, char **attrs, int attrnum)
 {
   int i = 0;
-  bool has_default = false;
+  int default_count = 0;
 
   for (char *attr = strtok (attr_str, ",");
attr != NULL; attr = strtok (NULL, ","))
 {
   if (strcmp (attr, "default") == 0)
 	{
-	  has_default = true;
+	  default_count++;
 	  continue;
 	}
   attrs[i++] = attr;
 }
-  if (!has_default)
+  if (default_count == 0)
 return -1;
+  else if (i + default_count < attrnum)
+return -2;
+
   return i;
 }
 
@@ -321,7 +325,7 @@ expand_target_clones (struct cgraph_node *node, bool definition)
 {
   warning_at (DECL_SOURCE_LOCATION (node->decl),
 		  0,
-		  "single target_clones attribute is ignored");
+		  "single %<target_clones%> attribute is ignored");
   return false;
 }
 
@@ -345,7 +349,7 @@ expand_target_clones (struct cgraph_node *node, bool definition)
   int attrnum = get_attr_str (arglist, attr_str);
   char **attrs = XNEWVEC (char *, attrnum);
 
-  attrnum = separate_attrs (attr_str, attrs);
+  attrnum = separate_attrs (attr_str, attrs, attrnum);
   if (attrnum == -1)
 {
   error_at (DECL_SOURCE_LOCATION (node->decl),
@@ -354,6 +358,14 @@ expand_target_clones (struct cgraph_node *node, bool definition)
   XDELETEVEC (attr_str);
   return false;
 }
+  else if (attrnum == -2)
+{
+  error_at (DECL_SOURCE_LOCATION (node->decl),
+		"an empty string cannot be in %<target_clones%> attribute");
+  XDELETEVEC (attrs);
+  XDELETEVEC (attr_str);
+  return false;
+}
 
   cgraph_function_version_info *decl1_v = NULL;
   cgraph_function_version_info *decl2_v = NULL;
@@ -382,6 +394,7 @@ expand_target_clones (struct cgraph_node *node, bool definition)
   DECL_ATTRIBUTES (new_node->decl) = attributes;
   location_t saved_loc = input_location;
   input_location = DECL_SOURCE_LOCATION (node->decl);
+
   if (!targetm.target_option.valid_attribute_p (new_node->decl, NULL,
 		TREE_VALUE (attributes),
 		0))
@@ -413,7 +426,11 @@ expand_target_clones (struct cgraph_node *node, bool definition)
   tree attributes = make_attribute ("target", "default",
 DECL_ATTRIBUTES (node->decl));
   DECL_ATTRIBUTES (node->decl) = attributes;
+  DECL_COMDAT (node->decl) = 0;
+  DECL_WEAK (node->decl) = 0;
+  DECL_ARTIFICIAL (node->decl) = 1;
   node->local.local = false;
+  node->set_comdat_group (NULL);
   location_t saved_loc = input_location;
   input_location = DECL_SOURCE_LOCATION (node->decl);
   bool ret
@@ -427,14 +444,14 @@ static unsigned int
 ipa_target_clone (void)
 {
   struct cgraph_node *node;
+  auto_vec<cgraph_node *> to_dispatch;
 
-  bool target_clone_pass = false;
   FOR_EACH_FUNCTION (node)
-target_clone_pass |= expand_target_clones (node, node->definition);
+if (expand_target_clones (node, node->definition))
+  to_dispatch.safe_push (node);
 
-  if (target_clone_pass)
-FOR_EACH_FUNCTION (node)
-  create_dispatcher_calls (node);
+  for (unsigned i = 0; i < to_dispatch.length (); i++)
+create_dispatcher_calls (to_dispatch[i]);
 
   return 0;
 }
diff --git a/gcc/testsuite/g++.dg/ext/pr85329.C b/gcc/testsuite/g++.dg/ext/

[PATCH] Make redirection only for target_clones: V2 (PR ipa/85329).

2018-04-12 Thread Martin Liška
Hi.

I'm sending V2. The patch adjusts:
- make redirection just for target_clones, done simply by recording nodes
  where expand_target_clones return true
- reset various DECL_* flags on default version, needed for ipa-visibility 
assert I've seen
- handle empty string in target_clones: __attribute__((target_clones("",..
  I saw that during reduction of the ICE.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Martin


[PATCH] libgcc/CET: Add _CET_ENDBR to __stack_split_initialize

2018-04-12 Thread H.J. Lu
Program received signal SIGSEGV, Segmentation fault.
__stack_split_initialize ()
at /export/gnu/import/git/sources/gcc/libgcc/config/i386/morestack.S:751
751 leaq-16000(%rsp),%rax   # We should have at least 16K.
Missing separate debuginfos, use: dnf debuginfo-install 
libgcc-8.0.1-0.21.0.fc28.x86_64
(gdb) disass
Dump of assembler code for function __stack_split_initialize:
=> 0x00402858 <+0>: lea-0x3e80(%rsp),%rax
   0x00402860 <+8>: mov%rax,%fs:0x70
   0x00402869 <+17>:sub$0x8,%rsp
   0x0040286d <+21>:mov%rsp,%rdi
   0x00402870 <+24>:mov$0x3e80,%esi
   0x00402875 <+29>:callq  0x401810 
<__generic_morestack_set_initial_sp>
   0x0040287a <+34>:add$0x8,%rsp
   0x0040287e <+38>:retq
End of assembler dump.
(gdb)

This patch adds the missing ENDBR to __stack_split_initialize.

OK for trunk?

H.J.
---
PR libgcc/85379
* config/i386/morestack.S (__stack_split_initialize): Add
_CET_ENDBR.
---
 libgcc/config/i386/morestack.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libgcc/config/i386/morestack.S b/libgcc/config/i386/morestack.S
index eca441a2867..99e65eaaff4 100644
--- a/libgcc/config/i386/morestack.S
+++ b/libgcc/config/i386/morestack.S
@@ -730,6 +730,7 @@ __morestack_large_model:
 #endif
 
 __stack_split_initialize:
+   _CET_ENDBR
 
 #ifndef __x86_64__
 
-- 
2.14.3



Re: [wwwdocs] document new options in gcc-8/changes.html

2018-04-12 Thread Jason Merrill
On Wed, Apr 11, 2018 at 10:02 PM, Martin Sebor  wrote:
> On 04/04/2018 05:03 PM, Paolo Carlini wrote:
>>
>> Hi Martin
>>
>> On 05/04/2018 00:28, Martin Sebor wrote:
>>>
>>> +  implementations do suppresses the warning.
>>
>> suppress
>
>
> I was about to fix this but re-reading the full sentence made
> me realize it's correct as is:
>
>   Note that due to GCC bug 82944, defining strncat, strncpy, or
>   stpncpy as a macro in a system header as some implementations
>   do suppresses the warning.
>
> I've added a comma after the suppresses to make it clearer and
> checked in revision 1.63.

Sounds good, though you will also want a matching comma after "system header".

Jason


Re: [PATCH] Prefer mempcpy to memcpy on x86_64 target (PR middle-end/81657).

2018-04-12 Thread Martin Liška
Hi.

This is a reminder of the pending review request: Richi for the generic part
and Uros/Honza for the target part.

Thanks,
Martin


Re: [PATCH] Fix non-AVX512VL handling of lo extraction from AVX512F xmm16+ (PR target/85328)

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 01:46:40PM +0300, Kirill Yukhin wrote:
> 
> Hello Jakub!
> 
> > On 11 Apr 2018, at 16:27, Jakub Jelinek  wrote:
> > In lots of patterns we assume that we never see xmm16+ hard registers
> > with 128-bit and 256-bit vector modes when not -mavx512vl, because
> > HARD_REGNO_MODE_OK refuses those.
> > Unfortunately, as this testcase and patch shows, the vec_extract_lo*
> > splitters work as a loophole around this, we happily create instructions
> > like (set (reg:V32QI xmm5) (reg:V32QI xmm16)) and then hard register
> > propagation can propagate the V32QI xmm16 into other insns like vpand.
> > 
> > The following patch fixes it by making sure we never create such registers,
> > just emit (set (reg:V64QI xmm5) (reg:V64QI xmm16)) instead, which by copying
> > all the 512 bits also copies the low bits, and as the destination is
> > originally V32QI which is not HARD_REGNO_MODE_OK in xmm16+, this should be
> > fine.
> > 
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> Patch is OK for trunk.

I've posted an updated version of this patch later on in
https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00563.html
Is that one ok for trunk instead?

And sorry for not getting it right the first time.

Jakub


Re: [PATCH] Don't mark IFUNC resolver as only called directly

2018-04-12 Thread Jan Hubicka
> On Thu, Apr 12, 2018 at 1:29 PM, H.J. Lu  wrote:
> > Since IFUNC resolver is called indirectly, don't mark IFUNC resolver as
> > only called directly.
> >
> > OK for trunk?
> >
> >
> > H.J.
> > ---
> > gcc/
> >
> > PR target/85345
> > * cgraph.h: Include "stringpool.h" and "attribs.h".
> > (cgraph_node::only_called_directly_or_aliased_p): Return false
> > for IFUNC resolver.
> >
> > gcc/testsuite/
> >
> > PR target/85345
> > * gcc.target/i386/pr85345.c: New test.
> > ---
> >  gcc/cgraph.h|  5 +++-
> >  gcc/testsuite/gcc.target/i386/pr85345.c | 44 
> > +
> >  2 files changed, 48 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr85345.c
> >
> > diff --git a/gcc/cgraph.h b/gcc/cgraph.h
> > index d1ef8408497..9e195824fcc 100644
> > --- a/gcc/cgraph.h
> > +++ b/gcc/cgraph.h
> > @@ -24,6 +24,8 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "profile-count.h"
> >  #include "ipa-ref.h"
> >  #include "plugin-api.h"
> > +#include "stringpool.h"
> > +#include "attribs.h"
> >
> >  class ipa_opt_pass_d;
> >  typedef ipa_opt_pass_d *ipa_opt_pass;
> > @@ -2894,7 +2896,8 @@ cgraph_node::only_called_directly_or_aliased_p (void)
> >   && !DECL_STATIC_CONSTRUCTOR (decl)
> >   && !DECL_STATIC_DESTRUCTOR (decl)
> >   && !used_from_object_file_p ()
> > - && !externally_visible);
> > + && !externally_visible
> > + && !lookup_attribute ("ifunc", DECL_ATTRIBUTES (decl)));
> 
> How's it handled for our own generated resolver functions?  That is,
> isn't there something cheaper than doing a lookup_attribute here?  I see
> that neither make_dispatcher_decl nor ix86_get_function_versions_dispatcher
> adds the 'ifunc' attribute (though they are TREE_PUBLIC there).

Is there any drawback of setting force_output flag?
Honza
> 
> Richard.
> 
> >  }
> >
> >  /* Return true when function can be removed from callgraph
> > diff --git a/gcc/testsuite/gcc.target/i386/pr85345.c 
> > b/gcc/testsuite/gcc.target/i386/pr85345.c
> > new file mode 100644
> > index 000..63f771294ad
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr85345.c
> > @@ -0,0 +1,44 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fcf-protection -mcet" } */
> > +/* { dg-final { scan-assembler-times {\mendbr} 4 } } */
> > +
> > +int resolver_fn = 0;
> > +int resolved_fn = 0;
> > +
> > +static inline void
> > +do_it_right_at_runtime_A (void)
> > +{
> > +  resolved_fn++;
> > +}
> > +
> > +static inline void
> > +do_it_right_at_runtime_B (void)
> > +{
> > +  resolved_fn++;
> > +}
> > +
> > +static inline void do_it_right_at_runtime (void);
> > +
> > +void do_it_right_at_runtime (void)
> > +  __attribute__ ((ifunc ("resolve_do_it_right_at_runtime")));
> > +
> > +extern int r;
> > +static void (*resolve_do_it_right_at_runtime (void)) (void)
> > +{
> > +  resolver_fn++;
> > +
> > +  typeof(do_it_right_at_runtime) *func;
> > +  if (r & 1)
> > +func = do_it_right_at_runtime_A;
> > +  else
> > +func = do_it_right_at_runtime_B;
> > +
> > +  return (void *) func;
> > +}
> > +
> > +int
> > +main ()
> > +{
> > +  do_it_right_at_runtime ();
> > +  return 0;
> > +}
> > --
> > 2.14.3
> >


Re: [PATCH] Don't mark IFUNC resolver as only called directly

2018-04-12 Thread Richard Biener
On Thu, Apr 12, 2018 at 1:29 PM, H.J. Lu  wrote:
> Since IFUNC resolver is called indirectly, don't mark IFUNC resolver as
> only called directly.
>
> OK for trunk?
>
>
> H.J.
> ---
> gcc/
>
> PR target/85345
> * cgraph.h: Include "stringpool.h" and "attribs.h".
> (cgraph_node::only_called_directly_or_aliased_p): Return false
> for IFUNC resolver.
>
> gcc/testsuite/
>
> PR target/85345
> * gcc.target/i386/pr85345.c: New test.
> ---
>  gcc/cgraph.h|  5 +++-
>  gcc/testsuite/gcc.target/i386/pr85345.c | 44 
> +
>  2 files changed, 48 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr85345.c
>
> diff --git a/gcc/cgraph.h b/gcc/cgraph.h
> index d1ef8408497..9e195824fcc 100644
> --- a/gcc/cgraph.h
> +++ b/gcc/cgraph.h
> @@ -24,6 +24,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "profile-count.h"
>  #include "ipa-ref.h"
>  #include "plugin-api.h"
> +#include "stringpool.h"
> +#include "attribs.h"
>
>  class ipa_opt_pass_d;
>  typedef ipa_opt_pass_d *ipa_opt_pass;
> @@ -2894,7 +2896,8 @@ cgraph_node::only_called_directly_or_aliased_p (void)
>   && !DECL_STATIC_CONSTRUCTOR (decl)
>   && !DECL_STATIC_DESTRUCTOR (decl)
>   && !used_from_object_file_p ()
> - && !externally_visible);
> + && !externally_visible
> + && !lookup_attribute ("ifunc", DECL_ATTRIBUTES (decl)));

How's it handled for our own generated resolver functions?  That is,
isn't there something cheaper than doing a lookup_attribute here?  I see
that neither make_dispatcher_decl nor ix86_get_function_versions_dispatcher
adds the 'ifunc' attribute (though they are TREE_PUBLIC there).

Richard.

>  }
>
>  /* Return true when function can be removed from callgraph
> diff --git a/gcc/testsuite/gcc.target/i386/pr85345.c 
> b/gcc/testsuite/gcc.target/i386/pr85345.c
> new file mode 100644
> index 000..63f771294ad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr85345.c
> @@ -0,0 +1,44 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fcf-protection -mcet" } */
> +/* { dg-final { scan-assembler-times {\mendbr} 4 } } */
> +
> +int resolver_fn = 0;
> +int resolved_fn = 0;
> +
> +static inline void
> +do_it_right_at_runtime_A (void)
> +{
> +  resolved_fn++;
> +}
> +
> +static inline void
> +do_it_right_at_runtime_B (void)
> +{
> +  resolved_fn++;
> +}
> +
> +static inline void do_it_right_at_runtime (void);
> +
> +void do_it_right_at_runtime (void)
> +  __attribute__ ((ifunc ("resolve_do_it_right_at_runtime")));
> +
> +extern int r;
> +static void (*resolve_do_it_right_at_runtime (void)) (void)
> +{
> +  resolver_fn++;
> +
> +  typeof(do_it_right_at_runtime) *func;
> +  if (r & 1)
> +func = do_it_right_at_runtime_A;
> +  else
> +func = do_it_right_at_runtime_B;
> +
> +  return (void *) func;
> +}
> +
> +int
> +main ()
> +{
> +  do_it_right_at_runtime ();
> +  return 0;
> +}
> --
> 2.14.3
>


Re: [PATCH] Disable -gsplit-dwarf for all LTO debug

2018-04-12 Thread Richard Biener
On Thu, 12 Apr 2018, Jakub Jelinek wrote:

> On Thu, Apr 12, 2018 at 01:35:46PM +0200, Richard Biener wrote:
> > 
> > The following disables split-dwarf for the LTO part of the early debug
> > (keeping it for the fat part) and makes sure the driver doesn't
> > see -gsplit-dwarf in effect.
> > 
> > That works for all but the compile stage and slim objects
> > (the default) which then ends up generating an empty .dwo file.
> > I'm not sure where to prune for this case given that 
> > -fno-fat-lto-objects seems to be just ignored if it isn't supported.
> > 
> > Note that we need the dwarf2out.c part as otherwise the late
> > references cannot be resolved since the debug is copied from .o to .dwo
> > files early.
> 
> Wouldn't it be better to just sorry on the -flto -gsplit-dwarf combination?
> It really isn't clear what the user is asking for in that case and what
> exactly he wants.

Certainly easier though then not necessary at this point (nor is this
patch of course).

It might be reasonable to keep -gsplit-dwarf working for the fat part
of the object -- OTOH I think that support for fat LTO objects should
be removed at some point as well...

Richard.


Re: [PATCH] Disable -gsplit-dwarf for all LTO debug

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 01:35:46PM +0200, Richard Biener wrote:
> 
> The following disables split-dwarf for the LTO part of the early debug
> (keeping it for the fat part) and makes sure the driver doesn't
> see -gsplit-dwarf in effect.
> 
> That works for all but the compile stage and slim objects
> (the default) which then ends up generating an empty .dwo file.
> I'm not sure where to prune for this case given that 
> -fno-fat-lto-objects seems to be just ignored if it isn't supported.
> 
> Note that we need the dwarf2out.c part as otherwise the late
> references cannot be resolved since the debug is copied from .o to .dwo
> files early.

Wouldn't it be better to just sorry on the -flto -gsplit-dwarf combination?
It really isn't clear what the user is asking for in that case and what
exactly he wants.

Jakub


Re: Patch ping^3

2018-04-12 Thread Jakub Jelinek
On Thu, Apr 12, 2018 at 10:41:22AM +0200, Richard Biener wrote:
> On Thu, 12 Apr 2018, Jakub Jelinek wrote:
> > I'd like to ping the
> > 
> > http://gcc.gnu.org/ml/gcc-patches/2018-03/msg01244.html
> >   - PR83157 - improve debug info for x86 setcc peepholes
> > 
> > patch.  Thanks.
> 
> OK for stage1 and backporting after it soaked there for a while.
> I'm too unfamiliar with the code to approve it at this point
> (esp. concerned about the cselib part affecting others than
> var-tracking in unexpected ways).

Thanks.

It shouldn't affect anything other than var-tracking,
as it is guarded with cselib_record_sets_hook != NULL and var-tracking is
the only cselib user that ever sets it to non-NULL (and clears afterwards).
Even if that would be left out, no other cselib user preserves cselib
VALUEs, so ! PRESERVED_VALUE_P (sets[n_sets + i].src_elt->val_rtx)
would be always true for non-var-tracking and the second loop wouldn't do
anything.

Jakub


[PATCH] Disable -gsplit-dwarf for all LTO debug

2018-04-12 Thread Richard Biener

The following disables split-dwarf for the LTO part of the early debug
(keeping it for the fat part) and makes sure the driver doesn't
see -gsplit-dwarf in effect.

That works for all but the compile stage and slim objects
(the default) which then ends up generating an empty .dwo file.
I'm not sure where to prune for this case given that 
-fno-fat-lto-objects seems to be just ignored if it isn't supported.

Note that we need the dwarf2out.c part as otherwise the late
references cannot be resolved since the debug is copied from .o to .dwo
files early.

Opinions?

Thanks,
Richard.

2018-04-12  Richard Biener  

* lto-wrapper.c (run_gcc): Add -gno-split-dwarf to cancel
any such option from compile or link time in a way visible
to the driver.
* dwarf2out.c (add_top_level_skeleton_die_attrs): Avoid adding
DW_AT_GNU_pubnames twice.
(dwarf2out_early_finish): Unconditionally call add_AT_pubnames.
Disable dwarf_split_debug_info around the early LTO DWARF emission
and remove the then-dead code.

Index: gcc/lto-wrapper.c
===
--- gcc/lto-wrapper.c   (revision 259337)
+++ gcc/lto-wrapper.c   (working copy)
@@ -1123,6 +1123,7 @@ run_gcc (unsigned argc, char *argv[])
   append_compiler_options (&argv_obstack, fdecoded_options,
   fdecoded_options_count);
   append_linker_options (&argv_obstack, decoded_options, 
decoded_options_count);
+  obstack_ptr_grow (&argv_obstack, "-gno-split-dwarf");
 
   /* Scan linker driver arguments for things that are of relevance to us.  */
   for (j = 1; j < decoded_options_count; ++j)
Index: gcc/dwarf2out.c
===
--- gcc/dwarf2out.c (revision 259337)
+++ gcc/dwarf2out.c (working copy)
@@ -11105,7 +11105,8 @@ add_top_level_skeleton_die_attrs (dw_die
   add_skeleton_AT_string (die, dwarf_AT (DW_AT_dwo_name), dwo_file_name);
   if (comp_dir != NULL)
 add_skeleton_AT_string (die, DW_AT_comp_dir, comp_dir);
-  add_AT_pubnames (die);
+  if (!get_AT (die, DW_AT_GNU_pubnames))
+add_AT_pubnames (die);
   add_AT_lineptr (die, DW_AT_GNU_addr_base, debug_addr_section_label);
 }
 
@@ -31811,11 +31812,8 @@ dwarf2out_early_finish (const char *file
 note_variable_value (node->die);
 
   /* The AT_pubnames attribute needs to go in all skeleton dies, including
- both the main_cu and all skeleton TUs.  Making this call unconditional
- would end up either adding a second copy of the AT_pubnames attribute, or
- requiring a special case in add_top_level_skeleton_die_attrs.  */
-  if (!dwarf_split_debug_info)
-add_AT_pubnames (comp_unit_die ());
+ both the main_cu and all skeleton TUs.  */
+  add_AT_pubnames (comp_unit_die ());
 
   /* The early debug phase is now finished.  */
   early_dwarf_finished = true;
@@ -31824,6 +31822,11 @@ dwarf2out_early_finish (const char *file
   if (!flag_generate_lto && !flag_generate_offload)
 return;
 
+  /* For the early LTO DWARF we do not want split DWARF because it really
+ doesn't make much sense.  */
+  int saved_dwarf_split_debug_info = dwarf_split_debug_info;
+  dwarf_split_debug_info = 0;
+
   /* Now as we are going to output for LTO initialize sections and labels
  to the LTO variants.  We don't need a random-seed postfix as other
  LTO sections as linking the LTO debug sections into one in a partial
@@ -31858,12 +31861,6 @@ dwarf2out_early_finish (const char *file
 
   save_macinfo_strings ();
 
-  if (dwarf_split_debug_info)
-{
-  unsigned int index = 0;
-  debug_str_hash->traverse_noresize (&index);
-}
-
   /* Output all of the compilation units.  We put the main one last so that
  the offsets are available to output_pubnames.  */
   for (limbo_die_node *node = limbo_die_list; node; node = node->next)
@@ -31884,9 +31881,7 @@ dwarf2out_early_finish (const char *file
  attributes.  */
   if (debug_info_level >= DINFO_LEVEL_TERSE)
 add_AT_lineptr (ctnode->root_die, DW_AT_stmt_list,
-(!dwarf_split_debug_info
- ? debug_line_section_label
- : debug_skeleton_line_section_label));
+debug_line_section_label);
 
   output_comdat_type_unit (ctnode);
   *slot = ctnode;
@@ -31939,6 +31934,8 @@ dwarf2out_early_finish (const char *file
 
   /* Switch back to the text section.  */
   switch_to_section (text_section);
+
+  dwarf_split_debug_info = saved_dwarf_split_debug_info;
 }
 
 /* Reset all state within dwarf2out.c so that we can rerun the compiler


[PATCH] Don't mark IFUNC resolver as only called directly

2018-04-12 Thread H.J. Lu
Since IFUNC resolver is called indirectly, don't mark IFUNC resolver as
only called directly.

OK for trunk?


H.J.
---
gcc/

PR target/85345
* cgraph.h: Include "stringpool.h" and "attribs.h".
(cgraph_node::only_called_directly_or_aliased_p): Return false
for IFUNC resolver.

gcc/testsuite/

PR target/85345
* gcc.target/i386/pr85345.c: New test.
---
 gcc/cgraph.h|  5 +++-
 gcc/testsuite/gcc.target/i386/pr85345.c | 44 +
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr85345.c

diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index d1ef8408497..9e195824fcc 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -24,6 +24,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "profile-count.h"
 #include "ipa-ref.h"
 #include "plugin-api.h"
+#include "stringpool.h"
+#include "attribs.h"
 
 class ipa_opt_pass_d;
 typedef ipa_opt_pass_d *ipa_opt_pass;
@@ -2894,7 +2896,8 @@ cgraph_node::only_called_directly_or_aliased_p (void)
  && !DECL_STATIC_CONSTRUCTOR (decl)
  && !DECL_STATIC_DESTRUCTOR (decl)
  && !used_from_object_file_p ()
- && !externally_visible);
+ && !externally_visible
+ && !lookup_attribute ("ifunc", DECL_ATTRIBUTES (decl)));
 }
 
 /* Return true when function can be removed from callgraph
diff --git a/gcc/testsuite/gcc.target/i386/pr85345.c 
b/gcc/testsuite/gcc.target/i386/pr85345.c
new file mode 100644
index 000..63f771294ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr85345.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fcf-protection -mcet" } */
+/* { dg-final { scan-assembler-times {\mendbr} 4 } } */
+
+int resolver_fn = 0;
+int resolved_fn = 0;
+
+static inline void
+do_it_right_at_runtime_A (void)
+{
+  resolved_fn++;
+}
+
+static inline void
+do_it_right_at_runtime_B (void)
+{
+  resolved_fn++;
+}
+
+static inline void do_it_right_at_runtime (void);
+
+void do_it_right_at_runtime (void)
+  __attribute__ ((ifunc ("resolve_do_it_right_at_runtime")));
+
+extern int r;
+static void (*resolve_do_it_right_at_runtime (void)) (void)
+{
+  resolver_fn++;
+
+  typeof(do_it_right_at_runtime) *func;
+  if (r & 1)
+func = do_it_right_at_runtime_A;
+  else
+func = do_it_right_at_runtime_B;
+
+  return (void *) func;
+}
+
+int
+main ()
+{
+  do_it_right_at_runtime ();
+  return 0;
+}
-- 
2.14.3



Re: [PATCH] Fix non-AVX512VL handling of lo extraction from AVX512F xmm16+ (PR target/85328)

2018-04-12 Thread Kirill Yukhin


> On 12 Apr 2018, at 13:53, Jakub Jelinek  wrote:
> 
> On Thu, Apr 12, 2018 at 01:46:40PM +0300, Kirill Yukhin wrote:
>> 
>> Hello Jakub!
>> 
>>> On 11 Apr 2018, at 16:27, Jakub Jelinek  wrote:
>>> In lots of patterns we assume that we never see xmm16+ hard registers
>>> with 128-bit and 256-bit vector modes when not -mavx512vl, because
>>> HARD_REGNO_MODE_OK refuses those.
>>> Unfortunately, as this testcase and patch shows, the vec_extract_lo*
>>> splitters work as a loophole around this, we happily create instructions
>>> like (set (reg:V32QI xmm5) (reg:V32QI xmm16)) and then hard register
>>> propagation can propagate the V32QI xmm16 into other insns like vpand.
>>> 
>>> The following patch fixes it by making sure we never create such registers,
>>> just emit (set (reg:V64QI xmm5) (reg:V64QI xmm16)) instead, which by copying
>>> all the 512 bits also copies the low bits, and as the destination is
>>> originally V32QI which is not HARD_REGNO_MODE_OK in xmm16+, this should be
>>> fine.
>>> 
>>> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>> Patch is OK for trunk.
> 
> I've posted an updated version of this patch later on in
> https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00563.html
> Is that one ok for trunk instead?
Yes.

—
Thanks, K
> 
> And sorry for not getting it right the first time.
> 
>   Jakub



Re: [PATCH] Ada: Fix s-oscons.ads generation

2018-04-12 Thread Sebastian Huber

On 07/03/18 14:15, Arnaud Charlet wrote:

The $(GNATLIBCFLAGS) are already included in $(GNATLIBCFLAGS_FOR_C).

We must call the C compiler with the right machine flags.  So, add
$(GNATLIBCFLAGS_FOR_C) to $(OSCONS_EXTRACT).  For example, on a
bi-arch
compiler supporting 32-bit and 64-bit instruction sets we pick
otherwise
only one variant due to a missing -m32 or -m64 flag.

gcc/ada
* gcc-interface/Makefile.in (OSCONS_CPP): Remove redundant
$(GNATLIBCFLAGS).
(OSCONS_EXTRACT): Add $(GNATLIBCFLAGS_FOR_C).

OK, thanks.

Thanks for the quick review. I would like to back port this to GCC 7.

Seems fine to me if it doesn't cause troubles on trunk.


I back ported this to GCC 7 today.

--
Sebastian Huber, embedded brains GmbH

Address : Dornierstr. 4, D-82178 Puchheim, Germany
Phone   : +49 89 189 47 41-16
Fax : +49 89 189 47 41-09
E-Mail  : sebastian.hu...@embedded-brains.de
PGP : Public key available on request.

Diese Nachricht ist keine geschäftliche Mitteilung im Sinne des EHUG.



[PATCH] Fix PR85371

2018-04-12 Thread Richard Biener

This fixes crashes on Darwin with -flto -g because we pass the wrong
(NULL) debug_line_section in

case dw_val_class_lineptr:
  dw2_asm_output_offset (DWARF_OFFSET_SIZE, AT_lbl (a),
 debug_line_section, "%s", name);
  break;

which is because for some reason I used debug_skeleton_line_section
and friends for the early LTO dwarf.  That looks mistaken in the
above light so the following reverts that to use debug_line_section
and friends.

I verified that -flto -g -gsplit-dwarf still "works" (though that combo
doesn't make much sense, if only because .dwo objects for the ltrans
objects end up in /tmp ...).

LTO bootstrap with -g[23] succeeded on x86_64-unknown-linux-gnu,
bootstrap for all languages as well, testing in progress.

Approved by Jakub on IRC so I'll go ahead with this after the above
has finished.

Haven't yet found a convenient place to disable -gsplit-dwarf for
the non-fat part of LTO in a way the driver sees it.  Any hints
appreciated.  There's also the (unwanted) side-effect of
-gsplit-dwarf enabling -ggnu-pubnames.  Disabling on the LTRANS
side alone should be possible by massaging lto-wrapper to append
-gno-split-dwarf I guess.

Richard.

2018-04-12  Richard Biener  

PR lto/85371
* dwarf2out.c (init_sections_and_labels): Use debug_line_section[_label]
for the early LTO debug to properly generate references to it
during DIE emission.  Do not re-use that for the skeleton for
split-dwarf.
(dwarf2out_early_finish): Likewise.

Index: gcc/dwarf2out.c
===
--- gcc/dwarf2out.c (revision 259337)
+++ gcc/dwarf2out.c (working copy)
@@ -28405,14 +28406,6 @@ init_sections_and_labels (bool early_lto
  debug_macinfo_section = get_section (debug_macinfo_section_name,
   SECTION_DEBUG
   | SECTION_EXCLUDE, NULL);
- /* For macro info we have to refer to a debug_line section, so
-similar to split-dwarf emit a skeleton one for early debug.  */
- debug_skeleton_line_section
-   = get_section (DEBUG_LTO_LINE_SECTION,
-  SECTION_DEBUG | SECTION_EXCLUDE, NULL);
- ASM_GENERATE_INTERNAL_LABEL (debug_skeleton_line_section_label,
-  DEBUG_SKELETON_LINE_SECTION_LABEL,
-  generation);
}
   else
{
@@ -28459,6 +28452,13 @@ init_sections_and_labels (bool early_lto
   SECTION_DEBUG | SECTION_EXCLUDE,
   NULL);
}
+  /* For macro info and the file table we have to refer to a
+debug_line section.  */
+  debug_line_section = get_section (DEBUG_LTO_LINE_SECTION,
+   SECTION_DEBUG | SECTION_EXCLUDE, NULL);
+  ASM_GENERATE_INTERNAL_LABEL (debug_line_section_label,
+  DEBUG_LINE_SECTION_LABEL, generation);
+
   debug_str_section = get_section (DEBUG_LTO_STR_SECTION,
   DEBUG_STR_SECTION_FLAGS
   | SECTION_EXCLUDE, NULL);
@@ -31845,7 +31849,7 @@ dwarf2out_early_finish (const char *file
 
   /* AIX Assembler inserts the length, so adjust the reference to match the
  offset expected by debuggers.  */
-  strcpy (dl_section_ref, debug_skeleton_line_section_label);
+  strcpy (dl_section_ref, debug_line_section_label);
   if (XCOFF_DEBUGGING_INFO)
 strcat (dl_section_ref, DWARF_INITIAL_LENGTH_SIZE_STR);
 
@@ -31918,7 +31922,7 @@ dwarf2out_early_finish (const char *file
 
   switch_to_section (debug_macinfo_section);
   ASM_OUTPUT_LABEL (asm_out_file, macinfo_section_label);
-  output_macinfo (debug_skeleton_line_section_label, true);
+  output_macinfo (debug_line_section_label, true);
   dw2_asm_output_data (1, 0, "End compilation unit");
 
   if (flag_fat_lto_objects)
@@ -31929,8 +31933,8 @@ dwarf2out_early_finish (const char *file
 }
 
   /* Emit a skeleton debug_line section.  */
-  switch_to_section (debug_skeleton_line_section);
-  ASM_OUTPUT_LABEL (asm_out_file, debug_skeleton_line_section_label);
+  switch_to_section (debug_line_section);
+  ASM_OUTPUT_LABEL (asm_out_file, debug_line_section_label);
   output_line_info (true);
 
   /* If we emitted any indirect strings, output the string table too.  */


Re: [PATCH] Fix non-AVX512VL handling of lo extraction from AVX512F xmm16+ (PR target/85328)

2018-04-12 Thread Kirill Yukhin

Hello Jakub!

> On 11 Apr 2018, at 16:27, Jakub Jelinek  wrote:
> 
> Hi!
> 
> In lots of patterns we assume that we never see xmm16+ hard registers
> with 128-bit and 256-bit vector modes when not -mavx512vl, because
> HARD_REGNO_MODE_OK refuses those.
> Unfortunately, as this testcase and patch shows, the vec_extract_lo*
> splitters work as a loophole around this, we happily create instructions
> like (set (reg:V32QI xmm5) (reg:V32QI xmm16)) and then hard register
> propagation can propagate the V32QI xmm16 into other insns like vpand.
> 
> The following patch fixes it by making sure we never create such registers,
> just emit (set (reg:V64QI xmm5) (reg:V64QI xmm16)) instead, which by copying
> all the 512 bits also copies the low bits, and as the destination is
> originally V32QI which is not HARD_REGNO_MODE_OK in xmm16+, this should be
> fine.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Patch is OK for trunk.

—
Thanks, K



[Committed] IBM Z: Spectre: Prevent thunk cfi to be emitted with -fno-dwarf2-cfi-asm

2018-04-12 Thread Andreas Krebbel
The CFI magic we emit as part of the indirect branch thunks in order to
have somewhat sane unwind information must not be emitted with
-fno-dwarf2-cfi-asm.

Committed to mainline, gcc-7-branch, and gcc-6-branch.

gcc/ChangeLog:

2018-04-12  Andreas Krebbel  

* config/s390/s390.c (s390_output_indirect_thunk_function): Check
also for flag_dwarf2_cfi_asm.

gcc/testsuite/ChangeLog:

2018-04-12  Andreas Krebbel  

* gcc.target/s390/nobp-no-dwarf2-cfi.c: New test.
---
 gcc/config/s390/s390.c |  2 +-
 gcc/testsuite/gcc.target/s390/nobp-no-dwarf2-cfi.c | 19 +++
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/nobp-no-dwarf2-cfi.c

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 59f5de9..5add598 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -16522,7 +16522,7 @@ s390_output_indirect_thunk_function (unsigned int 
regno, bool z10_p)
  Stopping in the thunk: backtrace will point to the thunk target
  is if it was interrupted by a signal.  For a call this means that
  the call chain will be: caller->callee->thunk   */
-  if (flag_asynchronous_unwind_tables)
+  if (flag_asynchronous_unwind_tables && flag_dwarf2_cfi_asm)
 {
   fputs ("\t.cfi_signal_frame\n", asm_out_file);
   fprintf (asm_out_file, "\t.cfi_return_column %d\n", regno);
diff --git a/gcc/testsuite/gcc.target/s390/nobp-no-dwarf2-cfi.c 
b/gcc/testsuite/gcc.target/s390/nobp-no-dwarf2-cfi.c
new file mode 100644
index 000..75e32a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/nobp-no-dwarf2-cfi.c
@@ -0,0 +1,19 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=z900 --save-temps -mfunction-return-reg=thunk 
-mindirect-branch-table -fno-dwarf2-cfi-asm" } */
+
+/* Make sure that we do not emit .cfi directives when -fno-dwarf2-cfi-asm is 
being used.  */
+
+int
+main ()
+{
+  return 0;
+}
+
+/* 1 x main
+/* { dg-final { scan-assembler-times "jg\t__s390_indirect_jump" 1 } } */
+/* { dg-final { scan-assembler "ex\t" } } */
+
+/* { dg-final { scan-assembler-not "section\t.s390_indirect_jump" } } */
+/* { dg-final { scan-assembler-not "section\t.s390_indirect_call" } } */
+/* { dg-final { scan-assembler "section\t.s390_return_reg" } } */
+/* { dg-final { scan-assembler-not "section\t.s390_return_mem" } } */
-- 
2.9.1



Patch ping^3

2018-04-12 Thread Jakub Jelinek
Hi!

I'd like to ping the

http://gcc.gnu.org/ml/gcc-patches/2018-03/msg01244.html
  - PR83157 - improve debug info for x86 setcc peepholes

patch.  Thanks.

Jakub


Re: Patch ping^3

2018-04-12 Thread Richard Biener
On Thu, 12 Apr 2018, Jakub Jelinek wrote:

> Hi!
> 
> I'd like to ping the
> 
> http://gcc.gnu.org/ml/gcc-patches/2018-03/msg01244.html
>   - PR83157 - improve debug info for x86 setcc peepholes
> 
> patch.  Thanks.

OK for stage1 and backporting after it soaked there for a while.
I'm too unfamiliar with the code to approve it at this point
(esp. concerned about the cselib part affecting others than
var-tracking in unexpected ways).

Thanks,
Richard.


Re: [PATCH] Fix copyprop_hardreg_forward_1 (PR rtl-optimization/85342)

2018-04-12 Thread Richard Biener
On Wed, 11 Apr 2018, Jakub Jelinek wrote:

> Hi!
> 
> When switching regcprop.c to use validate_* and apply_change_group,
> I have added code to restore recog_data.operands[i] if they have been
> replaced after apply_change_group failure.  That is bogus though, when
> apply_change_group fails, recog_data.insn is NULL and rest of recog_data
> structure is complete garbage; and nothing in copyprop_hardreg_forward_1
> seems to use it afterwards anyway, just will call extract_insn on the next
> insn.  Furthermore, the "fixups" were only for the recog_data structure
> operands itself, nothing else, the instruction itself has been already
> corrected by cancel_changes.
> 
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
> trunk?

OK.

Richard.

> 2018-04-11  Jakub Jelinek  
> 
>   PR rtl-optimization/85342
>   * regcprop.c (copyprop_hardreg_forward_1): Remove replaced array, use
>   a bool scalar var inside of the loop instead.  Don't try to update
>   recog_data.operand after failed apply_change_group.
> 
>   * gcc.target/i386/pr85342.c: New test.
> 
> --- gcc/regcprop.c.jj 2018-01-04 00:43:17.996703342 +0100
> +++ gcc/regcprop.c2018-04-11 16:17:29.883575142 +0200
> @@ -751,7 +751,6 @@ copyprop_hardreg_forward_1 (basic_block
>bool is_asm, any_replacements;
>rtx set;
>rtx link;
> -  bool replaced[MAX_RECOG_OPERANDS];
>bool changed = false;
>struct kill_set_value_data ksvd;
>  
> @@ -934,7 +933,7 @@ copyprop_hardreg_forward_1 (basic_block
>eldest live copy that's in an appropriate register class.  */
>for (i = 0; i < n_ops; i++)
>   {
> -   replaced[i] = false;
> +   bool replaced = false;
>  
> /* Don't scan match_operand here, since we've no reg class
>information to pass down.  Any operands that we could
> @@ -951,26 +950,26 @@ copyprop_hardreg_forward_1 (basic_block
> if (recog_data.operand_type[i] == OP_IN)
>   {
> if (op_alt[i].is_address)
> - replaced[i]
> + replaced
> = replace_oldest_value_addr (recog_data.operand_loc[i],
>  alternative_class (op_alt, i),
>  VOIDmode, ADDR_SPACE_GENERIC,
>  insn, vd);
> else if (REG_P (recog_data.operand[i]))
> - replaced[i]
> + replaced
> = replace_oldest_value_reg (recog_data.operand_loc[i],
> alternative_class (op_alt, i),
> insn, vd);
> else if (MEM_P (recog_data.operand[i]))
> - replaced[i] = replace_oldest_value_mem (recog_data.operand[i],
> - insn, vd);
> + replaced = replace_oldest_value_mem (recog_data.operand[i],
> +  insn, vd);
>   }
> else if (MEM_P (recog_data.operand[i]))
> - replaced[i] = replace_oldest_value_mem (recog_data.operand[i],
> - insn, vd);
> + replaced = replace_oldest_value_mem (recog_data.operand[i],
> +  insn, vd);
>  
> /* If we performed any replacement, update match_dups.  */
> -   if (replaced[i])
> +   if (replaced)
>   {
> int j;
> rtx new_rtx;
> @@ -989,13 +988,6 @@ copyprop_hardreg_forward_1 (basic_block
>   {
> if (! apply_change_group ())
>   {
> -   for (i = 0; i < n_ops; i++)
> - if (replaced[i])
> -   {
> - rtx old = *recog_data.operand_loc[i];
> - recog_data.operand[i] = old;
> -   }
> -
> if (dump_file)
>   fprintf (dump_file,
>"insn %u: reg replacements not verified\n",
> --- gcc/testsuite/gcc.target/i386/pr85342.c.jj2018-04-11 
> 16:25:50.564848408 +0200
> +++ gcc/testsuite/gcc.target/i386/pr85342.c   2018-04-11 16:26:05.534856581 
> +0200
> @@ -0,0 +1,29 @@
> +/* PR rtl-optimization/85342 */
> +/* { dg-do compile { target int128 } } */
> +/* { dg-options "-O2 -mavx512vl" } */
> +
> +typedef unsigned char U __attribute__((vector_size (64)));
> +typedef unsigned int V __attribute__((vector_size (64)));
> +typedef unsigned __int128 W __attribute__((vector_size (64)));
> +int i;
> +V g, h, z, k, l, m;
> +U j;
> +
> +W
> +bar (W o, W p)
> +{
> +  U q;
> +  o |= (W){q[0]} >= o;
> +  o += 1 < o;
> +  j |= (U){} == j;
> +  return i + (W)q + (W)g + (W)h + (W)z + o + (W)j + (W)k + (W)l + (W)m + p;
> +}
> +
> +W
> +foo (U u)
> +{
> +  U q;
> +  W r = bar ((W)(U){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> ~0}, (W)q);
> +  u += (U)bar ((W){~0}, r);
> +  return (W)u;
> +}
> 
>   Jakub
> 
> 

Re: [PATCH] Handle empty infinite loops in OpenACC for PR84955

2018-04-12 Thread Richard Biener
On Wed, Apr 11, 2018 at 9:30 PM, Cesar Philippidis
 wrote:
> On 04/09/2018 04:31 AM, Richard Biener wrote:
>> On Fri, 6 Apr 2018, Jakub Jelinek wrote:
>>
>>> On Fri, Apr 06, 2018 at 06:48:52AM -0700, Cesar Philippidis wrote:
>>>> 2018-04-06  Cesar Philippidis
>>>>
>>>> PR middle-end/84955
>>>>
>>>> gcc/
>>>> * cfgloop.c (flow_loops_find): Add assert.
>>>> * omp-expand.c (expand_oacc_for): Add dummy false branch for
>>>> tiled basic blocks without omp continue statements.
>>>> * tree-cfg.c (execute_fixup_cfg): Handle calls to internal
>>>> functions like regular functions.
>>>>
>>>> libgomp/
>>>> * testsuite/libgomp.oacc-c-c++-common/pr84955.c: New test.
>>>> * testsuite/libgomp.oacc-fortran/pr84955.f90: New test.
>>>
>>> I'd like to defer the cfgloop.c and tree-cfg.c changes to Richard, just 
>>> want to
>>> mention that:
>>>
 --- a/gcc/tree-cfg.c
 +++ b/gcc/tree-cfg.c
 @@ -9586,10 +9586,7 @@ execute_fixup_cfg (void)
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
 {
   gimple *stmt = gsi_stmt (gsi);
 - tree decl = is_gimple_call (stmt)
 - ? gimple_call_fndecl (stmt)
 - : NULL;
 - if (decl)
 + if (is_gimple_call (stmt))
>>>
>>> This change doesn't affect just internal functions, but also all indirect
>>> calls through function pointers with const, pure or noreturn attributes.
>>
>> I think the change is desirable nevertheless.  The question is if we
>> want to do it at this point in time.
>>
>> The description of the problem sounds more like LTO writing out
>> loops without previously fixing up state.  So sth like the following
>> which I'd prefer at this stage (the above hunk is ok for stage1 then).
>
> OK, I'll save that hunk for stage 1.
>
>> Index: gcc/lto-streamer-out.c
>> ===
>> --- gcc/lto-streamer-out.c  (revision 259227)
>> +++ gcc/lto-streamer-out.c  (working copy)
>> @@ -2084,6 +2151,9 @@ output_function (struct cgraph_node *nod
>>/* Set current_function_decl and cfun.  */
>>push_cfun (fn);
>>
>> +  /* Fixup loops if required to match discovery done in the reader.  */
>> +  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
>> +
>>/* Make string 0 be a NULL string.  */
>>streamer_write_char_stream (ob->string_stream, 0);
>>
>> @@ -2176,12 +2246,13 @@ output_function (struct cgraph_node *nod
>>streamer_write_record_start (ob, LTO_null);
>>
>>output_cfg (ob, fn);
>> -
>> -  pop_cfun ();
>> }
>>else
>>  streamer_write_uhwi (ob, 0);
>>
>> +  loop_optimizer_finalize ();
>> +  pop_cfun ();
>> +
>>/* Create a section to hold the pickled output of this function.   */
>>produce_asm (ob, function);
>
> That worked. Is this patch OK for trunk, GCC 6 and GCC 7?

Ok if you remove the cfgloop.c hunk.  There's no point in an assert
of sth being non-NULL when the immediately following stmt will
dereference it.  You get an ICE anyway.

Thanks,
Richard.

> Thanks,
> Cesar
>


Re: [PATCH] Invoke maybe_warn_nonstring_arg for strcpy/stpcpy builtins.

2018-04-12 Thread Andreas Krebbel
On 04/11/2018 11:20 PM, Martin Sebor wrote:
> On 04/11/2018 06:47 AM, Andreas Krebbel wrote:
>> On 04/11/2018 10:02 AM, Jakub Jelinek wrote:
>>> On Wed, Apr 11, 2018 at 09:48:05AM +0200, Andreas Krebbel wrote:
>>>> c-c++-common/attr-nonstring-3.c fails on IBM Z. The reason appears to be
>>>> that we provide builtin implementations for strcpy and stpcpy.  The
>>>> warnings currently will only be emitted when expanding these as normal
>>>> calls.
>>>>
>>>> Bootstrapped and regression tested on x86_64 and s390x.
>>>>
>>>> Ok?
>>>>
>>>> gcc/ChangeLog:
>>>>
>>>> 2018-04-11  Andreas Krebbel
>>>>
>>>> * builtins.c (expand_builtin_strcpy): Invoke
>>>> maybe_warn_nonstring_arg.
>>>> (expand_builtin_stpcpy): Likewise.
>>>
>>> Don't you then warn twice if builtin implementations for strcpy and stpcpy
>>> aren't available or can't be used, once here and once in calls.c?
>>
>> Looks like this could happen if the expander is present but rejects 
>> expansion. I basically copied
>> this from the strcmp builtin which looks like possibly running into the same 
>> problem:
> 
> I tried to avoid the problem in the other instances of the call
> to maybe_warn_nonstring_arg (e.g., expand_builtin_strlen or
> expand_builtin_strcmp).  I don't know if the expander can fail
> after the maybe_warn_nonstring_arg() call and so I have no
> tests for it.
> 
> In your patch the expander failing seems more likely than in
> the others (in fact, on x86_64 it always fails because the call
> to targetm.have_movstr () in expand_movstr() returns false).
> 
> That said, I see two warnings for a call to strcmp() with
> a nonstring argument even without the expander failing, so
> what I did isn't quite right either.  I opened bug 85359 for
> it.

I've opened BZ85369 for the strcpy / stpcpy issue.

-Andreas-



[nvptx, PR85296] Fix handling of extern var with flexible array member

2018-04-12 Thread Tom de Vries

Hi,

for the recently added test-case pr85244-1.c, we run into the following 
failure with the standalone nvptx toolchain:

...
spawn nvptx-none-run ./pr85244-1.exe
error   : Size doesn't match for 'val' in 'input file 2 at offset 3047', 
first specified in 'input file 1 at offset 1805'

nvptx-run: cuLinkAddData failed: unknown error (CUDA_ERROR_UNKNOWN, 999)
...


The linking problem happens because while in pr85244-2.s we have an 
array of size 3:

...
  .visible .const .align 8 .u64 val[3] = { 0, 180388626432, 1337 };
...

in pr85244-1.s we have an array of size 2:
...
.extern .const .align 8 .u64 val[2]; 


...


The ptx declarations correspond to this source bit in pr85244-1.c:
...
struct s {
 long a;
 int b;
 int tab[];
};

extern const struct s val;
...

and this one in pr85244-2.c (omitting type decl):
...
const struct s val = { 0, 0, { 42, 1337 } };
...

Because ptx has no structs, structs are declared as arrays of a certain 
base type, in this case u64.


In pr85244-2.c we calculate the size of the array, and based on the 
initializer we arrive at a size of 3.


In pr85244-1.c we calculate the size of the array, and based on the type 
we arrive at a size of 2.



The patch fixes this by declaring extern structs which have a flexible 
array member as an array without given dimension.


Built and tested on nvptx.

Committed to stage4 trunk.

Thanks,
- Tom
[nvptx] Fix handling of extern var with flexible array member

2018-04-12  Tom de Vries  

	PR target/85296
	* config/nvptx/nvptx.c (flexible_array_member_type_p): New function.
	(nvptx_assemble_decl_begin): Add undefined param.  Declare undefined
	array with flexible array member as array without given dimension.
	(nvptx_assemble_undefined_decl): Set nvptx_assemble_decl_begin call
	argument for undefined param to true.

---
 gcc/config/nvptx/nvptx.c | 35 +--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index a9a3053..131b495 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -2021,6 +2021,30 @@ nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
 nvptx_assemble_value (str[i], 1);
 }
 
+/* Return true if TYPE is a record type where the last field is an array without
+   given dimension.  */
+
+static bool
+flexible_array_member_type_p (const_tree type)
+{
+  if (TREE_CODE (type) != RECORD_TYPE)
+return false;
+
+  const_tree last_field = NULL_TREE;
+  for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
+last_field = f;
+
+  if (!last_field)
+return false;
+
+  const_tree last_field_type = TREE_TYPE (last_field);
+  if (TREE_CODE (last_field_type) != ARRAY_TYPE)
+return false;
+
+  return (! TYPE_DOMAIN (last_field_type)
+	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
+}
+
 /* Emit a PTX variable decl and prepare for emission of its
initializer.  NAME is the symbol name and SETION the PTX data
area. The type is TYPE, object size SIZE and alignment is ALIGN.
@@ -2031,11 +2055,18 @@ nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
 
 static void
 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
-			   const_tree type, HOST_WIDE_INT size, unsigned align)
+			   const_tree type, HOST_WIDE_INT size, unsigned align,
+			   bool undefined = false)
 {
   bool atype = (TREE_CODE (type) == ARRAY_TYPE)
 && (TYPE_DOMAIN (type) == NULL_TREE);
 
+  if (undefined && flexible_array_member_type_p (type))
+{
+  size = 0;
+  atype = true;
+}
+
   while (TREE_CODE (type) == ARRAY_TYPE)
 type = TREE_TYPE (type);
 
@@ -2172,7 +2203,7 @@ nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
   tree size = DECL_SIZE_UNIT (decl);
   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
 			 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
-			 DECL_ALIGN (decl));
+			 DECL_ALIGN (decl), true);
   nvptx_assemble_decl_end ();
 }
 


Re: [PATCH] Use --push-state --as-needed and --pop-state instead of --as-needed and --no-as-needed for libgcc

2018-04-12 Thread Matthias Klose
On 11.04.2018 20:55, Jakub Jelinek wrote:
> On Wed, Apr 11, 2018 at 06:07:17PM +0200, Matthias Klose wrote:
>> On 11.04.2018 12:31, Jakub Jelinek wrote:
>>> Hi!
>>>
>>> As discussed, using --as-needed and --no-as-needed is dangerous, because
>>> it results in --no-as-needed even for libraries after -lgcc_s, even when the
>>> default is --as-needed or --as-needed has been specified earlier on the
>>> command line.
>>>
>>> If the linker supports --push-state/--pop-state, we should IMHO use it.
>>>
>>> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for stage1?
>>>
>>> Or is this something we want in GCC8 too?
>>
>> this is problematic for binutils versions with --push-state/--pop-state 
>> support
>> in the BFD linker but not in gold, and then using -fuse-ld=gold.  So maybe 
>> the
>> version check for the BFD linker should only succeed for the first binutils
>> version which also has --push-state/--pop-state support in gold.
> 
> Does anybody use -fuse-ld=gold?

grep the build log of your favorite distro, unless the logs are beautified and
do not show any command line options.

For Debian/Ubuntu it's Haskell using gold by default, and some upstreams like
systemd turn it on by default, assuming it has the same quality on any
architecture.


Re: [patch, fortran] Remove parallell annotation from DO CONCURRENT

2018-04-12 Thread Jakub Jelinek
On Wed, Apr 11, 2018 at 09:47:22PM +0200, Thomas Koenig wrote:
> Am 11.04.2018 um 20:33 schrieb Jakub Jelinek:
> 
> > > I have attached updated patch which moves the test case to
> > > gfortran.dg/gomp (where it actually passes).
> > 
> > How could it pass there?  dg-do run tests don't belong into g*.dg/gomp/,
> > nothing adds the -B etc. options needed to find libgomp.spec or libgomp
> > as a library, or adds it to LD_LIBRARY_PATH etc.
> > There are zero dg-do run tests in gfortran.dg/gomp/, there are 4
> > dg-do run tests in c-c++-common/gomp/, but those work fine because they
> > use -fopenmp-simd option rather than
> > -fopenmp/-fopenacc/-ftree-parallelize-loops= etc.
> 
> So, where should the test go?
> 
> The suggestion in PR 85346, to put it into
> libgomp/testsuite/libgomp.fortran/, does not work:

Yes, and I said what can be done to make it work; in patch form below.

> Running ../../../../trunk/libgomp/testsuite/libgomp.fortran/fortran.exp ...
> FAIL: libgomp.fortran/do_concurrent_5.f90   -O  execution test
> 
> even when ne (the array size) has been reduced to 2**20, far below

ne is not the array size, the array size is 8 * ne, and 8 * 1MB is 8MB and
you eat all of the usual stack limit just by that.

> reasonable memory limits.  The test passes when given the
> -O1 -ftree-parallelize-loops=2 options by hand.
> 
> So, what's the idea? Is there actually a directory which works,
> or are we left with a wrong-code bug for which no test case is
> possible? That would be quite bad, I think.

Here is an incremental diff.  With the dg-skip-if and removal of explicit -O3,
you make the test run only once with -O3 -g, and skip the other variants:
UNSUPPORTED: libgomp.fortran/do_concurrent_5.f90   -O0 
UNSUPPORTED: libgomp.fortran/do_concurrent_5.f90   -O1 
UNSUPPORTED: libgomp.fortran/do_concurrent_5.f90   -O2 
UNSUPPORTED: libgomp.fortran/do_concurrent_5.f90   -O3 -fomit-frame-pointer 
-funroll-loops -fpeel-loops -ftracer -finline-functions 
UNSUPPORTED: libgomp.fortran/do_concurrent_5.f90   -Os 
and with -fno-openmp you disable the default -fopenmp which you really don't
need for the testcase, there are no OpenMP directives in there.

--- libgomp/testsuite/libgomp.fortran/do_concurrent_5.f90   2018-04-11 
17:27:59.035100057 +0200
+++ libgomp/testsuite/libgomp.fortran/do_concurrent_5.f90   2018-04-12 
09:12:40.611789503 +0200
@@ -1,6 +1,7 @@
 ! { dg-do run }
 ! PR 83064 - this used to give wrong results.
-! { dg-additional-options "-O3 -ftree-parallelize-loops=2" }
+! { dg-skip-if "" { ! run_expensive_tests } { "*" } { "-O3 -g" } }
+! { dg-additional-options "-fno-openmp -ftree-parallelize-loops=2" }
 ! Original test case by Christian Felter
 
 program main
@@ -8,7 +9,7 @@ program main
 implicit none
 
 integer, parameter :: nsplit = 4
-integer(int64), parameter :: ne = 2000
+integer(int64), parameter :: ne = 200
 integer(int64) :: stride, low(nsplit), high(nsplit), edof(ne), i
 real(real64), dimension(nsplit) :: pi
 


Jakub