[PATCH] MIPS: Default to --with-llsc for the R5900 Linux target as well

2018-10-05 Thread Fredrik Noring
The Linux kernel requires and emulates LL and SC for the R5900 too. The
special --without-llsc default for the R5900 is therefore not applicable
in that case.
---
 gcc/config.gcc | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 0c579d1f5ea..1fea2c0beaa 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -3709,14 +3709,14 @@ fi
 # Infer a default setting for --with-llsc.
 if test x$with_llsc = x; then
   case ${target} in
-mips64r5900-*-* | mips64r5900el-*-* | mipsr5900-*-* | mipsr5900el-*-*)
-  # The R5900 doesn't support LL(D) and SC(D).
-  with_llsc=no
-  ;;
 mips*-*-linux*)
   # The kernel emulates LL and SC where necessary.
   with_llsc=yes
   ;;
+mips64r5900-*-* | mips64r5900el-*-* | mipsr5900-*-* | mipsr5900el-*-*)
+  # The R5900 doesn't support LL(D) and SC(D).
+  with_llsc=no
+  ;;
   esac
 fi
 
-- 
2.16.4



Re: [nvptx] vector length patch series

2018-10-05 Thread Tom de Vries
On 9/18/18 10:04 PM, Cesar Philippidis wrote:
> 591973d3c3a [nvptx] use user-defined vectors when possible

If I drop this patch, I get the same test results. Can you find a
testcase for which this patch has an effect?

Thanks,
- Tom


Re: [PATCH 2/2 v3][IRA,LRA] Fix PR86939, IRA incorrectly creates an interference between a pseudo register and a hard register

2018-10-05 Thread Peter Bergner
On 10/5/18 4:12 PM, Vladimir Makarov wrote:
> On 10/05/2018 04:00 PM, Peter Bergner wrote:
>> How about non_conflicting_reg_copy or non_conflicting_copy_insn?
> OK. I like the first name more.

Ok, I committed the patch using the first function name.
Thank you very much for the patch reviews and approvals!

Peter





Re: [PATCHv3][PR 81376] Remove unnecessary float casts in comparisons

2018-10-05 Thread Yuri Gribov
On Wed, Oct 3, 2018 at 5:11 PM Jeff Law  wrote:
snip
> OK.  You've got commit privileges, right?

Yup, will commit myself.

-I


[PATCH] diagnose bogus assume_aligned attributes (PR 87533)

2018-10-05 Thread Martin Sebor

While working on tests for an enhancement in the area of
attributes I noticed that the handler for attribute assume_aligned
(among others) does only a superficial job of detecting meaningless
specifications such as using the attribute on a function returning
void or alignments that aren't powers of two, out-of-range offsets,
and so on.  None of the expected warnings in the test case triggers
(Clang diagnoses all of them).

The attached patch improves the detection of these nonsensical
constructs, and brings GCC closer to the more thorough job other
compilers do.  Tested on x86_64-linux.

Martin
PR middle-end/87533 - bogus assume_aligned attribute silently accepted

gcc/c-family/ChangeLog:

	PR middle-end/87533
	* c-attribs.c (handle_assume_aligned_attribute): Diagnose and
	reject invalid attribute specifications.

gcc/testsuite/ChangeLog:

	PR middle-end/87533
	* gcc.dg/attr-assume_aligned-4.c: New test.

Index: gcc/c-family/c-attribs.c
===
@@ -2451,23 +2454,63 @@ static tree
struct attribute_spec.handler.  */
 
 static tree
-handle_assume_aligned_attribute (tree *, tree, tree args, int,
+handle_assume_aligned_attribute (tree *node, tree name, tree args, int,
  bool *no_add_attrs)
 {
+  tree decl = *node;
+  tree rettype = TREE_TYPE (decl);
+  if (TREE_CODE (rettype) != POINTER_TYPE)
+{
+  warning (OPT_Wattributes,
+	   "%qE attribute ignored on a function returning %qT",
+	   name, rettype);
+  *no_add_attrs = true;
+  return NULL_TREE;
+}
+
+  /* The alignment specified by the first argument.  */
+  tree align = NULL_TREE;
+
   for (; args; args = TREE_CHAIN (args))
 {
-  tree position = TREE_VALUE (args);
-  if (position && TREE_CODE (position) != IDENTIFIER_NODE
-	  && TREE_CODE (position) != FUNCTION_DECL)
-	position = default_conversion (position);
+  tree val = TREE_VALUE (args);
+  if (val && TREE_CODE (val) != IDENTIFIER_NODE
+	  && TREE_CODE (val) != FUNCTION_DECL)
+	val = default_conversion (val);
 
-  if (TREE_CODE (position) != INTEGER_CST)
+  if (!tree_fits_shwi_p (val))
 	{
 	  warning (OPT_Wattributes,
-		   "assume_aligned parameter not integer constant");
+		   "%qE attribute argument %E is not an integer constant",
+		   name, val);
 	  *no_add_attrs = true;
 	  return NULL_TREE;
 	}
+
+  if (!align)
+	{
+	  /* Validate and save the alignment.  */
+	  if (!integer_pow2p (val))
+	{
+	  warning (OPT_Wattributes,
+		   "%qE attribute argument %E is not a power of 2",
+		   name, val);
+	  *no_add_attrs = true;
+	  return NULL_TREE;
+	}
+
+	  align = val;
+	}
+  else if (tree_int_cst_sgn (val) < 0 || tree_int_cst_le (align, val))
+	{
+	  /* The misalignment specified by the second argument
+	 must be non-negative and less than the alignment.  */
+	  warning (OPT_Wattributes,
+		   "%qE attribute argument %E is not in the range [0, %E)",
+		   name, val, align);
+	  *no_add_attrs = true;
+	  return NULL_TREE;
+	}
 }
   return NULL_TREE;
 }
Index: gcc/testsuite/gcc.dg/attr-assume_aligned-4.c
===
--- gcc/testsuite/gcc.dg/attr-assume_aligned-4.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/attr-assume_aligned-4.c	(working copy)
@@ -0,0 +1,36 @@
+/* PR middle-end/87533 - bogus assume_aligned attribute silently accepted
+   { dg-do compile }
+   { dg-options "-Wall" } */
+
+#define A(...)  __attribute__ ((assume_aligned (__VA_ARGS__)))
+
+A (1) void fv_1 (void);   /* { dg-warning ".assume_aligned. attribute ignored on a function returning .void." } */
+
+A (1) int fi_1 (void);/* { dg-warning ".assume_aligned. attribute ignored on a function returning .int." } */
+
+A (-1) void* fpv_m1 (void);   /* { dg-warning ".assume_aligned. attribute argument -1 is not a power of 2" } */
+
+A (0) void* fpv_0 (void); /* { dg-warning ".assume_aligned. attribute argument 0 is not a power of 2" } */
+
+/* Alignment of 1 is fine, it just doesn't offer any benefits.  */
+A (1) void* fpv_1 (void);
+
+A (3) void* fpv_3 (void); /* { dg-warning ".assume_aligned. attribute argument 3 is not a power of 2" } */
+
+A (16383) void* fpv_16km1 (void); /* { dg-warning ".assume_aligned. attribute argument 16383 is not a power of 2" } */
+A (16384) void* fpv_16k (void);
+A (16385) void* fpv_16kp1 (void);/* { dg-warning ".assume_aligned. attribute argument 16385 is not a power of 2" } */
+
+A (32767) void* fpv_32km1 (void); /* { dg-warning ".assume_aligned. attribute argument 32767 is not a power of 2" } */
+
+A (4, -1) void* fpv_4_m1 (void);  /* { dg-warning ".assume_aligned. attribute argument -1 is not in the range \\\[0, 4\\\)" } */
+
+A (4, 0) void* fpv_4_0 (void);
+A (4, 1) void* fpv_4_1 (void);
+A (4, 2) void* fpv_4_2 (void);
+A (4, 3) void* fpv_4_3 (void);
+
+A (4, 4) void* fpv_4_3 (void);/* { dg-warning ".assume_aligned. attribute argument 4 is not in the 

Re: [PATCH, testsuite] memchr-1.c wide char and AIX

2018-10-05 Thread Martin Sebor

David,

Attached is a patch to conditionalize the memchr-1.c test
to pass even with 2-byte wchar_t's.  It also adds a compile-only
test to verify memchr with -fshort-wchar.  I verified
the changes on LE x86_64-linux and BE powerpc64-linux but
if you could confirm they also work on AIX that would be
great.

Unless some concerns come up I will plan to commit these
changes sometime next week.

Thanks
Martin

On 10/05/2018 02:10 PM, Martin Sebor wrote:

On 10/05/2018 12:54 PM, David Edelsohn wrote:

memchr-1.c tests for char (test_narrow) and wchar (test_wide).  The
wide character test assumes 32 bit wide character, while 32 bit AIX
uses 16 bit wide character.  This assumption causes the wide character
part of the test to fail in 32 bit mode on AIX (it succeeds on 64 bit
AIX).

The testcase already includes ifdefs for endianness.  The "narrow"
part of the test succeeds and is a useful test on AIX.  My proposed
solution adds an AIX-specific ifdef in the testcase to avoid the
compile-time errors in 32 bit mode.

Because of the structure of the testcase, I need to #ifdef test_wide()
and its constants, and separately its invocation in main(), as
opposed to making test_wide() a no-op that is called.

Another alternative is to split memchr-1.c into memchr-1.c for
test_narrow and memchr-2.c for test_wide, with the latter skipped on
AIX using a DejaGNU directive.

Is the #ifdef okay or would others prefer that I split the testcase?
No solution is particularly elegant.


That's my bad for hardwiring 4 as the wchar_t size.  Sorry about
the breakage.  I can't think of any better solutions than what
you covered above.  It would be nice to exercise this optimization
with 16-bit wchar_t.  It looks like GCC has a -fshort-wchar option
to force wchar_t to be 2 bytes wide that I didn't know about.
That will make the problem easier to solve without necessarily
having to build all of GCC on AIX.  Let me take care of it.

Martin



Thanks, David

* gcc.c-torture/execute/memchr-1.c (test_wide): Skip on 32 bit AIX.

Index: memchr-1.c
===
--- memchr-1.c  (revision 264869)
+++ memchr-1.c  (working copy)
@@ -106,6 +106,7 @@
   A (memchr (&s5_3[1][i0], 0, sizeof s5_3[1] - i0) == &s5_3[1][4]);
 }

+#if !defined(_AIX) || defined(__64BIT__)
 static const wchar_t wc = L'1';
 static const wchar_t ws1[] = L"1";
 static const wchar_t ws4[] =
L"\x00123456\x12005678\x12340078\x12345600";
@@ -144,10 +145,13 @@
   A (memchr (&ws4[3], 0, nb - 3 * nwb) == pws4 + 3 * nwb + 3);
 #endif
 }
+#endif


 int main ()
 {
   test_narrow ();
+#if !defined(_AIX) || defined(__64BIT__)
   test_wide ();
+#endif
 }






gcc/testsuite/ChangeLog:

	* gcc.c-torture/execute/memchr-1.c: Avoid assuming 4-byte wchar_t.
	Add a test for 2-byte wchar_t.
	* gcc.dg/builtin-memchr.c: New test.

Index: gcc/testsuite/gcc.c-torture/execute/memchr-1.c
===
--- gcc/testsuite/gcc.c-torture/execute/memchr-1.c	(revision 264875)
+++ gcc/testsuite/gcc.c-torture/execute/memchr-1.c	(working copy)
@@ -106,6 +106,8 @@ void test_narrow (void)
   A (memchr (&s5_3[1][i0], 0, sizeof s5_3[1] - i0) == &s5_3[1][4]);
 }
 
+#if 4 == __WCHAR_WIDTH__
+
 static const wchar_t wc = L'1';
 static const wchar_t ws1[] = L"1";
 static const wchar_t ws4[] = L"\x00123456\x12005678\x12340078\x12345600";
@@ -145,7 +147,59 @@ void test_wide (void)
 #endif
 }
 
+#elif 2 == __WCHAR_WIDTH__
 
+static const wchar_t wc = L'1';
+static const wchar_t ws1[] = L"1";
+static const wchar_t ws2[2] = L"\x1234\x5678";   /* no terminating nul */
+static const wchar_t ws4[] = L"\x0012\x1200\x1234";
+
+void test_wide (void)
+{
+  int i0 = 0;
+  int i1 = i0 + 1;
+  int i2 = i1 + 1;
+
+  A (sizeof (wchar_t) == 2);
+
+  A (memchr (L"" + 1, 0, 0) == 0);
+  A (memchr (&wc + 1, 0, 0) == 0);
+  A (memchr (L"\x1234", 0, sizeof (wchar_t)) == 0);
+
+  A (memchr (L"" + i1, i0, i0) == 0);
+  A (memchr (&wc + i1, i0, i0) == 0);
+  A (memchr (L"\x1234", i0, sizeof (wchar_t)) == 0);
+
+  A (memchr (ws2, 0, sizeof ws2) == 0);
+  A (memchr (ws2, i0, sizeof ws2) == 0);
+
+  const size_t nb = sizeof ws4;
+  const size_t nwb = sizeof (wchar_t);
+
+  const char *pws1 = (const char*)ws1;
+  const char *pws4 = (const char*)ws4;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  A (memchr (ws1, i0, sizeof ws1) == pws1 + 1);
+
+  A (memchr (&ws4[0], i0, nb) == pws4 + i1);
+  A (memchr (&ws4[1], i0, nb - i1 * nwb) == pws4 + i1 * nwb);
+  A (memchr (&ws4[2], i0, nb - i2 * nwb) == pws4 + i2 * nwb + i2);
+#else
+  A (memchr (ws1, i0, sizeof ws1) == pws1 + 0);
+
+  A (memchr (&ws4[0], i0, nb) == pws4 + 0);
+  A (memchr (&ws4[1], i0, nb - i1 * nwb) == pws4 + i1 * nwb + i1);
+  A (memchr (&ws4[2], i0, nb - i2 * nwb) == pws4 + i2 * nwb + i2);
+#endif
+}
+
+#else
+
+void test_wide (void) { }
+
+#endif
+
 int main ()
 {
   test_narrow ();
Index: gcc/testsuite/gcc.dg/builtin-memchr.c

Re: [PATCH 2/2 v3][IRA,LRA] Fix PR86939, IRA incorrectly creates an interference between a pseudo register and a hard register

2018-10-05 Thread Vladimir Makarov

On 10/05/2018 04:00 PM, Peter Bergner wrote:

On 10/5/18 1:32 PM, Vladimir Makarov wrote:

On 10/05/2018 12:40 PM, Peter Bergner wrote:

On 10/4/18 3:01 PM, Vladimir Makarov wrote:

IMHO, the name copy_insn_p is too common and confusing (we already have
functions copy_insn and copy_insn_1 in GCC).  The name does not reflect its
result meaning.  I would call it something like non_conflict_copy_source_reg
although it is long.

How about is_reg_copy_insn_p() or is_reg_to_reg_copy_p() or ???


Personally I like the first name more.  But it is up to you.  I don't want
to bother you anymore.

It's not a bother, so let's get something we both are ok with.
How about non_conflicting_reg_copy or non_conflicting_copy_insn?

OK. I like the first name more.



Re: [PATCH] RISC-V: Fix unordered float compare for Signaling NaN.

2018-10-05 Thread Jim Wilson
On Fri, Oct 5, 2018 at 9:44 AM Jim Wilson  wrote:
> With a modified version of Andrew's patch, the testcase in Kito's
> patch works, and I see the glibc testsuite failures drop from 151 to
> 131, so there is still something wrong.  Last time I ran the glibc
> testsuite we were around 50-60 failures without Andrew's patch, so
> either something broke, or there are more glibc tests than before.

This turned out to be a problem with ulps for the math tests.  After
regenerating the ulps file, I see 32 failures, which looks like the
expected set of failures.  I checked in my modified version of Andrew
Waterman's patch.

Jim


[PATCH] RISC-V: Fix -fsignaling-nans for glibc testsuite.

2018-10-05 Thread Jim Wilson
This makes -fsignaling-nans work correctly, fixing 20 glibc testsuite failures.
The FP quiet compare pattern is ignoring all exceptions, including for SNaNs.
To make this work right with -fsignaling-nans, we need an extra eq compare to
raise an exception, but only when HONOR_SNANS is true.  So we get the more
efficient code in the default case, and code that works for the glibc
testsuite when -fsignaling-nans is used.

This was tested with cross riscv32-elf and riscv64-linux builds and checks.
It was also tested with a riscv64-linux glibc build and check.

gcc/
* config/riscv/riscv.md (f_quiet4):
Add define_expand.  Add ! HONOR_SNANS check to current pattern.  Add
new pattern using HONOR_SNANS that emits one extra instruction.
---
 gcc/config/riscv/riscv.md | 34 --
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 4162dc578e8..b6c20230ffd 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -1957,19 +1957,41 @@
   [(set_attr "type" "fcmp")
(set_attr "mode" "")])
 
-(define_insn "f_quiet4"
-   [(set (match_operand:X 0 "register_operand" "=r")
+(define_expand "f_quiet4"
+   [(parallel [(set (match_operand:X  0 "register_operand")
+   (unspec:X
+[(match_operand:ANYF 1 "register_operand")
+ (match_operand:ANYF 2 "register_operand")]
+QUIET_COMPARISON))
+  (clobber (match_scratch:X 3))])]
+  "TARGET_HARD_FLOAT")
+
+(define_insn "*f_quiet4_default"
+   [(set (match_operand:X  0 "register_operand" "=r")
 (unspec:X
-[(match_operand:ANYF 1 "register_operand" " f")
- (match_operand:ANYF 2 "register_operand" " f")]
-QUIET_COMPARISON))
+ [(match_operand:ANYF 1 "register_operand" " f")
+  (match_operand:ANYF 2 "register_operand" " f")]
+ QUIET_COMPARISON))
 (clobber (match_scratch:X 3 "=&r"))]
-  "TARGET_HARD_FLOAT"
+  "TARGET_HARD_FLOAT && ! HONOR_SNANS (mode)"
   "frflags\t%3\n\tf.\t%0,%1,%2\n\tfsflags %3"
   [(set_attr "type" "fcmp")
(set_attr "mode" "")
(set (attr "length") (const_int 12))])
 
+(define_insn "*f_quiet4_snan"
+   [(set (match_operand:X  0 "register_operand" "=r")
+(unspec:X
+ [(match_operand:ANYF 1 "register_operand" " f")
+  (match_operand:ANYF 2 "register_operand" " f")]
+ QUIET_COMPARISON))
+(clobber (match_scratch:X 3 "=&r"))]
+  "TARGET_HARD_FLOAT && HONOR_SNANS (mode)"
+  "frflags\t%3\n\tf.\t%0,%1,%2\n\tfsflags 
%3\n\tfeq.\tzero,%1,%2"
+  [(set_attr "type" "fcmp")
+   (set_attr "mode" "")
+   (set (attr "length") (const_int 16))])
+
 (define_insn "*seq_zero_"
   [(set (match_operand:GPR   0 "register_operand" "=r")
(eq:GPR (match_operand:X 1 "register_operand" " r")
-- 
2.17.1



Re: Merge from trunk to gccgo branch

2018-10-05 Thread Ian Lance Taylor
On Fri, Oct 5, 2018 at 1:13 PM, Ian Lance Taylor  wrote:
> I merged trunk revision 264890 to the gccgo branch.

Sorry, trunk revision was 264887.

Ian


Re: Merge from trunk to gccgo branch

2018-10-05 Thread Ian Lance Taylor
I merged trunk revision 264890 to the gccgo branch.

Ian


Re: [PATCH, testsuite] memchr-1.c wide char and AIX

2018-10-05 Thread Martin Sebor

On 10/05/2018 12:54 PM, David Edelsohn wrote:

memchr-1.c tests for char (test_narrow) and wchar (test_wide).  The
wide character test assumes 32 bit wide character, while 32 bit AIX
uses 16 bit wide character.  This assumption causes the wide character
part of the test to fail in 32 bit mode on AIX (it succeeds on 64 bit
AIX).

The testcase already includes ifdefs for endianness.  The "narrow"
part of the test succeeds and is a useful test on AIX.  My proposed
solution adds an AIX-specific ifdef in the testcase to avoid the
compile-time errors in 32 bit mode.

Because of the structure of the testcase, I need to #ifdef test_wide()
and its constants, and separately its invocation in main(), as
opposed to making test_wide() a no-op that is called.

Another alternative is to split memchr-1.c into memchr-1.c for
test_narrow and memchr-2.c for test_wide, with the latter skipped on
AIX using a DejaGNU directive.

Is the #ifdef okay or would others prefer that I split the testcase?
No solution is particularly elegant.


That's my bad for hardwiring 4 as the wchar_t size.  Sorry about
the breakage.  I can't think of any better solutions than what
you covered above.  It would be nice to exercise this optimization
with 16-bit wchar_t.  It looks like GCC has a -fshort-wchar option
to force wchar_t to be 2 bytes wide that I didn't know about.
That will make the problem easier to solve without necessarily
having to build all of GCC on AIX.  Let me take care of it.

Martin



Thanks, David

* gcc.c-torture/execute/memchr-1.c (test_wide): Skip on 32 bit AIX.

Index: memchr-1.c
===
--- memchr-1.c  (revision 264869)
+++ memchr-1.c  (working copy)
@@ -106,6 +106,7 @@
   A (memchr (&s5_3[1][i0], 0, sizeof s5_3[1] - i0) == &s5_3[1][4]);
 }

+#if !defined(_AIX) || defined(__64BIT__)
 static const wchar_t wc = L'1';
 static const wchar_t ws1[] = L"1";
 static const wchar_t ws4[] = L"\x00123456\x12005678\x12340078\x12345600";
@@ -144,10 +145,13 @@
   A (memchr (&ws4[3], 0, nb - 3 * nwb) == pws4 + 3 * nwb + 3);
 #endif
 }
+#endif


 int main ()
 {
   test_narrow ();
+#if !defined(_AIX) || defined(__64BIT__)
   test_wide ();
+#endif
 }





Re: Default compute dimensions (runtime)

2018-10-05 Thread Julian Brown
Hi,

Continuing the thread from here:

https://gcc.gnu.org/ml/gcc-patches/2016-02/msg00198.html

On Wed, 3 Feb 2016 19:52:09 +0300
Alexander Monakov  wrote:

> On Wed, 3 Feb 2016, Nathan Sidwell wrote:
> > You can only override at runtime those dimensions that you said
> > you'd override at runtime when you compiled your program.  
> 
> Ah, I see.  That's not obvious to me, so perhaps added documentation
> can be expanded to explain that?  (I now see that the plugin silently
> drops user-provided dimensions where a value recorded at compile time
> is present; not sure if that'd be worth a runtime diagnostic, could
> be very noisy) 

This version of the patch has slightly-expanded documentation.

> > > I don't see why you say that because cuDeviceGetAttribute provides
> > > CU_DEVICE_ATTRIBUTE_WARP_SIZE,
> > > CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
> > > CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X (which is not too useful for
> > > this case) and cuFuncGetAttribute that allows to get a
> > > per-function thread limit. There's a patch on gomp-nvptx branch
> > > that adds querying some of those to the plugin.  
> > 
> > thanks.  There doesn't appear to be one for number of physical CTAs
> > though, right?  
> 
> Sorry, I don't understand the question: CTA is a logical entity.  One
> could derive limit of possible concurrent CTAs from number of SMs
> (CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) multiplied by how many
> CTAs fit on one multiprocessor.  The latter figure can be taken as a
> rough worst-case value, or semi-intelligent per-kernel estimate based
> on register limits (there's code on gomp-nvptx branch that does
> this), or one can use the cuOcc* API to ask the driver for a precise
> per-kernel figure.

The runtime part of the patch already appears to have been
committed as part of the following patch:

https://gcc.gnu.org/ml/gcc-patches/2016-02/msg01589.html

The compile-time part of the patch, however, has not made it upstream yet. Thus,
this rebased and retested patch consists of the parsing changes (for
-fopenacc-dim=X:Y:Z, allowing '-') and warning changes (for strange
partitioning choices), plus associated testsuite adjustments.

Tested with offloading to NVPTX and bootstrapped.

OK for trunk?

Thanks,

Julian

20xx-xx-xx  Nathan Sidwell  
Tom de Vries  
Thomas Schwinge  
Julian Brown  

gcc/
* doc/invoke.texi (fopenacc-dim): Update.
* omp-offload.c (oacc_parse_default_dims): Update.
(oacc_validate_dims): Emit warnings about strange partitioning choices.

gcc/testsuite/
* c-c++-common/goacc/acc-icf.c: Update.
* c-c++-common/goacc/parallel-dims-1.c: Likewise.
* c-c++-common/goacc/parallel-reduction.c: Likewise.
* c-c++-common/goacc/pr70688.c: Likewise.
* c-c++-common/goacc/routine-1.c: Likewise.
* c-c++-common/goacc/uninit-dim-clause.c: Likewise.
* gfortran.dg/goacc/parallel-tree.f95: Likewise.
* gfortran.dg/goacc/routine-4.f90: Likewise.
* gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise.
* gfortran.dg/goacc/uninit-dim-clause.f95: Likewise.

libgomp/
* testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: Add -w.
* testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-warn-1.c: New.
* testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c: Update.
* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/mode-transitions.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/private-variables.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-7.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-g-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise.
* testsuite/libgomp.oacc-fortran/par-reduction-2-1.f: Likewise.
* testsuite/libgomp.oacc-fortran/par-reduction-2-2.f: Likewise.
* testsuite/libgomp.oacc-fortran/pr84028.f90: Likewise.
* testsuite/libgomp.oacc-fortran/private-variables.f90: Likewise.
* testsuite/libgomp.oacc-fortran/routine-7.f90: Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c: New.
commit a918a8739ae7652250c978b0ececa181a587b0c0
Author: Julian Brown 
Date:   Fri Oct 5 11:11:47 2018 -0700

OpenACC default compute dimensions

20xx-xx-xx  Nathan Sidwell  
	Tom de Vries  
	Thomas Schwinge  
	Julian Brown  

	gcc/
	* doc/invoke.texi (fopenacc-dim): Update.
   

Re: [PATCH 2/2 v3][IRA,LRA] Fix PR86939, IRA incorrectly creates an interference between a pseudo register and a hard register

2018-10-05 Thread Peter Bergner
On 10/5/18 1:32 PM, Vladimir Makarov wrote:
> On 10/05/2018 12:40 PM, Peter Bergner wrote:
>> On 10/4/18 3:01 PM, Vladimir Makarov wrote:
>>> IMHO, the name copy_insn_p is too common and confusing (we already have
>>> functions copy_insn and copy_insn_1 in GCC).  The name does not reflect its
>>> result meaning.  I would call it something like non_conflict_copy_source_reg
>>> although it is long.
>> How about is_reg_copy_insn_p() or is_reg_to_reg_copy_p() or ???
>>
> Personally I like the first name more.  But it is up to you.  I don't want
> to bother you anymore.

It's not a bother, so let's get something we both are ok with.
How about non_conflicting_reg_copy or non_conflicting_copy_insn?

Peter



[PATCH] rs6000: Some mfcr pattern simplification

2018-10-05 Thread Segher Boessenkool
2018-10-05  Segher Boessenkool  

* config/rs6000/rs6000.md (unnamed mfcr scc_comparison_operator
patterns): Merge SI and DI patterns to a GPR pattern.
(unnamed define_insn and define_split for record form of that): Merge
to a single define_insn_and_split pattern.

---
 gcc/config/rs6000/rs6000.md | 43 ++-
 1 file changed, 10 insertions(+), 33 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 5db3e57..0e7cf35 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -11765,10 +11765,10 @@ (define_insn_and_split "*cmp_internal2"
 ;; cases the insns below which don't use an intermediate CR field will
 ;; be used instead.
 (define_insn ""
-  [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
-   (match_operator:SI 1 "scc_comparison_operator"
-  [(match_operand 2 "cc_reg_operand" "y")
-   (const_int 0)]))]
+  [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
+   (match_operator:GPR 1 "scc_comparison_operator"
+   [(match_operand 2 "cc_reg_operand" "y")
+(const_int 0)]))]
   ""
   "mfcr %0%Q2\;rlwinm %0,%0,%J1,1"
   [(set (attr "type")
@@ -11778,21 +11778,7 @@ (define_insn ""
(const_string "mfcr")))
(set_attr "length" "8")])
 
-(define_insn ""
-  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-   (match_operator:DI 1 "scc_comparison_operator"
-  [(match_operand 2 "cc_reg_operand" "y")
-   (const_int 0)]))]
-  "TARGET_POWERPC64"
-  "mfcr %0%Q2\;rlwinm %0,%0,%J1,1"
-  [(set (attr "type")
- (cond [(match_test "TARGET_MFCRF")
-   (const_string "mfcrf")
-  ]
-   (const_string "mfcr")))
-   (set_attr "length" "8")])
-
-(define_insn ""
+(define_insn_and_split ""
   [(set (match_operand:CC 0 "cc_reg_operand" "=x,?y")
(compare:CC (match_operator:SI 1 "scc_comparison_operator"
   [(match_operand 2 "cc_reg_operand" "y,y")
@@ -11804,25 +11790,16 @@ (define_insn ""
   "@
mfcr %3%Q2\;rlwinm. %3,%3,%J1,1
#"
-  [(set_attr "type" "shift")
-   (set_attr "dot" "yes")
-   (set_attr "length" "8,16")])
-
-(define_split
-  [(set (match_operand:CC 0 "cc_reg_not_cr0_operand")
-   (compare:CC (match_operator:SI 1 "scc_comparison_operator"
-  [(match_operand 2 "cc_reg_operand")
-   (const_int 0)])
-   (const_int 0)))
-   (set (match_operand:SI 3 "gpc_reg_operand")
-   (match_op_dup 1 [(match_dup 2) (const_int 0)]))]
-  "TARGET_32BIT && reload_completed"
+  "&& reload_completed"
   [(set (match_dup 3)
(match_op_dup 1 [(match_dup 2) (const_int 0)]))
(set (match_dup 0)
(compare:CC (match_dup 3)
(const_int 0)))]
-  "")
+  ""
+  [(set_attr "type" "shift")
+   (set_attr "dot" "yes")
+   (set_attr "length" "8,16")])
 
 (define_insn ""
   [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
-- 
1.8.3.1



Re: [PATCHv2] Handle not explicitly zero terminated strings in merge sections

2018-10-05 Thread Bernd Edlinger
On 10/05/18 20:15, Andreas Schwab wrote:
> On Sep 14 2018, Bernd Edlinger  wrote:
> 
>> diff -Npur gcc/testsuite/gnat.dg/string_merge1.adb 
>> gcc/testsuite/gnat.dg/string_merge1.adb
>> --- gcc/testsuite/gnat.dg/string_merge1.adb  1970-01-01 01:00:00.0 
>> +0100
>> +++ gcc/testsuite/gnat.dg/string_merge1.adb  2018-08-26 16:31:12.650271931 
>> +0200
>> @@ -0,0 +1,19 @@
>> +-- { dg-do compile }
>> +-- { dg-options "-O1 -fmerge-all-constants" }
>> +
>> +procedure String_Merge1 is
>> +   procedure Process (X : String);
>> +   pragma Import (Ada, Process);
>> +begin
>> +   Process ("ABCD");
>> +end;
>> +
>> +-- We expect something like:
>> +
>> +-- .section  .rodata.str1.1,"aMS",@progbits,1
>> +-- .LC1:
>> +-- .string "ABCD"
>> +
>> +-- { dg-final { scan-assembler-times "\\.rodata\\.str" 1 } }
>> +-- { dg-final { scan-assembler-times "\\.string" 1 } }
>> +-- { dg-final { scan-assembler-times "\"ABCD\"" 1 } }
> 
> FAIL: gnat.dg/string_merge1.adb scan-assembler-times \\.string 1
> 
> $ grep ABCD string_merge1.s
>  stringz "ABCD"
> 

Ah, thanks.

Turns out there are too many variations, such as the stringz mentioned above,
and asciz, and probably lots more here.

But for the purpose of testing the optimization it should be sufficient to look
for ".rodata.str" in the assembler output.

So I committed the following as obvious:

Index: gnat.dg/string_merge2.adb
===
--- gnat.dg/string_merge2.adb   (Revision 264887)
+++ gnat.dg/string_merge2.adb   (Revision 264888)
@@ -15,5 +15,3 @@
  -- .string "ABCD"
  
  -- { dg-final { scan-assembler-times "\\.rodata\\.str" 1 } }
--- { dg-final { scan-assembler-times "\\.string" 1 } }
--- { dg-final { scan-assembler-times "\"ABCD\"" 1 } }
Index: gnat.dg/string_merge1.adb
===
--- gnat.dg/string_merge1.adb   (Revision 264887)
+++ gnat.dg/string_merge1.adb   (Revision 264888)
@@ -15,5 +15,3 @@
  -- .string "ABCD"
  
  -- { dg-final { scan-assembler-times "\\.rodata\\.str" 1 } }
--- { dg-final { scan-assembler-times "\\.string" 1 } }
--- { dg-final { scan-assembler-times "\"ABCD\"" 1 } }
Index: ChangeLog
===
--- ChangeLog   (Revision 264887)
+++ ChangeLog   (Revision 264888)
@@ -1,3 +1,8 @@
+2018-10-05  Bernd Edlinger  
+
+   * gnat.dg/string_merge1.adb: Fix test expectations.
+   * gnat.dg/string_merge2.adb: Likewise.
+
  2018-10-05  David Malcolm  
  
PR c++/56856



Thanks
Bernd.


Re: [PATCH v2, rs6000] 2/2 Add x86 SSE3 intrinsics to GCC PPC64LE target

2018-10-05 Thread Segher Boessenkool
On Fri, Oct 05, 2018 at 12:59:14PM -0500, Paul Clarke wrote:
> This is part 2/2 for contributing PPC64LE support for X86 SSE3
> intrinsics. This patch includes testsuite/gcc.target tests for the
> intrinsics defined in pmmintrin.h. 
> 
> Tested on POWER8 ppc64le and ppc64 (-m64 and -m32, the latter only reporting
> 10 new unsupported tests.)
> 
> [gcc/testsuite]
> 
> 2018-10-01  Paul A. Clarke  
> 
>   * gcc.target/powerpc/sse3-check.h: New file.
>   * gcc.target/powerpc/sse3-addsubps.c: New file.
>   * gcc.target/powerpc/sse3-addsubpd.c: New file.
>   * gcc.target/powerpc/sse3-haddps.c: New file.
>   * gcc.target/powerpc/sse3-hsubps.c: New file.
>   * gcc.target/powerpc/sse3-haddpd.c: New file.
>   * gcc.target/powerpc/sse3-hsubpd.c: New file.
>   * gcc.target/powerpc/sse3-lddqu.c: New file.
>   * gcc.target/powerpc/sse3-movsldup.c: New file.
>   * gcc.target/powerpc/sse3-movshdup.c: New file.
>   * gcc.target/powerpc/sse3-movddup.c: New file.
>   * gcc.target/powerpc/pr37191.c: New file.

It seems you posted the patch to most files twice, but I trust you'll
sort it out (looks fine otherwise).  Okay for trunk.  Thanks!


Segher


[PATCH, testsuite] memchr-1.c wide char and AIX

2018-10-05 Thread David Edelsohn
memchr-1.c tests for char (test_narrow) and wchar (test_wide).  The
wide character test assumes 32 bit wide character, while 32 bit AIX
uses 16 bit wide character.  This assumption causes the wide character
part of the test to fail in 32 bit mode on AIX (it succeeds on 64 bit
AIX).

The testcase already includes ifdefs for endianness.  The "narrow"
part of the test succeeds and is a useful test on AIX.  My proposed
solution adds an AIX-specific ifdef in the testcase to avoid the
compile-time errors in 32 bit mode.

Because of the structure of the testcase, I need to #ifdef test_wide()
and its constants, and separately its invocation in main(), as
opposed to making test_wide() a no-op that is called.

Another alternative is to split memchr-1.c into memchr-1.c for
test_narrow and memchr-2.c for test_wide, with the latter skipped on
AIX using a DejaGNU directive.

Is the #ifdef okay or would others prefer that I split the testcase?
No solution is particularly elegant.

Thanks, David

* gcc.c-torture/execute/memchr-1.c (test_wide): Skip on 32 bit AIX.

Index: memchr-1.c
===
--- memchr-1.c  (revision 264869)
+++ memchr-1.c  (working copy)
@@ -106,6 +106,7 @@
   A (memchr (&s5_3[1][i0], 0, sizeof s5_3[1] - i0) == &s5_3[1][4]);
 }

+#if !defined(_AIX) || defined(__64BIT__)
 static const wchar_t wc = L'1';
 static const wchar_t ws1[] = L"1";
 static const wchar_t ws4[] = L"\x00123456\x12005678\x12340078\x12345600";
@@ -144,10 +145,13 @@
   A (memchr (&ws4[3], 0, nb - 3 * nwb) == pws4 + 3 * nwb + 3);
 #endif
 }
+#endif


 int main ()
 {
   test_narrow ();
+#if !defined(_AIX) || defined(__64BIT__)
   test_wide ();
+#endif
 }


[PATCH, i386]: Remove cmp_*_cc_i387 FP compare patterns

2018-10-05 Thread Uros Bizjak
These are ineffective, since no pass considers compares inside
PARALLELs. They just weather through all pre-reload passes and then
split to an fcom+fnstsw/sahf sequence. We may as well expand compares to
the above sequence at expand time.

2018-10-05  Uros Bizjak  

* config/i386/i386.md (*cmpxf_cc_i387): Remove pattern.
(*cmp_cc_i387): Ditto.
(*cmpu_cc_i387): Ditto.
(*cmp__cc_i387): Ditto.
* config/i386/i386.c (ix86_expand_fp_compare): Remove
"scratch" argument.
: Do not generate pattern with HImode clobber.
Emit x86_sahf_1 pattern.
(ix86_expand_compare): Update call to ix86_expand_fp_compare.
(ix86_expand_carry_flag_compare): Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
Index: config/i386/i386.c
===
--- config/i386/i386.c  (revision 264875)
+++ config/i386/i386.c  (working copy)
@@ -22258,45 +22258,38 @@ ix86_fp_compare_code_to_integer (enum rtx_code cod
 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
 
 static rtx
-ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
+ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
 {
   bool unordered_compare = ix86_unordered_fp_compare (code);
-  machine_mode intcmp_mode;
-  rtx tmp, tmp2;
+  machine_mode cmp_mode;
+  rtx tmp, scratch;
 
   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
 
+  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
+  if (unordered_compare)
+tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+
   /* Do fcomi/sahf based test when profitable.  */
   switch (ix86_fp_comparison_strategy (code))
 {
 case IX86_FPCMP_COMI:
-  intcmp_mode = CCFPmode;
-  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
-  if (unordered_compare)
-   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+  cmp_mode = CCFPmode;
   emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
   break;
 
 case IX86_FPCMP_SAHF:
-  intcmp_mode = CCFPmode;
-  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
-  if (unordered_compare)
-   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
-  tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
-  if (!scratch)
-   scratch = gen_reg_rtx (HImode);
-  tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
-  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
+  cmp_mode = CCFPmode;
+  tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
+  scratch = gen_reg_rtx (HImode);
+  emit_insn (gen_rtx_SET (scratch, tmp));
+  emit_insn (gen_x86_sahf_1 (scratch));
   break;
 
 case IX86_FPCMP_ARITH:
-  /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first.  */
-  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
-  if (unordered_compare)
-   tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+  cmp_mode = CCNOmode;
   tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
-  if (!scratch)
-   scratch = gen_reg_rtx (HImode);
+  scratch = gen_reg_rtx (HImode);
   emit_insn (gen_rtx_SET (scratch, tmp));
 
   /* In the unordered case, we have to check C2 for NaN's, which
@@ -22304,7 +22297,6 @@ static rtx
 So do some bit twiddling on the value we've got in AH to come
 up with an appropriate set of condition codes.  */
 
-  intcmp_mode = CCNOmode;
   switch (code)
{
case GT:
@@ -22319,7 +22311,7 @@ static rtx
  emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
  emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
  emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
- intcmp_mode = CCmode;
+ cmp_mode = CCmode;
  code = GEU;
}
  break;
@@ -22329,7 +22321,7 @@ static rtx
{
  emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
  emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
- intcmp_mode = CCmode;
+ cmp_mode = CCmode;
  code = EQ;
}
  else
@@ -22359,7 +22351,7 @@ static rtx
  emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
  emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
  emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
- intcmp_mode = CCmode;
+ cmp_mode = CCmode;
  code = LTU;
}
  else
@@ -22374,7 +22366,7 @@ static rtx
{
  emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
  emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
- intcmp_mode = CCmode;
+ cmp_mode = CCmode;
  code = EQ;
}

Re: [PATCH 2/2 v3][IRA,LRA] Fix PR86939, IRA incorrectly creates an interference between a pseudo register and a hard register

2018-10-05 Thread Vladimir Makarov

On 10/05/2018 12:40 PM, Peter Bergner wrote:

On 10/4/18 3:01 PM, Vladimir Makarov wrote:

IMHO, the name copy_insn_p is too common and confusing (we already have
functions copy_insn and copy_insn_1 in GCC).  The name does not reflect its
result meaning.  I would call it something like non_conflict_copy_source_reg
although it is long.

I'm fine with renaming it.  I'm not sure I like the use of source reg in
the name even though it is what is returned.  That is just a convenience for
the caller of the function.  Its true purpose is recognizing whether INSN
is or is not a reg to reg copy for which we can ignore their interference.

OK.

How about is_reg_copy_insn_p() or is_reg_to_reg_copy_p() or ???

Personally I like the first name more.  But it is up to you.  I don't 
want to bother you anymore.



Also I would rename last_regno to bound_regno because it better reflects
the meaning of the variable's value, or at least to end_regno as it is a value
of the END_REGNO macro.

Ok, I went with end_regno, since that seems to be used elsewhere.


Great.

Thank you, Peter.


Re: [PATCHv2] Handle not explicitly zero terminated strings in merge sections

2018-10-05 Thread Andreas Schwab
On Sep 14 2018, Bernd Edlinger  wrote:

> diff -Npur gcc/testsuite/gnat.dg/string_merge1.adb 
> gcc/testsuite/gnat.dg/string_merge1.adb
> --- gcc/testsuite/gnat.dg/string_merge1.adb   1970-01-01 01:00:00.0 
> +0100
> +++ gcc/testsuite/gnat.dg/string_merge1.adb   2018-08-26 16:31:12.650271931 
> +0200
> @@ -0,0 +1,19 @@
> +-- { dg-do compile }
> +-- { dg-options "-O1 -fmerge-all-constants" }
> +
> +procedure String_Merge1 is
> +   procedure Process (X : String);
> +   pragma Import (Ada, Process);
> +begin
> +   Process ("ABCD");
> +end;
> +
> +-- We expect something like:
> +
> +-- .section  .rodata.str1.1,"aMS",@progbits,1
> +-- .LC1:
> +-- .string "ABCD"
> +
> +-- { dg-final { scan-assembler-times "\\.rodata\\.str" 1 } }
> +-- { dg-final { scan-assembler-times "\\.string" 1 } }
> +-- { dg-final { scan-assembler-times "\"ABCD\"" 1 } }

FAIL: gnat.dg/string_merge1.adb scan-assembler-times \\.string 1

$ grep ABCD string_merge1.s 
stringz "ABCD"

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."


[PATCH v2, rs6000] 2/2 Add x86 SSE3 intrinsics to GCC PPC64LE target

2018-10-05 Thread Paul Clarke
This is part 2/2 for contributing PPC64LE support for X86 SSE3
intrinsics. This patch includes testsuite/gcc.target tests for the
intrinsics defined in pmmintrin.h. 

Tested on POWER8 ppc64le and ppc64 (-m64 and -m32, the latter only reporting
10 new unsupported tests.)

[gcc/testsuite]

2018-10-01  Paul A. Clarke  

* gcc.target/powerpc/sse3-check.h: New file.
* gcc.target/powerpc/sse3-addsubps.c: New file.
* gcc.target/powerpc/sse3-addsubpd.c: New file.
* gcc.target/powerpc/sse3-haddps.c: New file.
* gcc.target/powerpc/sse3-hsubps.c: New file.
* gcc.target/powerpc/sse3-haddpd.c: New file.
* gcc.target/powerpc/sse3-hsubpd.c: New file.
* gcc.target/powerpc/sse3-lddqu.c: New file.
* gcc.target/powerpc/sse3-movsldup.c: New file.
* gcc.target/powerpc/sse3-movshdup.c: New file.
* gcc.target/powerpc/sse3-movddup.c: New file.
* gcc.target/powerpc/pr37191.c: New file.

v2: tested with -mcpu=power7; fixed up ChangeLog;
universally used "-mpower8-vector" (per Segher review);
changed "TEST" function declarations to be globally consistent

Index: gcc/testsuite/gcc.target/powerpc/pr37191.c
===
--- gcc/testsuite/gcc.target/powerpc/pr37191.c  (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/pr37191.c  (working copy)
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-options "-O3 -mpower8-vector" } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#define NO_WARN_X86_INTRINSICS 1
+
+#include 
+#include 
+#include 
+
+#if 0
+extern const uint64_t ff_bone;
+#endif
+
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, ptrdiff_t 
dst_stride, ptrdiff_t src_stride) {
+  __m64 row0 = _mm_cvtsi32_si64(*(unsigned*)(src + (0 * src_stride)));
+  __m64 row1 = _mm_cvtsi32_si64(*(unsigned*)(src + (1 * src_stride)));
+  __m64 row2 = _mm_cvtsi32_si64(*(unsigned*)(src + (2 * src_stride)));
+  __m64 row3 = _mm_cvtsi32_si64(*(unsigned*)(src + (3 * src_stride)));
+  __m64 tmp0 = _mm_unpacklo_pi8(row0, row1);
+  __m64 tmp1 = _mm_unpacklo_pi8(row2, row3);
+  __m64 row01 = _mm_unpacklo_pi16(tmp0, tmp1);
+  __m64 row23 = _mm_unpackhi_pi16(tmp0, tmp1);
+  *((unsigned*)(dst + (0 * dst_stride))) = _mm_cvtsi64_si32(row01);
+  *((unsigned*)(dst + (1 * dst_stride))) = 
_mm_cvtsi64_si32(_mm_unpackhi_pi32(row01, row01));
+  *((unsigned*)(dst + (2 * dst_stride))) = _mm_cvtsi64_si32(row23);
+  *((unsigned*)(dst + (3 * dst_stride))) = 
_mm_cvtsi64_si32(_mm_unpackhi_pi32(row23, row23));
+}
+
+#if 0
+static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int 
stride, int alpha1, int beta1)
+{
+asm volatile(
+""
+:: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
+   "m"(alpha1), "m"(beta1), "m"(ff_bone)
+);
+}
+#endif
+
+void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, 
int beta)
+{
+  uint8_t trans[8*4] __attribute__ ((aligned (8)));
+  transpose4x4(trans, pix-2, 8, stride);
+  transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+//h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
+  transpose4x4(pix-2, trans, stride, 8);
+  transpose4x4(pix-2+4*stride, trans+4, stride, 8);
+}
Index: gcc/testsuite/gcc.target/powerpc/sse3-addsubpd.c
===
--- gcc/testsuite/gcc.target/powerpc/sse3-addsubpd.c(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/sse3-addsubpd.c(working copy)
@@ -0,0 +1,101 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mpower8-vector -Wno-psabi" } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse3-check.h"
+#endif
+
+#include CHECK_H
+
+#ifndef TEST
+#define TEST sse3_test_addsubpd_1
+#endif
+
+#define NO_WARN_X86_INTRINSICS 1
+#include 
+
+static void
+sse3_test_addsubpd (double *i1, double *i2, double *r)
+{
+  __m128d t1 = _mm_loadu_pd (i1);
+  __m128d t2 = _mm_loadu_pd (i2);
+
+  t1 = _mm_addsub_pd (t1, t2);
+
+  _mm_storeu_pd (r, t1);
+}
+
+static void
+sse3_test_addsubpd_subsume (double *i1, double *i2, double *r)
+{
+  __m128d t1 = _mm_load_pd (i1);
+  __m128d t2 = _mm_load_pd (i2);
+
+  t1 = _mm_addsub_pd (t1, t2);
+
+  _mm_storeu_pd (r, t1);
+}
+
+static int
+chk_pd (double *v1, double *v2)
+{
+  int i;
+  int n_fails = 0;
+
+  for (i = 0; i < 2; i++)
+if (v1[i] != v2[i])
+  n_fails += 1;
+
+  return n_fails;
+}
+
+static double p1[2] __attribute__ ((aligned(16)));
+static double p2[2] __attribute__ ((aligned(16)));
+static double p3[2];
+static double ck[2];
+
+double vals[] =
+  {
+100.0,  200.0, 300.0, 400.0, 5.0, -1.0, .345, -21.5,
+1100.0, 0.235, 321.3, 53.40, 0.3, 10.0, 42.0, 32.52,
+32.6,   123.3, 1.234, 2.156, 0.1, 3.25, 4.75, 32.44,
+12.16,  52.34, 64.12, 71.13, -.1, 2.30

libgo patch committed: Use inline assembler for xgetbv

2018-10-05 Thread Ian Lance Taylor
This patch by Than McIntosh uses inline assembler instead of the
_xgetbv intrinsic, so that libgo can be built by compilers that don't
support the intrinsic.  Bootstrapped and ran Go testsuite on
x86_64-pc-linux-gnu.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===
--- gcc/go/gofrontend/MERGE (revision 264872)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-9f4cf23e716bcf65e071260afa032a64acd3fdde
+d0739c13ca3686df1f8d0fae7c6c5caaed058503
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: libgo/go/internal/cpu/cpu_gccgo.c
===
--- libgo/go/internal/cpu/cpu_gccgo.c   (revision 264813)
+++ libgo/go/internal/cpu/cpu_gccgo.c   (working copy)
@@ -52,12 +52,18 @@ struct xgetbv_ret xgetbv(void)
 #pragma GCC target("xsave")
 
 struct xgetbv_ret xgetbv(void) {
-   long long r;
struct xgetbv_ret ret;
 
-   r = _xgetbv(0);
-   ret.eax = r & 0x;
-   ret.edx = r >> 32;
+// At some point, use call to _xgetbv() instead:
+//
+//   long long r = _xgetbv(0);
+//   ret.eax = r & 0x;
+//   ret.edx = r >> 32;
+//
+unsigned int __eax, __edx, __xcr_no = 0;
+__asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+ret.eax = __eax;
+ret.edx = __edx;
return ret;
 }
 


[PATCH, pdp11] remove -mfloat32, -mfloat64

2018-10-05 Thread Paul Koning
This patch removes switches that allow the size of "float" to be either the 
usual 4, or 8 -- which is also the size of "double".  That second choice 
creates problems for Fortran and violates the Fortran standard.  I don't see a 
reason for having the option; it certainly is not a familiar thing to do on 
this machine.

Committed.

paul

ChangeLog:

2018-10-05  Paul Koning  

* config/pdp11/pdp11.h (FLOAT_TYPE_SIZE): Always 32.
* config/pdp11/pdp11.opt (mfloat32): Remove.
(mfloat64): Remove.
* doc/invoke.texi (pdp11 -mfloat32): Remove.
(pdp11 -mfloat64): Remove.

Index: doc/invoke.texi
===
--- doc/invoke.texi (revision 264880)
+++ doc/invoke.texi (revision 264881)
@@ -1007,7 +1007,6 @@ Objective-C and Objective-C++ Dialects}.
 @emph{PDP-11 Options}
 @gccoptlist{-mfpu  -msoft-float  -mac0  -mno-ac0  -m40  -m45  -m10 @gol
 -mint32  -mno-int16 -mint16  -mno-int32 @gol
--mfloat32  -mno-float64 -mfloat64  -mno-float32 @gol
 -msplit -munix-asm  -mdec-asm -mgnu-asm -mlra}
 
 @emph{picoChip Options}
@@ -22722,18 +22721,6 @@ Use 16-bit @code{int}.  This is the default.
 @opindex mno-int16
 Use 32-bit @code{int}.
 
-@item -mfloat64
-@itemx -mno-float32
-@opindex mfloat64
-@opindex mno-float32
-Use 64-bit @code{float}.  This is the default.
-
-@item -mfloat32
-@itemx -mno-float64
-@opindex mfloat32
-@opindex mno-float64
-Use 32-bit @code{float}.
-
 @item -msplit
 @opindex msplit
 Target has split instruction and data space.  Implies -m45.
Index: config/pdp11/pdp11.opt
===
--- config/pdp11/pdp11.opt  (revision 264880)
+++ config/pdp11/pdp11.opt  (revision 264881)
@@ -42,14 +42,6 @@ mgnu-asm
 Target RejectNegative Report Mask(GNU_ASM) Negative(munix-asm)
 Use the GNU assembler syntax.
 
-mfloat32
-Target Report Mask(FLOAT32)
-Use 32 bit float.
-
-mfloat64
-Target Report InverseMask(FLOAT32, FLOAT64)
-Use 64 bit float.
-
 mfpu
 Target RejectNegative Report Mask(FPU)
 Use hardware floating point.
Index: config/pdp11/pdp11.h
===
--- config/pdp11/pdp11.h(revision 264880)
+++ config/pdp11/pdp11.h(revision 264881)
@@ -59,12 +59,14 @@ along with GCC; see the file COPYING3.  If not see
 #define LONG_TYPE_SIZE 32
 #define LONG_LONG_TYPE_SIZE64 
 
-/* if we set FLOAT_TYPE_SIZE to 32, we could have the benefit 
-   of saving core for huge arrays - the definitions are 
-   already in md - but floats can never reside in 
-   an FPU register - we keep the FPU in double float mode 
-   all the time !! */
-#define FLOAT_TYPE_SIZE(TARGET_FLOAT32 ? 32 : 64)
+/* In earlier versions, FLOAT_TYPE_SIZE was selectable as 32 or 64,
+   but that conflicts with Fortran language rules.  Since there is no
+   obvious reason why we should have that feature -- other targets
+   generally don't have float and double the same size -- I've removed
+   it.  Note that it continues to be true (for now) that arithmetic is
+   always done with 64-bit values, i.e., the FPU is always in "double"
+   mode.  */
+#define FLOAT_TYPE_SIZE32
 #define DOUBLE_TYPE_SIZE   64
 #define LONG_DOUBLE_TYPE_SIZE  64
 
@@ -200,12 +202,11 @@ extern const struct real_format pdp11_d_format;
 
 MUL_REGS are used for odd numbered regs, to use in 16-bit multiplication
  (even numbered do 32-bit multiply)
-LMUL_REGS long multiply registers (even numbered regs )
- (don't need them, all 32-bit regs are even numbered!)
 GENERAL_REGS is all cpu
 LOAD_FPU_REGS is the first four cpu regs, they are easier to load
 NO_LOAD_FPU_REGS is ac4 and ac5, currently - difficult to load them
 FPU_REGS is all fpu regs 
+CC_REGS is the condition codes (CPU and FPU)
 */
 
 enum reg_class



Re: [PATCH 2/2] Support string locations for C++ in -Wformat (PR c++/56856)

2018-10-05 Thread Jeff Law
On 10/4/18 9:00 AM, David Malcolm wrote:
> -Wformat in the C++ FE doesn't work as well as it could:
> (a) it doesn't report precise locations within the string literal, and
> (b) it doesn't underline arguments for those arguments !CAN_HAVE_LOCATION_P,
> despite having location wrapper nodes.
> 
> For example:
> 
>   Wformat-ranges.C:32:10: warning: format '%s' expects argument of type 
> 'char*', but argument 2 has type 'int' [-Wformat=]
>   32 |   printf("hello %s", 42);
>  |  ^~
> 
> (a) is due to not wiring up the langhook for extracting substring
> locations.
> 
> This patch uses the one in c-family; it also fixes string literal
> parsing so that it records string concatenations (needed for
> extracting substring locations from concatenated strings).
> 
> (b) is due to the call to maybe_constant_value here:
>fargs[j] = maybe_constant_value (argarray[j]);
> within build_over_call.
> 
> The patch fixes this by building a vec of location_t values when
> calling check_function_arguments.
> I attempted to eliminate the maybe_constant_value call here, but
> it's needed by e.g. check_function_sentinel for detecting NULL,
> and that code is in "c-family", so it can't simply call into
> maybe_constant_value (which is in "cp").
> 
> With this patch, the output for the above example is improved to:
> 
>   Wformat-ranges.C:32:18: warning: format '%s' expects argument of type 
> 'char*', but argument 2 has type 'int' [-Wformat=]
>   32 |   printf("hello %s", 42);
>  | ~^   ~~
>  |  |   |
>  |  |   int
>  |  char*
>  | %d
> 
> Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu (on top of
> the multiline.exp patch).
> 
> OK for trunk?
> 
> gcc/cp/ChangeLog:
>   PR c++/56856
>   * call.c (build_over_call): Build a vec of locations of the
>   arguments before the call to maybe_constant_value, and pass to
>   check_function_arguments.
>   * cp-lang.c (LANG_HOOKS_GET_SUBSTRING_LOCATION): Define as
>   c_get_substring_location.
>   * parser.c (cp_parser_string_literal): Capture string
>   concatenation locations.
> 
> gcc/ChangeLog:
>   PR c++/56856
>   * input.c (expand_location_to_spelling_point): Add param "aspect"
>   and use rather than hardcoding LOCATION_ASPECT_CARET.
>   (get_substring_ranges_for_loc): Handle the case of a single token
>   within a macro expansion.
>   * input.h (expand_location_to_spelling_point): Add "aspect" param,
>   defaulting to LOCATION_ASPECT_CARET.
> 
> gcc/testsuite/ChangeLog:
>   PR c++/56856
>   * g++.dg/ext/builtin4.C: Set expected location for warning to the
>   correct location within the format string.
>   * g++.dg/plugin/plugin.exp (plugin_test_list): Add the plugin and
>   files for testing locations within string literal locations from
>   the C frontend.
>   * g++.dg/warn/Wformat-method.C: New test.
>   * g++.dg/warn/Wformat-pr71863.C: New test.
>   * g++.dg/warn/Wformat-ranges-c++11.C: New test.
>   * g++.dg/warn/Wformat-ranges.C: New test, based on
>   gcc.dg/format/diagnostic-ranges.c.
>   * gcc.dg/plugin/diagnostic-test-string-literals-1.c
>   (test_multitoken_macro): Generalize expected output to work with
>   both C and C++.
>   * gcc.dg/plugin/diagnostic-test-string-literals-2.c
>   (test_stringified_token_1): Likewise.
>   (test_stringified_token_3): Likewise.
I typically leave the C++ bits to others, but this looks fine to me.

jeff


Re: [PATCH 1/2] testsuite: multiline.exp: implement optional target/xfail selector

2018-10-05 Thread Jeff Law
On 10/4/18 9:00 AM, David Malcolm wrote:
> Successfully regrtested on x86_64-pc-linux-gnu.
> 
> OK for trunk?
> 
> gcc/testsuite/ChangeLog:
>   * lib/multiline.exp (proc dg-end-multiline-output): Check argument
>   count.  If there's a 3rd argument, use dg-process-target on it,
>   bailing out, or recording expected failures as "maybe_x".
>   (proc handle-multiline-outputs): Extract "maybe_x", and use it
>   to convert pass/fail into xpass/xfail.
OK
jeff


[PATCH, i386]: Merge ftest insn patterns with FP compare insn patterns

2018-10-05 Thread Uros Bizjak
Now that we have a universal "C" constraint, we can use it in FP compare
insn patterns to merge ftest insn patterns.

2018-10-05  Uros Bizjak  

* config/i386/i386.md (*cmpxf_i387): Change operand 2 predicate
to reg_or_0_operand.  Add "C" constraint.
(*cmpxf_cc_i387): Ditto.
(*cmp_i387): Change operand 2 predicate
to nonimm_or_0_operand.  Add "C" constraint.
(*cmp_cc_i387): Ditto.
(*cmp_0_i387): Remove insn pattern.
(*cmp_0_cc_i387): Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 122e57f98cc4..d7afb6a0bdaf 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1461,52 +1461,18 @@
   DONE;
 })
 
-
 ;; FP compares, step 1:
 ;; Set the FP condition codes and move fpsr to ax.
 
 ;; We may not use "#" to split and emit these
 ;; due to reg-stack pops killing fpsr.
 
-(define_insn "*cmp_0_i387"
-  [(set (match_operand:HI 0 "register_operand" "=a")
-   (unspec:HI
- [(compare:CCFP
-(match_operand:X87MODEF 1 "register_operand" "f")
-(match_operand:X87MODEF 2 "const0_operand"))]
-   UNSPEC_FNSTSW))]
-  "TARGET_80387"
-  "* return output_fp_compare (insn, operands, false, false);"
-  [(set_attr "type" "multi")
-   (set_attr "unit" "i387")
-   (set_attr "mode" "")])
-
-(define_insn_and_split "*cmp_0_cc_i387"
-  [(set (reg:CCFP FLAGS_REG)
-   (compare:CCFP
- (match_operand:X87MODEF 1 "register_operand" "f")
- (match_operand:X87MODEF 2 "const0_operand")))
-   (clobber (match_operand:HI 0 "register_operand" "=a"))]
-  "TARGET_80387 && TARGET_SAHF && !TARGET_CMOVE"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0)
-   (unspec:HI
- [(compare:CCFP (match_dup 1)(match_dup 2))]
-   UNSPEC_FNSTSW))
-   (set (reg:CC FLAGS_REG)
-   (unspec:CC [(match_dup 0)] UNSPEC_SAHF))]
-  ""
-  [(set_attr "type" "multi")
-   (set_attr "unit" "i387")
-   (set_attr "mode" "")])
-
 (define_insn "*cmpxf_i387"
   [(set (match_operand:HI 0 "register_operand" "=a")
(unspec:HI
  [(compare:CCFP
 (match_operand:XF 1 "register_operand" "f")
-(match_operand:XF 2 "register_operand" "f"))]
+(match_operand:XF 2 "reg_or_0_operand" "fC"))]
  UNSPEC_FNSTSW))]
   "TARGET_80387"
   "* return output_fp_compare (insn, operands, false, false);"
@@ -1518,7 +1484,7 @@
   [(set (reg:CCFP FLAGS_REG)
(compare:CCFP
  (match_operand:XF 1 "register_operand" "f")
- (match_operand:XF 2 "register_operand" "f")))
+ (match_operand:XF 2 "reg_or_0_operand" "fC")))
(clobber (match_operand:HI 0 "register_operand" "=a"))]
   "TARGET_80387 && TARGET_SAHF && !TARGET_CMOVE"
   "#"
@@ -1539,7 +1505,7 @@
(unspec:HI
  [(compare:CCFP
 (match_operand:MODEF 1 "register_operand" "f")
-(match_operand:MODEF 2 "nonimmediate_operand" "fm"))]
+(match_operand:MODEF 2 "nonimm_or_0_operand" "fmC"))]
  UNSPEC_FNSTSW))]
   "TARGET_80387"
   "* return output_fp_compare (insn, operands, false, false);"
@@ -1551,7 +1517,7 @@
   [(set (reg:CCFP FLAGS_REG)
(compare:CCFP
  (match_operand:MODEF 1 "register_operand" "f")
- (match_operand:MODEF 2 "nonimmediate_operand" "fm")))
+ (match_operand:MODEF 2 "nonimm_or_0_operand" "fmC")))
(clobber (match_operand:HI 0 "register_operand" "=a"))]
   "TARGET_80387 && TARGET_SAHF && !TARGET_CMOVE"
   "#"


Re: [PATCH, rs6000] 2/2 Add x86 SSE3 intrinsics to GCC PPC64LE target

2018-10-05 Thread Segher Boessenkool
On Fri, Oct 05, 2018 at 10:54:18AM -0500, Paul Clarke wrote:
> On 10/05/2018 04:20 AM, Segher Boessenkool wrote:
> >> @@ -0,0 +1,49 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
> >> +/* { dg-options "-O3 -mdirect-move" } */
> > 
> > -mdirect-move is deprecated and doesn't do anything.  You want -mcpu=power8
> > if you want to enable power8 instructions.  (Or -mpower8-vector also works,
> > for the time being anyway, but it is not preferred).
> 
> All of the gcc/testsuite/gcc.target/powerpc/sse2*.c use "-mpower8-vector".  
> Shall I use that, or "-mcpu=power8"?

Ah right.  No, just keep it all the same, it is easiest.

> > Have you tested this with -mcpu= an older cpu?  Did that work?  (It won't
> > _do_ much of course, but are there extra unexpected errors, etc.)
> 
> I just did, at your urging.  Seems OK.

Nice, thanks.

> >> +/* { dg-require-effective-target lp64 } */
> > 
> > Do these tests actually need this?  For what, then?
> 
> All of the gcc/testsuite/gcc.target/powerpc/sse2*.c use it.  I will profess 
> my ignorance.  Should it be used?

It means this test will only run on 64-bit compiles.  As long as we allow
the header to be used on 32-bit compiles (or on BE, etc.), preventing it
from being tested there is not so great.

But if all the existing things do this, it's fine to follow suit.


Segher


Re: [PATCH, GCC/ARM] Fix PR87374: ICE with -mslow-flash-data and -mword-relocations

2018-10-05 Thread Thomas Preudhomme
Hi Ramana and Kyrill,

I've reworked the patch to add some documentation of the option
conflict and reworked the -mword-relocation logic slightly to set the
variable explicitly in PIC mode rather than test for PIC and word
relocation everywhere.

ChangeLog entries are now as follows:

*** gcc/ChangeLog ***

2018-10-02  Thomas Preud'homme  

PR target/87374
* config/arm/arm.c (arm_option_check_internal): Disable the combined
use of -mslow-flash-data and -mword-relocations.
(arm_option_override): Enable -mword-relocations if -fpic or -fPIC.
* config/arm/arm.md (SYMBOL_REF MOVT splitter): Stop checking for
flag_pic.
* doc/invoke.texi (-mword-relocations): Mention conflict with
-mslow-flash-data.
(-mslow-flash-data): Reciprocally.

*** gcc/testsuite/ChangeLog ***

2018-09-25  Thomas Preud'homme  

PR target/87374
* gcc.target/arm/movdi_movt.c: Skip if both -mslow-flash-data and
-mword-relocations would be passed when compiling the test.
* gcc.target/arm/movsi_movt.c: Likewise.
* gcc.target/arm/pr81863.c: Likewise.
* gcc.target/arm/thumb2-slow-flash-data-1.c: Likewise.
* gcc.target/arm/thumb2-slow-flash-data-2.c: Likewise.
* gcc.target/arm/thumb2-slow-flash-data-3.c: Likewise.
* gcc.target/arm/thumb2-slow-flash-data-4.c: Likewise.
* gcc.target/arm/thumb2-slow-flash-data-5.c: Likewise.
* gcc.target/arm/tls-disable-literal-pool.c: Likewise.

Is this ok for trunk?

Best regards,

Thomas

On Tue, 2 Oct 2018 at 13:39, Ramana Radhakrishnan
 wrote:
>
> On 02/10/2018 11:42, Thomas Preudhomme wrote:
> > Hi Ramana,
> >
> > On Thu, 27 Sep 2018 at 11:14, Ramana Radhakrishnan
> >  wrote:
> >>
> >> On 27/09/2018 09:26, Kyrill Tkachov wrote:
> >>> Hi Thomas,
> >>>
> >>> On 26/09/18 18:39, Thomas Preudhomme wrote:
>  Hi,
> 
>  GCC ICEs under -mslow-flash-data and -mword-relocations because there
>  is no way to load an address, both literal pools and MOVW/MOVT being
>  forbidden. This patch gives an error message when both options are
>  specified by the user and adds the according dg-skip-if directives for
>  tests that use either of these options.
> 
>  ChangeLog entries are as follows:
> 
>  *** gcc/ChangeLog ***
> 
>  2018-09-25  Thomas Preud'homme  
> 
> PR target/87374
> * config/arm/arm.c (arm_option_check_internal): Disable the 
>  combined
> use of -mslow-flash-data and -mword-relocations.
> 
>  *** gcc/testsuite/ChangeLog ***
> 
>  2018-09-25  Thomas Preud'homme  
> 
> PR target/87374
> * gcc.target/arm/movdi_movt.c: Skip if both -mslow-flash-data and
> -mword-relocations would be passed when compiling the test.
> * gcc.target/arm/movsi_movt.c: Likewise.
> * gcc.target/arm/pr81863.c: Likewise.
> * gcc.target/arm/thumb2-slow-flash-data-1.c: Likewise.
> * gcc.target/arm/thumb2-slow-flash-data-2.c: Likewise.
> * gcc.target/arm/thumb2-slow-flash-data-3.c: Likewise.
> * gcc.target/arm/thumb2-slow-flash-data-4.c: Likewise.
> * gcc.target/arm/thumb2-slow-flash-data-5.c: Likewise.
> * gcc.target/arm/tls-disable-literal-pool.c: Likewise.
> 
> 
>  Testing: Bootstrapped in Thumb-2 mode. No testsuite regression when
>  targeting arm-none-eabi. Modified tests get skipped as expected when
>  running the testsuite with -mslow-flash-data (pr81863.c) or
>  -mword-relocations (all the others).
> 
> 
>  Is this ok for trunk? I'd also appreciate guidance on whether this is
>  worth a backport. It's a simple patch but on the other hand it only
>  prevents some option combination, it does not fix anything so I have
>  mixed feelings.
> >>>
> >>> In my opinion -mslow-flash-data is more of a tuning option rather than a 
> >>> security/ABI feature
> >>> and therefore erroring out on its combination with -mword-relocations 
> >>> feels odd.
> >>> I'm leaning more towards making -mword-relocations or any other option 
> >>> that really requires constant pools
> >>> to bypass/disable the effects of -mslow-flash-data instead.
> >>
> >> -mslow-flash-data and -mword-relocations are contradictory in their
> >> expectations. mslow-flash-data is for not putting anything in the
> >> literal pool whereas mword-relocations is purely around the use of movw
> >> / movt instructions for word sized values. I wish we had called
> >> -mslow-flash-data something else (probably -mno-literal-pools).
> >> -mslow-flash-data is used primarily by M-profile users and
> >> -mword-relocations IIUC was a point fix for use in the Linux kernel for
> >> module loads at a time when not all module loaders in the linux kernel
> >> were fixed for the movw / movt relocations and armv7-a / thumb2 was in
> >> it's infancy :). Thus they are used by different constituencies in
> >> general and I wouldn't see the

[PATCH, i386]: Do not depend "C" constraint on TARGET_SSE

2018-10-05 Thread Uros Bizjak
This constraint is used in move patterns which do not depend on TARGET_SSE.

Also, rename "vector_move_operand" to "nonimm_or_0_operand".

2018-10-05  Uros Bizjak  

* config/i386/constraints.md ("C"): Do not depend on TARGET_SSE.
* config/i386/predicates.md (nonimm_or_0_operand): Rename
from vector_move_operand.  Update all uses.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 5456564d3a03..41b8690aeaff 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -254,10 +254,9 @@
 
 ;; This can theoretically be any mode's CONST0_RTX.
 (define_constraint "C"
-  "SSE constant zero operand."
-  (and (match_test "TARGET_SSE")
-   (ior (match_test "op == const0_rtx")
-   (match_operand 0 "const0_operand"
+  "Constant zero operand."
+  (ior (match_test "op == const0_rtx")
+   (match_operand 0 "const0_operand")))
 
 ;; Constant-or-symbol-reference constraints.
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 122e57f98cc4..b8d4589bd0c8 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9816,7 +9816,7 @@
 (define_insn_and_split "copysign3_const"
   [(set (match_operand:CSGNMODE 0 "register_operand" "=Yv")
(unspec:CSGNMODE
- [(match_operand: 1 "vector_move_operand" "YvmC")
+ [(match_operand: 1 "nonimm_or_0_operand" "YvmC")
   (match_operand:CSGNMODE 2 "register_operand" "0")
   (match_operand: 3 "nonimmediate_operand" "Yvm")]
  UNSPEC_COPYSIGN))]
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index b1496f5405bf..539671ce4be5 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -79,7 +79,7 @@
 (define_insn "*mov_internal"
   [(set (match_operand:MMXMODE 0 "nonimmediate_operand"
 "=r ,o ,r,r ,m ,?!y,!y,?!y,m  ,r  ,?!y,v,v,v,m,r,v,!y,*x")
-   (match_operand:MMXMODE 1 "vector_move_operand"
+   (match_operand:MMXMODE 1 "nonimm_or_0_operand"
 "rCo,rC,C,rm,rC,C  ,!y,m  ,?!y,?!y,r  ,C,v,m,v,v,r,*x,!y"))]
   "TARGET_MMX
&& !(MEM_P (operands[0]) && MEM_P (operands[1]))"
@@ -582,7 +582,7 @@
   [(set (match_operand:V2SF 0 "register_operand" "=y,y")
(vec_concat:V2SF
  (match_operand:SF 1 "nonimmediate_operand" " 0,rm")
- (match_operand:SF 2 "vector_move_operand"  "ym,C")))]
+ (match_operand:SF 2 "nonimm_or_0_operand"  "ym,C")))]
   "TARGET_MMX && !TARGET_SSE"
   "@
punpckldq\t{%2, %0|%0, %2}
@@ -1276,7 +1276,7 @@
   [(set (match_operand:V2SI 0 "register_operand" "=y,y")
(vec_concat:V2SI
  (match_operand:SI 1 "nonimmediate_operand" " 0,rm")
- (match_operand:SI 2 "vector_move_operand"  "ym,C")))]
+ (match_operand:SI 2 "nonimm_or_0_operand"  "ym,C")))]
   "TARGET_MMX && !TARGET_SSE"
   "@
punpckldq\t{%2, %0|%0, %2}
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 221053f96b6d..bd262d77c6b7 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1042,11 +1042,6 @@
   (ior (match_operand 0 "register_operand")
(match_operand 0 "vector_memory_operand")))
 
-; Return true when OP is operand acceptable for standard SSE move.
-(define_predicate "vector_move_operand"
-  (ior (match_operand 0 "nonimmediate_operand")
-   (match_operand 0 "const0_operand")))
-
 ;; Return true when OP is either nonimmediate operand, or any
 ;; CONST_VECTOR.
 (define_predicate "nonimmediate_or_const_vector_operand"
@@ -1063,6 +1058,11 @@
   (ior (match_operand 0 "register_operand")
(match_operand 0 "const0_operand")))
 
+; Return true when OP is a nonimmediate or zero.
+(define_predicate "nonimm_or_0_operand"
+  (ior (match_operand 0 "nonimmediate_operand")
+   (match_operand 0 "const0_operand")))
+
 (define_predicate "norex_memory_operand"
   (and (match_operand 0 "memory_operand")
(not (match_test "x86_extended_reg_mentioned_p (op)"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ce26994f61fa..692959b16661 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1097,7 +1097,7 @@
   [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:V48_AVX512VL
  (match_operand:V48_AVX512VL 1 "nonimmediate_operand" "v,m")
- (match_operand:V48_AVX512VL 2 "vector_move_operand" "0C,0C")
+ (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand" "0C,0C")
  (match_operand: 3 "register_operand" "Yk,Yk")))]
   "TARGET_AVX512F"
 {
@@ -1125,7 +1125,7 @@
   [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI12_AVX512VL
  (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v,m")
- (match_operand:VI12_AVX512VL 2 "vector_move_operand" "0C,0C")
+ (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C,0C")
  (match_operand: 3 "register_

Re: [PATCH] RISC-V: Fix unordered float compare for Signaling NaN.

2018-10-05 Thread Jim Wilson
On Wed, Oct 3, 2018 at 11:37 AM Jim Wilson  wrote:
> Thanks.  I see that the rs6000.md port has a pattern that checks
> HONOR_SNANS which looks like the right way to solve this problem.  I
> will try modifying Andrew's patch to use that and check against the
> glibc testsuite.

With a modified version of Andrew's patch, the testcase in Kito's
patch works, and I see the glibc testsuite failures drop from 151 to
131, so there is still something wrong.  Last time I ran the glibc
testsuite we were around 50-60 failures without Andrew's patch, so
either something broke, or there are more glibc tests than before.  I
suspect we need another related gcc change as most of the failures are
FP related.  I'll try to pull out another testcase, and if it isn't a
bug in my current patch then I will push it and work on another one
for the new problem.

FYI, some of the failures could be due to a suspected kernel bug.
I've noticed some gcc testsuite failures are due to the fact that the
fenv flags aren't clear at program startup.  But the weird thing is
that the testcases fail when run from the shell, but work when run
from inside gdb, which makes it difficult to debug, and makes me
suspect a kernel bug.

fedora-riscv:1010$ cat tmp.c
#include <fenv.h>
#include <stdlib.h>

int
main (void)
{
  if (fetestexcept (FE_INEXACT))
abort ();
  return 0;
}
fedora-riscv:1011$ gcc tmp.c -lm
fedora-riscv:1012$ ./a.out
Aborted (core dumped)
fedora-riscv:1013$

It also fails if I put the test in a constructor, so the problem
happens before _start is called.  It also fails if static linked, so
it doesn't appear to be a dynamic linker problem either.  I've tried
looking at the kernel support for FP regs, but I'm not a kernel
expert, so I haven't made any progress there.

Jim


Re: [PATCH 2/2 v3][IRA,LRA] Fix PR86939, IRA incorrectly creates an interference between a pseudo register and a hard register

2018-10-05 Thread Peter Bergner
On 10/4/18 3:01 PM, Vladimir Makarov wrote:
> IMHO, the name copy_insn_p is too common and confusing (we already have
> functions copy_insn and copy_insn_1 in GCC).  The name does not reflect its
> result meaning.  I would call it something like non_conflict_copy_source_reg
> although it is long.

I'm fine with renaming it.  I'm not sure I like the use of source reg in
the name even though it is what is returned.  That is just a convenience for
the caller of the function.  Its true purpose is recognizing whether INSN
is or is not a reg to reg copy for which we can ignore their interference.

How about is_reg_copy_insn_p() or is_reg_to_reg_copy_p() or ???



> Also I would rename last_regno to bound_regno because it better reflects
> the variable's meaning, or at least to end_regno as it is a value of the
> END_REGNO macro.

Ok, I went with end_regno, since that seems to be used elsewhere.


Peter



Re: Don't ICE on vectors of enums (PR 87286)

2018-10-05 Thread Richard Biener
On October 5, 2018 2:48:24 PM GMT+02:00, Richard Sandiford 
 wrote:
>We've traditionally allowed vectors of enums (not sure if that's
>deliberate) but vector_types_compatible_elements_p checked for
>INTEGER_TYPE rather than INTEGRAL_TYPE_P.
>
>Tested on aarch64-linux-gnu.  OK to install?

OK. 

Richard. 

>Richard
>
>
>2018-10-05  Richard Sandiford  
>
>gcc/c-family/
>   PR c/87286
>   * c-common.c (vector_types_compatible_elements_p): Use
>   INTEGRAL_TYPE_P instead of checking only for INTEGER_TYPE.
>
>gcc/testsuite/
>   PR c/87286
>   * gcc.dg/pr87286.c: New test.
>
>Index: gcc/c-family/c-common.c
>===
>--- gcc/c-family/c-common.c2018-10-05 13:46:08.28787 +0100
>+++ gcc/c-family/c-common.c2018-10-05 13:47:08.291325001 +0100
>@@ -7465,8 +7465,11 @@ vector_types_compatible_elements_p (tree
> 
>   enum tree_code c1 = TREE_CODE (t1), c2 = TREE_CODE (t2);
> 
>-  gcc_assert ((c1 == INTEGER_TYPE || c1 == REAL_TYPE || c1 ==
>FIXED_POINT_TYPE)
>-&& (c2 == INTEGER_TYPE || c2 == REAL_TYPE
>+  gcc_assert ((INTEGRAL_TYPE_P (t1)
>+ || c1 == REAL_TYPE
>+ || c1 == FIXED_POINT_TYPE)
>+&& (INTEGRAL_TYPE_P (t2)
>+|| c2 == REAL_TYPE
> || c2 == FIXED_POINT_TYPE));
> 
>   t1 = c_common_signed_type (t1);
>@@ -7476,7 +7479,7 @@ vector_types_compatible_elements_p (tree
>   if (t1 == t2)
> return true;
>   if (opaque && c1 == c2
>-  && (c1 == INTEGER_TYPE || c1 == REAL_TYPE)
>+  && (INTEGRAL_TYPE_P (t1) || c1 == REAL_TYPE)
>   && TYPE_PRECISION (t1) == TYPE_PRECISION (t2))
> return true;
>   return false;
>Index: gcc/testsuite/gcc.dg/pr87286.c
>===
>--- /dev/null  2018-09-14 11:16:31.122530289 +0100
>+++ gcc/testsuite/gcc.dg/pr87286.c 2018-10-05 13:47:08.291325001 +0100
>@@ -0,0 +1,3 @@
>+enum foo { F };
>+typedef enum foo vec_foo __attribute__((vector_size (16)));
>+vec_foo add (vec_foo x, vec_foo y) { return x + y; }



Re: introduce --enable-mingw-full32 to default to --large-address-aware

2018-10-05 Thread Joseph Myers
A new configure option needs documenting in install.texi.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH, rs6000] 2/2 Add x86 SSE3 intrinsics to GCC PPC64LE target

2018-10-05 Thread Paul Clarke
On 10/05/2018 04:20 AM, Segher Boessenkool wrote:
> On Tue, Oct 02, 2018 at 09:12:07AM -0500, Paul Clarke wrote:
>> This is part 2/2 for contributing PPC64LE support for X86 SSE3
>> intrinsics. This patch includes testsuite/gcc.target tests for the
>> intrinsics defined in pmmintrin.h. 
>>
>> Tested on POWER8 ppc64le and ppc64 (-m64 and -m32, the latter only reporting
>> 10 new unsupported tests.)
>>
>> [gcc/testsuite]
>>
>> 2018-10-01  Paul A. Clarke  
>>
>>  * sse3-check.h: New file.
>>  * sse3-addsubps.h: New file.
>>  * sse3-addsubpd.h: New file.
>>  * sse3-haddps.h: New file.
>>  * sse3-hsubps.h: New file.
>>  * sse3-haddpd.h: New file.
>>  * sse3-hsubpd.h: New file.
>>  * sse3-lddqu.h: New file.
>>  * sse3-movsldup.h: New file.
>>  * sse3-movshdup.h: New file.
>>  * sse3-movddup.h: New file.
> 
> All these entries should have gcc.target/powerpc/ in the file name.

Ack.

>> --- gcc/testsuite/gcc.target/powerpc/pr37191.c   (nonexistent)
>> +++ gcc/testsuite/gcc.target/powerpc/pr37191.c   (working copy)
> 
> You need to mention this file in the changelog, too.

Ack.

>> @@ -0,0 +1,49 @@
>> +/* { dg-do compile } */
>> +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
>> +/* { dg-options "-O3 -mdirect-move" } */
> 
> -mdirect-move is deprecated and doesn't do anything.  You want -mcpu=power8
> if you want to enable power8 instructions.  (Or -mpower8-vector also works,
> for the time being anyway, but it is not preferred).

All of the gcc/testsuite/gcc.target/powerpc/sse2*.c use "-mpower8-vector".  
Shall I use that, or "-mcpu=power8"?

> Have you tested this with -mcpu= an older cpu?  Did that work?  (It won't
> _do_ much of course, but are there extra unexpected errors, etc.)

I just did, at your urging.  Seems OK.

>> +/* { dg-require-effective-target lp64 } */
> 
> Do these tests actually need this?  For what, then?

All of the gcc/testsuite/gcc.target/powerpc/sse2*.c use it.  I will profess my 
ignorance.  Should it be used?

PC



libgo patch committed: Remove checkgoarm function

2018-10-05 Thread Ian Lance Taylor
The checkgoarm function in libgo's runtime package is never called,
and the whole point of that function is to verify a goarm variable
that libgo never sets.  This patch removes the function.  Committed
to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===
--- gcc/go/gofrontend/MERGE (revision 264813)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-bde5ac90e0b4efdf3e9a4d72af4eb23250608611
+9f4cf23e716bcf65e071260afa032a64acd3fdde
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: libgo/go/runtime/os_linux_arm.go
===
--- libgo/go/runtime/os_linux_arm.go(revision 264813)
+++ libgo/go/runtime/os_linux_arm.go(working copy)
@@ -19,25 +19,6 @@ var armArch uint8 = 6 // we default to A
 var hwcap uint32  // set by archauxv
 var hardDiv bool  // set if a hardware divider is available
 
-func checkgoarm() {
-   // On Android, /proc/self/auxv might be unreadable and hwcap won't
-   // reflect the CPU capabilities. Assume that every Android arm device
-   // has the necessary floating point hardware available.
-   if GOOS == "android" {
-   return
-   }
-   if goarm > 5 && hwcap&_HWCAP_VFP == 0 {
-   print("runtime: this CPU has no floating point hardware, so it 
cannot run\n")
-   print("this GOARM=", goarm, " binary. Recompile using 
GOARM=5.\n")
-   exit(1)
-   }
-   if goarm > 6 && hwcap&_HWCAP_VFPv3 == 0 {
-   print("runtime: this CPU has no VFPv3 floating point hardware, 
so it cannot run\n")
-   print("this GOARM=", goarm, " binary. Recompile using GOARM=5 
or GOARM=6.\n")
-   exit(1)
-   }
-}
-
 func archauxv(tag, val uintptr) {
switch tag {
case _AT_RANDOM:


Re: [C++ Patch] PR 71128 ("[concepts] ICE on ill-formed explicit instantiation of a function concept")

2018-10-05 Thread Jason Merrill
OK.
On Fri, Oct 5, 2018 at 10:07 AM Paolo Carlini  wrote:
>
> Hi,
>
> another simple issue: here we ICE at the beginning of instantiate_decl
> when we try to explicitly instantiate a concept. Tested x86_64-linux.
>
> Thanks, Paolo.
>
> /
>


libbacktrace patch committed: backtrace_create_state should be called once

2018-10-05 Thread Ian Lance Taylor
This patch to libbacktrace expands the comment for
backtrace_create_state to make clear that it should be called only
once.  There is no backtrace_free_state function.  While it would be
nice to have such a function, it's hard to write completely accurately
as libbacktrace doesn't currently track all memory allocations.
Committed to mainline.

Ian

2018-10-05  Ian Lance Taylor  

PR libbacktrace/87529
* backtrace.h: Document that backtrace_create_state should be
called only once.
Index: backtrace.h
===
--- backtrace.h (revision 264813)
+++ backtrace.h (working copy)
@@ -92,7 +92,13 @@ typedef void (*backtrace_error_callback)
use appropriate atomic operations.  If THREADED is zero the state
may only be accessed by one thread at a time.  This returns a state
pointer on success, NULL on error.  If an error occurs, this will
-   call the ERROR_CALLBACK routine.  */
+   call the ERROR_CALLBACK routine.
+
+   Calling this function allocates resources that can not be freed.
+   There is no backtrace_free_state function.  The state is used to
+   cache information that is expensive to recompute.  Programs are
+   expected to call this function at most once and to save the return
+   value for all later calls to backtrace functions.  */
 
 extern struct backtrace_state *backtrace_create_state (
 const char *filename, int threaded,


[C++ Patch] PR 71128 ("[concepts] ICE on ill-formed explicit instantiation of a function concept")

2018-10-05 Thread Paolo Carlini

Hi,

another simple issue: here we ICE at the beginning of instantiate_decl 
when we try to explicitly instantiate a concept. Tested x86_64-linux.


Thanks, Paolo.

/

/cp
2018-10-05  Paolo Carlini  

PR c++/71128
* pt.c (do_decl_instantiation): Per 12.6.8/5, a concept cannot be
explicitly instantiated.

/testsuite
2018-10-05  Paolo Carlini  

PR c++/71128
* g++.dg/concepts/pr71128.C: New.
Index: cp/pt.c
===
--- cp/pt.c (revision 264862)
+++ cp/pt.c (working copy)
@@ -23127,6 +23127,14 @@ do_decl_instantiation (tree decl, tree storage)
   error ("explicit instantiation of non-template %q#D", decl);
   return;
 }
+  else if (DECL_DECLARED_CONCEPT_P (decl))
+{
+  if (VAR_P (decl))
+   error ("explicit instantiation of variable concept %q#D", decl);
+  else
+   error ("explicit instantiation of function concept %q#D", decl);
+  return;
+}
 
   bool var_templ = (DECL_TEMPLATE_INFO (decl)
 && variable_template_p (DECL_TI_TEMPLATE (decl)));
Index: testsuite/g++.dg/concepts/pr71128.C
===
--- testsuite/g++.dg/concepts/pr71128.C (nonexistent)
+++ testsuite/g++.dg/concepts/pr71128.C (working copy)
@@ -0,0 +1,10 @@
+// { dg-do compile { target c++14 } }
+// { dg-additional-options "-fconcepts" }
+
+template<typename T>
+concept bool C() { return true; }
+template bool C<int>();  // { dg-error "explicit instantiation of function concept" }
+
+template<typename T>
+concept bool D = true;
+template bool D<int>;  // { dg-error "explicit instantiation of variable concept" }


Re: [patch] various OpenACC reduction enhancements - ME and nvptx changes

2018-10-05 Thread Tom de Vries
On 6/29/18 8:19 PM, Cesar Philippidis wrote:
> The attached patch includes the nvptx and GCC ME reductions enhancements.
> 
> Is this patch OK for trunk? It bootstrapped / regression tested cleanly
> for x86_64 with nvptx offloading.
> 

These need fixing:
...
=== ERROR type #5: trailing whitespace (4 error(s)) ===
gcc/config/nvptx/nvptx.c:5139:0:██
gcc/config/nvptx/nvptx.c:5660:8:  do█
gcc/config/nvptx/nvptx.c:5702:0:██
gcc/config/nvptx/nvptx.c:5726:0:██
...


>   gcc/
>   * config/nvptx/nvptx.c (nvptx_propagate_unified): New.
>   (nvptx_split_blocks): Call it for cond_uni insn.
>   (nvptx_expand_cond_uni): New.
>   (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
>   (nvptx_init_builtins): Initialize it.
>   (nvptx_expand_builtin):
>   (nvptx_generate_vector_shuffle): Change integral SHIFT operand to
>   tree BITS operand.
>   (nvptx_vector_reduction): New.
>   (nvptx_adjust_reduction_type): New.
>   (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
>   (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
>   (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
>   Use it to adjust the type of ref_to_res.
>   (nvptx_goacc_reduction_teardown):
>   * config/nvptx/nvptx.md (cond_uni): New pattern.

> diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
> index 5608bee8a8d..33ec3db1153 100644
> --- a/gcc/config/nvptx/nvptx.c
> +++ b/gcc/config/nvptx/nvptx.c
> @@ -2863,6 +2863,52 @@ nvptx_reorg_uniform_simt ()
>  }
>  }
>  
> +/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
> +   mark that as unified.  We expect to be in a single block.  */
> +
> +static void
> +nvptx_propagate_unified (rtx_insn *unified)
> +{
> +  rtx_insn *probe = unified;
> +  rtx cond_reg = SET_DEST (PATTERN (unified));
> +  rtx pat = NULL_RTX;
> +
> +  /* Find the comparison.  (We could skip this and simply scan to the
> + block's terminating branch, if we didn't care for self
> + checking.)  */
> +  for (;;)
> +{
> +  probe = next_real_insn (probe);
> +  if (!probe)
> + break;
> +  pat = PATTERN (probe);
> +
> +  if (GET_CODE (pat) == SET
> +   && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
> +   && XEXP (SET_SRC (pat), 0) == cond_reg)
> + break;
> +  gcc_assert (NONJUMP_INSN_P (probe));
> +}
> +  gcc_assert (pat);
> +  rtx pred_reg = SET_DEST (pat);
> +
> +  /* Find the branch.  */
> +  do
> +probe = NEXT_INSN (probe);
> +  while (!JUMP_P (probe));
> +
> +  pat = PATTERN (probe);
> +  rtx itec = XEXP (SET_SRC (pat), 0);
> +  gcc_assert (XEXP (itec, 0) == pred_reg);
> +
> +  /* Mark the branch's condition as unified.  */
> +  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
> +UNSPEC_BR_UNIFIED);
> +  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
> +
> +  gcc_assert (ok);
> +}
> +
>  /* Loop structure of the function.  The entire function is described as
> a NULL loop.  */
>  
> @@ -2964,6 +3010,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
>   continue;
> switch (recog_memoized (insn))
>   {
> + case CODE_FOR_cond_uni:
> +   nvptx_propagate_unified (insn);
> +   /* FALLTHROUGH */
>   default:
> seen_insn = true;
> continue;
> @@ -5080,6 +5129,21 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
>return target;
>  }
>  
> +/* Expander for the compare unified builtin.  */
> +
> +static rtx
> +nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
> +{
> +  if (ignore)
> +return target;
> +  
> +  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
> +  NULL_RTX, mode, EXPAND_NORMAL);
> +
> +  emit_insn (gen_cond_uni (target, src));
> +
> +  return target;
> +}
>  
>  /* Codes for all the NVPTX builtins.  */
>  enum nvptx_builtins
> @@ -5089,6 +5153,7 @@ enum nvptx_builtins
>NVPTX_BUILTIN_WORKER_ADDR,
>NVPTX_BUILTIN_CMP_SWAP,
>NVPTX_BUILTIN_CMP_SWAPLL,
> +  NVPTX_BUILTIN_COND_UNI,
>NVPTX_BUILTIN_MAX
>  };
>  
> @@ -5126,6 +5191,7 @@ nvptx_init_builtins (void)
> (PTRVOID, ST, UINT, UINT, NULL_TREE));
>DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
>DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, 
> NULL_TREE));
> +  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, 
> NULL_TREE));
>  
>  #undef DEF
>  #undef ST
> @@ -5158,6 +5224,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx 
> ARG_UNUSED (subtarget),
>  case NVPTX_BUILTIN_CMP_SWAPLL:
>return nvptx_expand_cmp_swap (exp, target, mode, ignore);
>  
> +case NVPTX_BUILTIN_COND_UNI:
> +  return nvptx_expand_cond_uni (exp, target, mode, ignore);
> +
>  default: gcc_unreachable ();
>  }
>  }
> @@ -5284,7 +5353,7 @@ nvptx_get_worker_red_addr (tree type, tree offset)
>

Re: [PATCH, OpenACC] Add support for gang local storage allocation in shared memory

2018-10-05 Thread Tom de Vries
On 8/16/18 5:46 PM, Julian Brown wrote:
> On Wed, 15 Aug 2018 21:56:54 +0200
> Bernhard Reutner-Fischer  wrote:
> 
>> On 15 August 2018 18:46:37 CEST, Julian Brown
>>  wrote:
>>> On Mon, 13 Aug 2018 12:06:21 -0700
>>> Cesar Philippidis  wrote:  
>>
>> atttribute has more t than strictly necessary. 
>> Don't like signed integer levels where they should be some unsigned. 
>> Also don't like single switch cases instead of if.
>> And omitting function comments even if the hook way above is
>> documented may be ok ish but is a bit lazy ;)
> 
> Here's a new version with those comments addressed. I also changed the
> logic around a little to avoid adding decls to the vec in omp_context
> which would never be given the gang-private attribute.
> 
> Re-tested with offloading to NVPTX.
> 
> OK?

As far as the nvptx part is concerned, I see:
...
=== ERROR type #4: trailing operator (1 error(s)) ===
gcc/config/nvptx/nvptx.c:5946:27: gangprivate_shared_size =
...

Otherwise, the nvptx part is OK.

Thanks,
- Tom

> 
> Julian
> 
> 2018-08-10  Julian Brown  
> Chung-Lin Tang  
> 
> gcc/
> * config/nvptx/nvptx.c (tree-hash-traits.h): Include.
> (gangprivate_shared_size): New global variable.
> (gangprivate_shared_align): Likewise.
> (gangprivate_shared_sym): Likewise.
> (gangprivate_shared_hmap): Likewise.
> (nvptx_option_override): Initialize gangprivate_shared_sym,
> gangprivate_shared_align.
> (nvptx_file_end): Output gangprivate_shared_sym.
> (nvptx_goacc_expand_accel_var): New function.
> (nvptx_set_current_function): New function.
> (TARGET_SET_CURRENT_FUNCTION): Define hook.
> (TARGET_GOACC_EXPAND_ACCEL): Likewise.
> * doc/tm.texi (TARGET_GOACC_EXPAND_ACCEL_VAR): Document new hook.
> * doc/tm.texi.in (TARGET_GOACC_EXPAND_ACCEL_VAR): Likewise.
> * expr.c (expand_expr_real_1): Remap decls marked with the
> "oacc gangprivate" attribute.
> * omp-low.c (omp_context): Add oacc_partitioning_level and
> oacc_addressable_var_decls fields.
> (new_omp_context): Initialize oacc_addressable_var_decls in new
> omp_context.
> (delete_omp_context): Delete oacc_addressable_var_decls in old
> omp_context.
> (lower_oacc_head_tail): Record partitioning-level count in omp 
> context.
> (oacc_record_private_var_clauses, oacc_record_vars_in_bind)
> (mark_oacc_gangprivate): New functions.
> (lower_omp_for): Call oacc_record_private_var_clauses with "for"
> clauses.  Call mark_oacc_gangprivate for gang-partitioned loops.
> (lower_omp_target): Call oacc_record_private_var_clauses with "target"
> clauses.
> Call mark_oacc_gangprivate for offloaded target regions.
> (lower_omp_1): Call vars_in_bind for GIMPLE_BIND within OMP regions.
> * target.def (expand_accel_var): New hook.
> 
> libgomp/
> * testsuite/libgomp.oacc-c-c++-common/gang-private-1.c: New test.
> * testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c: New test.
> * testsuite/libgomp.oacc-c/pr85465.c: New test.
> * testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90: New test.
> 


Re: [PATCH 6/6, OpenACC, libgomp] Async re-work, nvptx changes

2018-10-05 Thread Tom de Vries
On 9/25/18 3:11 PM, Chung-Lin Tang wrote:
> Hi Tom,
> this patch removes large portions of plugin/plugin-nvptx.c, since a lot
> of it is
> now in oacc-async.c.

Yay!

> The new code is essentially a
> NVPTX/CUDA-specific implementation
> of the new-style goacc_asyncqueues.
> 
> Also, some needed functions in cuda-lib.def are added. The cuda.h
> function has also
> been updated to build independently without a CUDA installation.
> 

I see these formatting issues:
...
$ check_GNU_style.sh async-06.nvptx.patch

There should be exactly one space between function name and parenthesis.
35:+CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *,
unsigned int);

Trailing operator.
1320:+  struct nvptx_thread *nvthd =
...

Otherwise, OK.

Thanks,
- Tom


> Thanks,
> Chung-Lin
> 
> * plugin/plugin-nvptx.c (struct cuda_map): Remove.
> (struct ptx_stream): Remove.
> (struct nvptx_thread): Remove current_stream field.
> (cuda_map_create): Remove.
> (cuda_map_destroy): Remove.
> (map_init): Remove.
> (map_fini): Remove.
> (map_pop): Remove.
> (map_push): Remove.
> (struct goacc_asyncqueue): Define.
> (struct nvptx_callback): Define.
> (struct ptx_free_block): Define.
> (struct ptx_device): Remove null_stream, active_streams, async_streams,
> stream_lock, and next fields.
> (enum ptx_event_type): Remove.
> (struct ptx_event): Remove.
> (ptx_event_lock): Remove.
> (ptx_events): Remove.
> (init_streams_for_device): Remove.
> (fini_streams_for_device): Remove.
> (select_stream_for_async): Remove.
> (nvptx_init): Remove ptx_events and ptx_event_lock references.
> (nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED
> case.
> (nvptx_open_device): Add free_blocks initialization, remove
> init_streams_for_device call.
> (nvptx_close_device): Remove fini_streams_for_device call, add
> free_blocks destruct code.
> (event_gc): Remove.
> (event_add): Remove.
> (nvptx_exec): Adjust parameters and code.
> (nvptx_free): Likewise.
> (nvptx_host2dev): Remove.
> (nvptx_dev2host): Remove.
> (nvptx_set_async): Remove.
> (nvptx_async_test): Remove.
> (nvptx_async_test_all): Remove.
> (nvptx_wait): Remove.
> (nvptx_wait_async): Remove.
> (nvptx_wait_all): Remove.
> (nvptx_wait_all_async): Remove.
> (nvptx_get_cuda_stream): Remove.
> (nvptx_set_cuda_stream): Remove.
> (GOMP_OFFLOAD_alloc): Adjust code.
> (GOMP_OFFLOAD_free): Likewise.
> (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
> (GOMP_OFFLOAD_openacc_exec): Adjust parameters and code.
> (GOMP_OFFLOAD_openacc_async_test_all): Remove.
> (GOMP_OFFLOAD_openacc_async_wait): Remove.
> (GOMP_OFFLOAD_openacc_async_wait_async): Remove.
> (GOMP_OFFLOAD_openacc_async_wait_all): Remove.
> (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
> (GOMP_OFFLOAD_openacc_async_set_async): Remove.
> (cuda_free_argmem): New function.
> (GOMP_OFFLOAD_openacc_async_exec): New plugin hook function.
> (GOMP_OFFLOAD_openacc_create_thread_data): Adjust code.
> (GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code.
> (GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code.
> (GOMP_OFFLOAD_openacc_async_construct): New plugin hook function.
> (GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function.
> (GOMP_OFFLOAD_openacc_async_test): Remove and re-implement.
> (GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function.
> (GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function.
> (GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function.
> (cuda_callback_wrapper): New function.
> (cuda_memcpy_sanity_check): New function.
> (GOMP_OFFLOAD_host2dev): Remove and re-implement.
> (GOMP_OFFLOAD_dev2host): Remove and re-implement.
> (GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function.
> (GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function.


Re: [PATCH 2/4] Remove unused functions and fields.

2018-10-05 Thread Martin Liška
On 10/5/18 12:43 AM, Bernhard Reutner-Fischer wrote:
> Hi!
> 
> So I just added archive handling to ease looking at more than just the
> plain frontends, applied as r264856.

Running the tools for cc1 does not show anything. Please update the comment
in the script with example invocation.

> 
> You can now use the exact files passed to the driver when linking e.g. cc1.
> We link libcommon.a twice? Didn't look.
> 
> e.g.:
> me@there:.../gcc$ /scratch/src/gcc-trunk/contrib/unused_functions.py
> c/c-lang.o c-family/stub-objc.o attribs.o c/c-errors.o c/c-decl.o
> c/c-typeck.o c/c-convert.o c/c-aux-info.o c/c-objc-common.o
> c/c-parser.o c/c-fold.o c/gimple-parser.o c-family/c-common.o
> c-family/c-cppbuiltin.o c-family/c-dump.o c-family/c-format.o
> c-family/c-gimplify.o c-family/c-indentation.o c-family/c-lex.o
> c-family/c-omp.o c-family/c-opts.o c-family/c-pch.o
> c-family/c-ppoutput.o c-family/c-pragma.o c-family/c-pretty-print.o
> c-family/c-semantics.o c-family/c-ada-spec.o c-family/c-ubsan.o
> c-family/known-headers.o c-family/c-attribs.o c-family/c-warn.o
> c-family/c-spellcheck.o i386-c.o glibc-c.o   cc1-checksum.o
> libbackend.a main.o libcommon-target.a libcommon.a ../libcpp/libcpp.a
> ../libdecnumber/libdecnumber.a libcommon.a ../libcpp/libcpp.a
> ../libbacktrace/.libs/libbacktrace.a ../libiberty/libiberty.a
> ../libdecnumber/libdecnumber.a
> 
> results in the attached output.
> 
> This properly flags functions like e.g.: init_branch_prob (dead code),
> bitwise_mode_for_mode in stor-layout.c (should be static).
> 
> Of course it also complains about cases like supports_one_only() in cc1
> where that is only used in cc1plus.
> Likewise constant_pool_empty_p() on i386 which is only used by ppc and spe.
> 
> HTH,
> 

I'm now playing with an alternative approach. As mentioned, I run diagnostics
from rtags, and it looks like they added new functionality, --find-dead-functions,
that lists dead functions. It provides reasonable results; it also finds overloads
of functions and class member functions. Similarly, I suggested adding
--find-dead-variables:
https://github.com/Andersbakken/rtags/issues/1234
What's nice about it is that it can also find dead local variables, like:
gcc/tree-vect-loop.c:5875:34: widest_int ni, max_loop_value, lhs_max;
It's still WIP, needs to be done properly.

For the external symbols that can be turned into static, I created issue:
https://github.com/Andersbakken/rtags/issues/1235

I'm planning to prepare a patch that will remove the dead symbols.
Martin



Don't ICE on vectors of enums (PR 87286)

2018-10-05 Thread Richard Sandiford
We've traditionally allowed vectors of enums (not sure if that's
deliberate) but vector_types_compatible_elements_p checked for
INTEGER_TYPE rather than INTEGRAL_TYPE_P.

Tested on aarch64-linux-gnu.  OK to install?

Richard


2018-10-05  Richard Sandiford  

gcc/c-family/
PR c/87286
* c-common.c (vector_types_compatible_elements_p): Use
INTEGRAL_TYPE_P instead of checking only for INTEGER_TYPE.

gcc/testsuite/
PR c/87286
* gcc.dg/pr87286.c: New test.

Index: gcc/c-family/c-common.c
===
--- gcc/c-family/c-common.c 2018-10-05 13:46:08.28787 +0100
+++ gcc/c-family/c-common.c 2018-10-05 13:47:08.291325001 +0100
@@ -7465,8 +7465,11 @@ vector_types_compatible_elements_p (tree
 
   enum tree_code c1 = TREE_CODE (t1), c2 = TREE_CODE (t2);
 
-  gcc_assert ((c1 == INTEGER_TYPE || c1 == REAL_TYPE || c1 == FIXED_POINT_TYPE)
- && (c2 == INTEGER_TYPE || c2 == REAL_TYPE
+  gcc_assert ((INTEGRAL_TYPE_P (t1)
+  || c1 == REAL_TYPE
+  || c1 == FIXED_POINT_TYPE)
+ && (INTEGRAL_TYPE_P (t2)
+ || c2 == REAL_TYPE
  || c2 == FIXED_POINT_TYPE));
 
   t1 = c_common_signed_type (t1);
@@ -7476,7 +7479,7 @@ vector_types_compatible_elements_p (tree
   if (t1 == t2)
 return true;
   if (opaque && c1 == c2
-  && (c1 == INTEGER_TYPE || c1 == REAL_TYPE)
+  && (INTEGRAL_TYPE_P (t1) || c1 == REAL_TYPE)
   && TYPE_PRECISION (t1) == TYPE_PRECISION (t2))
 return true;
   return false;
Index: gcc/testsuite/gcc.dg/pr87286.c
===
--- /dev/null   2018-09-14 11:16:31.122530289 +0100
+++ gcc/testsuite/gcc.dg/pr87286.c  2018-10-05 13:47:08.291325001 +0100
@@ -0,0 +1,3 @@
+enum foo { F };
+typedef enum foo vec_foo __attribute__((vector_size (16)));
+vec_foo add (vec_foo x, vec_foo y) { return x + y; }


Re: [patch] nvptx libgcc atomic routines

2018-10-05 Thread Tom de Vries
On 9/26/18 8:33 PM, Cesar Philippidis wrote:
> This patch adds nvptx support for the atomic FETCH_AND_OP functions. I
> recall that this used to be important for OpenACC reductions back in the
> GCC 5.0 days before Nathan split reductions into four phases. Nowadays,
> atomic reductions use a spin lock that's implemented directly by the
> nvptx BE. Therefore, I'm not sure if the nvptx port still needs support
> for atomic fetch_and_*.
> 
> Tom and Thomas, do either of you have any thoughts on this? Should I
> commit it to trunk?

I'd say no. I can think of only one possible use for this, which is to
be able to use -fno-inline-atomics to work around problems in atomics in
ptx, and I think that that's not sufficiently valuable to start
maintaining these routines in trunk.

Thanks,
- Tom

> I bootstrapped and regtested it for x86_64 Linux
> with nvptx offloading.


[PATCH][GCC][DOC] Relocate list under Deprecated in options.texi to Var

2018-10-05 Thread Sam Tebbs
Hi all,

I recently found what seems to be an error in the options documentation
(gcc/doc/options.texi) where a list describing how _var_ is set (referring to
the Var attribute) is written beneath the _Deprecated_ attribute instead. This
patch moves it to the correct location.

gcc/doc
2018-05-10  Sam Tebbs  

* options.texi (Deprecated): Move list to Var section.
diff --git a/gcc/doc/options.texi b/gcc/doc/options.texi
index 
f887d16f88f8e22d280d0ab20a6fde05eb86e3d8..e618b9543511fa102a45c521fe6bd7759c73ef8d
 100644
--- a/gcc/doc/options.texi
+++ b/gcc/doc/options.texi
@@ -314,6 +314,15 @@ The way that the state is stored depends on the type of 
option:
 The option is deprecated and every usage of such option will
 result in a warning.
 
+@item Var(@var{var}, @var{set})
+The option controls an integer variable @var{var} and is active when
+@var{var} equals @var{set}.  The option parser will set @var{var} to
+@var{set} when the positive form of the option is used and @code{!@var{set}}
+when the ``no-'' form is used.
+
+@var{var} is declared in the same way as for the single-argument form
+described above.
+
 @itemize @bullet
 @item
 If the option uses the @code{Mask} or @code{InverseMask} properties,
@@ -351,15 +360,6 @@ and wasn't given.
 The option-processing script will usually zero-initialize @var{var}.
 You can modify this behavior using @code{Init}.
 
-@item Var(@var{var}, @var{set})
-The option controls an integer variable @var{var} and is active when
-@var{var} equals @var{set}.  The option parser will set @var{var} to
-@var{set} when the positive form of the option is used and @code{!@var{set}}
-when the ``no-'' form is used.
-
-@var{var} is declared in the same way as for the single-argument form
-described above.
-
 @item Init(@var{value})
 The variable specified by the @code{Var} property should be statically
 initialized to @var{value}.  If more than one option using the same


Re: [PATCH] Optimize sin(atan(x)), take 2

2018-10-05 Thread Giuliano Augusto Faulin Belinassi
Thank you for the review. I will address all these issues :-).

> Imagine a pause here while I lookup isolation of radicals  It's been
> a long time...   OK.  Sure.  I see what you're doing here...

Sorry, but I really did not understand your comment. Should I write a
shorter comment for that function?

> Not  sure what you mean for safety reasons.  The calculations to produce
> "c" then convert it into a REAL_VALUE_TYPE all make sense.  Just not
> sure what this line is really meant to do.

Imagine the following case:
Let "c" be the real constant such that it is certain that, for every
x > "c", 1/sqrt(x*x + 1) = 1.
Suppose now that our calculation leads us to a c' < "c" due to a minor
imprecision.
The logic here is that 10 * c' > "c" and everything will work, thus it is safer.
Note, however, that I cannot prove that 10 * c' > "c", but I would be
really surprised if this does not hold.



> A related remark would be: with the precision of double, for x>=cst (about
> 2^53), atan(x) is constant, within .5 ulp of pi/2 if the math library is
> super precise (which it probably isn't). Returning 0 for its cos (what
> happens if x*x gives +Inf) is thus completely fine unless you are using
> crlibm, but then you wouldn't use flag_unsafe_math_optimizations. The main
> issue is that if we have -ffast-math, we have -ffinite-math-only, and we
> are possibly introducing an infinity as intermediate result...

Thank you. This clarifies the need for a similar constant for the cos(atan(x)).
On Thu, Oct 4, 2018 at 9:36 AM Marc Glisse  wrote:
>
> On Wed, 3 Oct 2018, Jeff Law wrote:
>
> >> +/* Simplify cos(atan(x)) -> 1 / sqrt(x*x + 1). */
> >> + (for coss (COS)
> >> +  atans (ATAN)
> >> +  sqrts (SQRT)
> >> +  (simplify
> >> +   (coss (atans:s @0))
> >> +   (rdiv {build_one_cst (type);}
> >> +   (sqrts (plus (mult @0 @0) {build_one_cst (type);})
> > Don't we have the same kinds of issues with the x*x in here?  As X gets
> > large it will overflow, but the result is going to be approaching zero.
> > So we might be able to use a similar trick here.
>
> (I have not read the patch)
>
> The similar trick would say that for X large, this is the same as 1/abs(X)
> I guess. Note that it may be simpler to generate a call to hypot (C99).
>
> A related remark would be: with the precision of double, for x>=cst (about
> 2^53), atan(x) is constant, within .5 ulp of pi/2 if the math library is
> super precise (which it probably isn't). Returning 0 for its cos (what
> happens if x*x gives +Inf) is thus completely fine unless you are using
> crlibm, but then you wouldn't use flag_unsafe_math_optimizations. The main
> issue is that if we have -ffast-math, we have -ffinite-math-only, and we
> are possibly introducing an infinity as intermediate result...
>
> --
> Marc Glisse


Re: [PATCH][4/n] Remove BLOCK_ABSTRACT

2018-10-05 Thread Richard Biener
On Fri, 28 Sep 2018, Richard Biener wrote:

> 
> It turns out that nobody sets this anymore (dwarf2out did with the
> original code of outputting abstract instances, temporarily so IIRC).
> 
> Bootstrap and regtest running on x86_64-unknown-linux-gnu.
> 
> Any objection to purge it completely like this?

It's gone now (r264868).

> DECL_ABSTRACT_P is a similar beast but I see the C++ FE still sets it
> on ctors-in-charge (or so).  Is this pure FE use or does the middle-end
> really need to care about that?  I'd possibly turn each DECL_ABSTRACT_P
> use in the middle-end into an assert but I wonder if you have any
> thoughts about this.

That question still stands.

Richard.

> Thanks,
> Richard.
> 
> 2018-09-28  Richard Biener  
> 
>   * tree-core.h (tree_block::abstract_flag): Remove.
>   (tree_block::block_num): Make full 32bits.
>   * tree.def (BLOCK): Remove docs about BLOCK_ABSTRACT.
>   * tree.h (BLOCK_ABSTRACT): Remove.
>   * dwarf2out.c (gen_lexical_block_die): Remove dead code
>   resulting from BLOCK_ABSTRACT being always false.
>   (gen_inlined_subroutine_die): Likewise.
>   (gen_block_die): Likewise.
>   * tree.c (block_ultimate_origin): Likewise.
>   * tree-pretty-print.c (dump_block_node): Remove code dealing
>   with BLOCK_ABSTRACT.
>   * tree-ssa-live.c (dump_scope_block): Likewise.
>   * tree-streamer-in.c (unpack_ts_block_value_fields): Likewise.
>   * tree-streamer-out.c (pack_ts_block_value_fields): Likewise.
> 
> Index: gcc/dwarf2out.c
> ===
> --- gcc/dwarf2out.c   (revision 264689)
> +++ gcc/dwarf2out.c   (working copy)
> @@ -24071,25 +24071,7 @@ gen_lexical_block_die (tree stmt, dw_die
>equate_block_to_die (stmt, stmt_die);
>  }
>  
> -  if (BLOCK_ABSTRACT (stmt))
> -{
> -  if (old_die)
> - {
> -   /* This must have been generated early and it won't even
> -  need location information since it's a DW_AT_inline
> -  function.  */
> -   if (flag_checking)
> - for (dw_die_ref c = context_die; c; c = c->die_parent)
> -   if (c->die_tag == DW_TAG_inlined_subroutine
> -   || c->die_tag == DW_TAG_subprogram)
> - {
> -   gcc_assert (get_AT (c, DW_AT_inline));
> -   break;
> - }
> -   return;
> - }
> -}
> -  else if (BLOCK_ABSTRACT_ORIGIN (stmt))
> +  if (BLOCK_ABSTRACT_ORIGIN (stmt))
>  {
>/* If this is an inlined or conrecte instance, create a new lexical
>die for anything below to attach DW_AT_abstract_origin to.  */
> @@ -24109,7 +24091,7 @@ gen_lexical_block_die (tree stmt, dw_die
>/* A non abstract block whose blocks have already been reordered
>   should have the instruction range for this block.  If so, set the
>   high/low attributes.  */
> -  if (!early_dwarf && !BLOCK_ABSTRACT (stmt) && TREE_ASM_WRITTEN (stmt))
> +  if (!early_dwarf && TREE_ASM_WRITTEN (stmt))
>  {
>gcc_assert (stmt_die);
>add_high_low_attributes (stmt, stmt_die);
> @@ -24123,48 +24105,38 @@ gen_lexical_block_die (tree stmt, dw_die
>  static void
>  gen_inlined_subroutine_die (tree stmt, dw_die_ref context_die)
>  {
> -  tree decl;
> -
> -  /* The instance of function that is effectively being inlined shall not
> - be abstract.  */
> -  gcc_assert (! BLOCK_ABSTRACT (stmt));
> -
> -  decl = block_ultimate_origin (stmt);
> +  tree decl = block_ultimate_origin (stmt);
>  
>/* Make sure any inlined functions are known to be inlineable.  */
>gcc_checking_assert (DECL_ABSTRACT_P (decl)
>  || cgraph_function_possibly_inlined_p (decl));
>  
> -  if (! BLOCK_ABSTRACT (stmt))
> -{
> -  dw_die_ref subr_die
> - = new_die (DW_TAG_inlined_subroutine, context_die, stmt);
> +  dw_die_ref subr_die = new_die (DW_TAG_inlined_subroutine, context_die, 
> stmt);
>  
> -  if (call_arg_locations || debug_inline_points)
> - equate_block_to_die (stmt, subr_die);
> -  add_abstract_origin_attribute (subr_die, decl);
> -  if (TREE_ASM_WRITTEN (stmt))
> -add_high_low_attributes (stmt, subr_die);
> -  add_call_src_coords_attributes (stmt, subr_die);
> -
> -  /* The inliner creates an extra BLOCK for the parameter setup,
> - we want to merge that with the actual outermost BLOCK of the
> -  inlined function to avoid duplicate locals in consumers.
> -  Do that by doing the recursion to subblocks on the single subblock
> -  of STMT.  */
> -  bool unwrap_one = false;
> -  if (BLOCK_SUBBLOCKS (stmt) && !BLOCK_CHAIN (BLOCK_SUBBLOCKS (stmt)))
> - {
> -   tree origin = block_ultimate_origin (BLOCK_SUBBLOCKS (stmt));
> -   if (origin
> -   && TREE_CODE (origin) == BLOCK
> -   && BLOCK_SUPERCONTEXT (origin) == decl)
> - unwrap_one = true;
> - }
> -  decls_for_scope (stmt, subr_die, !unwrap_one);
> -  if (u

Re: [PATCH] i386: Use TImode for BLKmode values in 2 integer registers

2018-10-05 Thread H.J. Lu
On Sat, Sep 29, 2018 at 11:02 AM Uros Bizjak  wrote:
>
> On Sat, Sep 29, 2018 at 6:36 PM H.J. Lu  wrote:
> >
> > When passing and returning BLKmode values in 2 integer registers, use
> > 1 TImode register instead of 2 DImode registers. Otherwise, V1TImode
> > may be used to move and store such BLKmode values, which prevent RTL
> > optimizations.
> >
> > Tested on x86-64.  OK for trunk?
> >
> > Thanks.
> >
> > H.J.
> > ---
> > gcc/
> >
> > PR target/87370
> > * config/i386/i386.c (construct_container): Use TImode for
> > BLKmode values in 2 integer registers.
> >
> > gcc/testsuite/
> >
> > PR target/87370
> > * gcc.target/i386/pr87370.c: New test.
>
> OK.

I'd like to backport it to release branches.  Is that OK?

Thanks.

H.J.
> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386.c  | 17 +--
> >  gcc/testsuite/gcc.target/i386/pr87370.c | 39 +
> >  2 files changed, 54 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr87370.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index 176cce521b7..54752513076 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -7914,9 +7914,22 @@ construct_container (machine_mode mode, machine_mode 
> > orig_mode,
> >if (n == 2
> >&& regclass[0] == X86_64_INTEGER_CLASS
> >&& regclass[1] == X86_64_INTEGER_CLASS
> > -  && (mode == CDImode || mode == TImode)
> > +  && (mode == CDImode || mode == TImode || mode == BLKmode)
> >&& intreg[0] + 1 == intreg[1])
> > -return gen_rtx_REG (mode, intreg[0]);
> > +{
> > +  if (mode == BLKmode)
> > +   {
> > + /* Use TImode for BLKmode values in 2 integer registers.  */
> > + exp[0] = gen_rtx_EXPR_LIST (VOIDmode,
> > + gen_rtx_REG (TImode, intreg[0]),
> > + GEN_INT (0));
> > + ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1));
> > + XVECEXP (ret, 0, 0) = exp[0];
> > + return ret;
> > +   }
> > +  else
> > +   return gen_rtx_REG (mode, intreg[0]);
> > +}
> >
> >/* Otherwise figure out the entries of the PARALLEL.  */
> >for (i = 0; i < n; i++)
> > diff --git a/gcc/testsuite/gcc.target/i386/pr87370.c 
> > b/gcc/testsuite/gcc.target/i386/pr87370.c
> > new file mode 100644
> > index 000..c7b6295a33b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr87370.c
> > @@ -0,0 +1,39 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2" } */
> > +
> > +struct A
> > +{
> > +  int b[4];
> > +};
> > +struct B
> > +{
> > +  char a[12];
> > +  int b;
> > +};
> > +struct C
> > +{
> > +  char a[16];
> > +};
> > +
> > +struct A
> > +f1 (void)
> > +{
> > +  struct A x = {};
> > +  return x;
> > +}
> > +
> > +struct B
> > +f2 (void)
> > +{
> > +  struct B x = {};
> > +  return x;
> > +}
> > +
> > +struct C
> > +f3 (void)
> > +{
> > +  struct C x = {};
> > +  return x;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > --
> > 2.17.1
> >


Re: [PATCH][i386] Fix scalar costing in ix86_add_stmt_cost

2018-10-05 Thread Jan Hubicka
> 
> The following fixes bogus differences in scalar iteration cost
> computed by the vectorizer when comparing 128 and 256 bit vectorizations.
> This was caused by the hook looking at the vector types mode even
> for kind == scalar_stmt and thus returning vector operation costs
> for things like add or negate.
> 
> This is most noticeable on targets where ix86_vec_cost applies
> multiplication based on vector size (TARGET_AVX128_OPTIMAL, thus
> Zen and Bulldozer).  But it will also adjust the actual costs
> everywhere where scalar and vector costs diverge.
> 
> The adjustments done for Silvermont also look broken since they
> are applied to both scalar and vector cost which makes it mostly
> a no-op.  The patch adjusts it to only apply for vector costing
> but of course I didn't benchmark this and the magic number of 1.7
> may not make sense after this fix so I'm happy to leave that
> out - Kirill?  Should I just go ahead with that for trunk (we can
> revert or adjust if autotesters pick up regressions on your side)?
> 
> Bootstrap & regtest running on x86_64-unknown-linux-gnu, OK?
> 
> Richard.
> 
> 2018-10-04   Richard Biener  
> 
>   * config/i386/i386.c (ix86_add_stmt_cost): When scalar cost
>   is asked for initialize mode to the component mode of the
>   vector type.  Make sure Bonnel and esp. other Atom cost
>   adjustments are not done for scalar cost estimates.
OK,
thanks!
> 
> Index: gcc/config/i386/i386.c
> ===
> --- gcc/config/i386/i386.c(revision 264837)
> +++ gcc/config/i386/i386.c(working copy)
> @@ -49486,17 +49486,21 @@ ix86_add_stmt_cost (void *data, int coun
>  {
>unsigned *cost = (unsigned *) data;
>unsigned retval = 0;
> +  bool scalar_p
> += (kind == scalar_stmt || kind == scalar_load || kind == scalar_store);
>  
>tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
>int stmt_cost = - 1;
>  
>bool fp = false;
> -  machine_mode mode = TImode;
> +  machine_mode mode = scalar_p ? SImode : TImode;
>  
>if (vectype != NULL)
>  {
>fp = FLOAT_TYPE_P (vectype);
>mode = TYPE_MODE (vectype);
> +  if (scalar_p)
> + mode = TYPE_MODE (TREE_TYPE (vectype));
>  }
>  
>if ((kind == vector_stmt || kind == scalar_stmt)
> @@ -49632,7 +49636,7 @@ ix86_add_stmt_cost (void *data, int coun
>  stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
>  
>/* Penalize DFmode vector operations for Bonnell.  */
> -  if (TARGET_BONNELL && kind == vector_stmt
> +  if (TARGET_BONNELL && !scalar_p
>&& vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
>  stmt_cost *= 5;  /* FIXME: The value here is arbitrary.  */
>  
> @@ -49648,7 +49652,8 @@ ix86_add_stmt_cost (void *data, int coun
>   for Silvermont as it has out of order integer pipeline and can execute
>   2 scalar instruction per tick, but has in order SIMD pipeline.  */
>if ((TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS
> -   || TARGET_TREMONT || TARGET_INTEL) && stmt_info && stmt_info->stmt)
> +   || TARGET_TREMONT || TARGET_INTEL)
> +  && !scalar_p && stmt_info && stmt_info->stmt)
>  {
>tree lhs_op = gimple_get_lhs (stmt_info->stmt);
>if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)


Re: [Patch, Aarch64] Fix testsuite regressions related to PR tree-optimization/71625

2018-10-05 Thread Richard Earnshaw (lists)
On 26/09/18 17:21, Steve Ellcey wrote:
> A patch for PR tree-optimization/71625 caused regressions in the
> gcc.target/aarch64/vclz.c and gcc.target/aarch64/vneg_s.c tests
> because a couple of routines that were not getting inlined before
> started getting inlined.  The inlining is not a bug, the 
> generated code is now smaller so some functions that were previously
> not being inlined due to being too large are now getting inlined.
> Because we also generate out-of-line code the scan-assembler-times
> checks are failing.  Since inlining or not inlining is not the 
> point of this test I added the noinline attribute to all the test_*
> functions and this fixed the test regressions.
> 
> Tested on aarch64, OK for checkin?
> 
> Steve Ellcey
> sell...@cavium.com
> 
> 
> 2018-09-26  Steve Ellcey  
> 
>   PR tree-optimization/71625
>   * /gcc.target/aarch64/vclz.c (test_vclz_s8): Add noinline attribute.
>   (test_vclz_s16): Ditto.
>   (test_vclz_s32): Ditto.
>   (test_vclzq_s8): Ditto.
>   (test_vclzq_s16): Ditto.
>   (test_vclzq_s32): Ditto.
>   (test_vclz_u8): Ditto.
>   (test_vclz_u16): Ditto.
>   (test_vclz_u32): Ditto.
>   (test_vclzq_u8): Ditto.
>   (test_vclzq_u16): Ditto.
>   (test_vclzq_u32): Ditto.
>   * gcc.target/aarch64/vneg_s.c (test_vneg_s8): Ditto.
>   (test_vneg_s16): Ditto.
>   (test_vneg_s32): Ditto.
>   (test_vneg_s64): Ditto.
>   (test_vnegd_s64): Ditto.
>   (test_vnegq_s8): Ditto.
>   (test_vnegq_s16): Ditto.
>   (test_vnegq_s32): Ditto.
>   (test_vnegq_s64): Ditto.
> 
> 

OK.

R.

> diff --git a/gcc/testsuite/gcc.target/aarch64/vclz.c 
> b/gcc/testsuite/gcc.target/aarch64/vclz.c
> index 60494a8..a36ee44 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vclz.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vclz.c
> @@ -75,7 +75,7 @@ extern void abort (void);
>  if (a [i] != b [i])  
> \
>    return 1;
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclz_s8 ()
>  {
>    int i;
> @@ -107,7 +107,7 @@ test_vclz_s8 ()
>  /* Double scan-assembler-times to take account of unsigned functions.  */
>  /* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.8b, v\[0-9\]+\.8b" 4 
> } } */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclz_s16 ()
>  {
>    int i;
> @@ -138,7 +138,7 @@ test_vclz_s16 ()
>  /* Double scan-assembler-times to take account of unsigned functions.  */
>  /* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.4h, v\[0-9\]+\.4h" 
> 10} } */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclz_s32 ()
>  {
>    int i;
> @@ -205,7 +205,7 @@ test_vclz_s32 ()
>  /* Double scan-assembler-times to take account of unsigned functions.  */
>  /* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" 34 
> } } */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclzq_s8 ()
>  {
>    int i;
> @@ -226,7 +226,7 @@ test_vclzq_s8 ()
>  /* Double scan-assembler-times to take account of unsigned functions.  */
>  /* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b" 
> 2 } } */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclzq_s16 ()
>  {
>    int i;
> @@ -262,7 +262,7 @@ test_vclzq_s16 ()
>  /* Double scan-assembler-times to take account of unsigned functions.  */
>  /* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h" 6 
> } } */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclzq_s32 ()
>  {
>    int i;
> @@ -303,7 +303,7 @@ test_vclzq_s32 ()
>  
>  /* Unsigned versions.  */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclz_u8 ()
>  {
>    int i;
> @@ -331,7 +331,7 @@ test_vclz_u8 ()
>  
>  /* ASM scan near test for signed version.  */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclz_u16 ()
>  {
>    int i;
> @@ -361,7 +361,7 @@ test_vclz_u16 ()
>  
>  /* ASM scan near test for signed version.  */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclz_u32 ()
>  {
>    int i;
> @@ -427,7 +427,7 @@ test_vclz_u32 ()
>  
>  /* ASM scan near test for signed version.  */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclzq_u8 ()
>  {
>    int i;
> @@ -448,7 +448,7 @@ test_vclzq_u8 ()
>  
>  /* ASM scan near test for signed version.  */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclzq_u16 ()
>  {
>    int i;
> @@ -485,7 +485,7 @@ test_vclzq_u16 ()
>  
>  /* ASM scan near test for signed version.  */
>  
> -int
> +int __attribute__ ((noinline))
>  test_vclzq_u32 ()
>  {
>    int i;
> diff --git a/gcc/testsuite/gcc.target/aarch64/vneg_s.c 
> b/gcc/testsuite/gcc.target/aarch64/vneg_s.c
> index e7f20f2..6947526 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vneg_s.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vneg_s.c
> @@ -87,7 +87,7 @@ extern void abort (void);
>  force_simd (res); \
>    }
>  
> -int
> +int __attribute__ ((noinline))
>  test_vneg_s8 ()
>  {
>    int8x8_t a;
> @@ -107,7 +107,7 @@ test_vn

[PATCH] Fix PR63155 (again)

2018-10-05 Thread Richard Biener


This fixes yet another bottleneck in the SSA propagator where the way
we process the worklists (in particular the BB one) causes excessive
re-processing of PHI nodes.  The following patch prioritizes forward
progress over iteration as that ensures the maximum set of possible
backedges is executable when re-processing PHIs.  Implementation-wise
this is done by using two worklists each for BBs and SSA edges,
making sure to not go back in the RPO iteration.

This improves compile-time for the new "Small testcase more
similar to original environment" from 197s to 27s.

Not first iterating SSA cycles before processing uses might end up
processing more stmts but for the testcase in question going
back to processing SSA cycles first gets compile-time back up to 35s.
That is likely because doing that isn't the ideal way of iterating
over SSA (which would be processing SCCs...).  Other testcases
might not be so happy about this specific change, if there are any
such ones we might want to consider SCC based iteration for the
SSA propagator...

Note that CCP is still the most expensive pass at -O1 for the
testcase, even after the patch but backprop follows close behind.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Note GCC 4.8 compiled this in less than 1s, the abnormal edges
we generate for setjmp really hurt (but it's the testcase's fault
to implement a try/catch with setjmp inside a state-machine...
which makes me wonder how a C++ variant with EH would perform)

Richard.

>From 80083abe998e0f75782d206ceda72de88fcf0563 Mon Sep 17 00:00:00 2001
From: Richard Guenther 
Date: Fri, 5 Oct 2018 12:31:44 +0200
Subject: [PATCH] fix-pr63155-8

2018-10-05  Richard Biener  

PR tree-optimization/63155
* tree-ssa-ccp.c (ccp_propagate::visit_phi): Avoid excess
vertical space in dumpfiles.
* tree-ssa-propagate.h
(ssa_propagation_engine::process_ssa_edge_worklist): Remove.
* tree-ssa-propagate.c (cfg_blocks_back): New global.
(ssa_edge_worklist_back): Likewise.
(curr_order): Likewise.
(cfg_blocks_get): Remove abstraction.
(cfg_blocks_add): Likewise.
(cfg_blocks_empty_p): Likewise.
(add_ssa_edge): Add to current or next worklist based on
RPO index.
(add_control_edge): Likewise.
(ssa_propagation_engine::process_ssa_edge_worklist): Fold
into ...
(ssa_propagation_engine::ssa_propagate): ... here.  Unify
iteration from CFG and SSA edge worklist so we process
everything in RPO order, prioritizing forward progress
over iteration.
(ssa_prop_init): Allocate new worklists, do not dump
immediate uses.
(ssa_prop_fini): Free new worklists.

diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c
index 95368a5c79d..d8a069be529 100644
--- a/gcc/tree-ssa-ccp.c
+++ b/gcc/tree-ssa-ccp.c
@@ -1119,7 +1119,7 @@ ccp_propagate::visit_phi (gphi *phi)
   if (dump_file && (dump_flags & TDF_DETAILS))
{
  fprintf (dump_file,
- "\nArgument #%d (%d -> %d %sexecutable)\n",
+ "\tArgument #%d (%d -> %d %sexecutable)\n",
  i, e->src->index, e->dest->index,
  (e->flags & EDGE_EXECUTABLE) ? "" : "not ");
}
diff --git a/gcc/tree-ssa-propagate.c b/gcc/tree-ssa-propagate.c
index 140b153d5a1..4cb0fbaed15 100644
--- a/gcc/tree-ssa-propagate.c
+++ b/gcc/tree-ssa-propagate.c
@@ -108,51 +108,26 @@
  [3] Advanced Compiler Design and Implementation,
 Steven Muchnick, Morgan Kaufmann, 1997, Section 12.6  */
 
-/* Worklist of control flow edge destinations.  This contains
+/* Worklists of control flow edge destinations.  This contains
the CFG order number of the blocks so we can iterate in CFG
-   order by visiting in bit-order.  */
+   order by visiting in bit-order.  We use two worklists to
+   first make forward progress before iterating.  */
 static bitmap cfg_blocks;
+static bitmap cfg_blocks_back;
 static int *bb_to_cfg_order;
 static int *cfg_order_to_bb;
 
-/* Worklist of SSA edges which will need reexamination as their
+/* Worklists of SSA edges which will need reexamination as their
definition has changed.  SSA edges are def-use edges in the SSA
web.  For each D-U edge, we store the target statement or PHI node
-   UID in a bitmap.  UIDs order stmts in execution order.   */
+   UID in a bitmap.  UIDs order stmts in execution order.  We use
+   two worklists to first make forward progress before iterating.  */
 static bitmap ssa_edge_worklist;
+static bitmap ssa_edge_worklist_back;
 static vec uid_to_stmt;
 
-/* Return true if the block worklist empty.  */
-
-static inline bool
-cfg_blocks_empty_p (void)
-{
-  return bitmap_empty_p (cfg_blocks);
-}
-
-
-/* Add a basic block to the worklist.  The block must not be the ENTRY
-   or EXIT block.  */
-
-static void
-cfg_blocks_add (basic_block bb)
-{
-  gcc_assert (bb != ENTRY_BLOCK_PTR_FOR_FN (cfun)
- && bb != EX

[PATCH] rs6000: Various fixes for the new fpscr builtins (PR87509)

2018-10-05 Thread Segher Boessenkool
With these fixes all testcases test clean for me, both on
powerpc64-linux {-m32,-m64} and on powerpc64le-linux, with all
relevant -mcpu= settings.

Committing to trunk.

The generated code (esp. for -m32) isn't optimal yet, but if we do
not allow SImode in FPRs this isn't easy to solve.  So first this,
correctness, and optimisation can follow later :-)


2018-10-05  Segher Boessenkool  

PR target/87509
* config/rs6000/rs6000-builtin.def (RS6000_BUILTIN_SET_FPSCR_DRN): Use
RS6000_BTM_DFP.
* config/rs6000/rs6000.md (rs6000_set_fpscr_rn): Require the operand
to be DImode.  When using mffscrn, force the operand to a register.

gcc/testsuite/
PR target/87509
* gcc.target/powerpc/test_fpscr_drn_builtin.c: Use hard_dfp instead
of dfp_hw.  Don't include .
* gcc.target/powerpc/test_fpscr_drn_builtin_error.c: Ditto.  Require
lp64.
* gcc.target/powerpc/test_fpscr_rn_builtin.c: Don't include .
* gcc.target/powerpc/test_mffsl.c: Ditto.

---
 gcc/config/rs6000/rs6000-builtin.def| 2 +-
 gcc/config/rs6000/rs6000.md | 7 +++
 gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin.c   | 4 +---
 gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin_error.c | 6 ++
 gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin.c| 2 --
 gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin_error.c  | 2 --
 gcc/testsuite/gcc.target/powerpc/test_mffsl.c   | 2 --
 7 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-builtin.def 
b/gcc/config/rs6000/rs6000-builtin.def
index 976c36b..ec0528a 100644
--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -2510,7 +2510,7 @@ RS6000_BUILTIN_X (RS6000_BUILTIN_SET_FPSCR_RN, 
"__builtin_set_fpscr_rn",
  CODE_FOR_rs6000_set_fpscr_rn)
 
 RS6000_BUILTIN_X (RS6000_BUILTIN_SET_FPSCR_DRN, "__builtin_set_fpscr_drn",
- RS6000_BTM_ALWAYS,
+ RS6000_BTM_DFP,
  RS6000_BTC_MISC | RS6000_BTM_64BIT | RS6000_BTC_UNARY
  | RS6000_BTC_VOID,
  CODE_FOR_rs6000_set_fpscr_drn)
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index ad78f2d..ed09211 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -5849,7 +5849,7 @@ (define_insn "rs6000_mffscdrn"
   [(set_attr "type" "fp")])
 
 (define_expand "rs6000_set_fpscr_rn"
- [(match_operand 0 "reg_or_cint_operand")]
+ [(match_operand:DI 0 "reg_or_cint_operand")]
   "TARGET_HARD_FLOAT"
 {
   rtx tmp_df = gen_reg_rtx (DFmode);
@@ -5858,9 +5858,8 @@ (define_expand "rs6000_set_fpscr_rn"
  new rounding mode bits from operands[0][62:63] into FPSCR[62:63].  */
   if (TARGET_P9_MISC)
 {
-  rtx src_df = gen_reg_rtx (DImode);
-
-  src_df = simplify_gen_subreg (DFmode, operands[0], DImode, 0);
+  rtx src_df = force_reg (DImode, operands[0]);
+  src_df = simplify_gen_subreg (DFmode, src_df, DImode, 0);
   emit_insn (gen_rs6000_mffscrn (tmp_df, src_df));
   DONE;
 }
diff --git a/gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin.c 
b/gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin.c
index 685bf97..00b11f8 100644
--- a/gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin.c
+++ b/gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin.c
@@ -1,9 +1,7 @@
 /* { dg-do run { target { powerpc*-*-* &&  lp64 } } } */
-/* { dg-require-effective-target dfp_hw } */
+/* { dg-require-effective-target hard_dfp } */
 /* { dg-options "-O2 -std=c99" } */
 
-#include 
-
 #ifdef DEBUG
 #include 
 #endif
diff --git a/gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin_error.c 
b/gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin_error.c
index 58453f0..028ab0b 100644
--- a/gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin_error.c
+++ b/gcc/testsuite/gcc.target/powerpc/test_fpscr_drn_builtin_error.c
@@ -1,9 +1,7 @@
-/* { dg-do compile { target powerpc*-*-* } } */
-/* { dg-require-effective-target dfp_hw } */
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-require-effective-target hard_dfp } */
 /* { dg-options "-O2 -std=c99" } */
 
-#include 
-
 int main ()
 {
 
diff --git a/gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin.c 
b/gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin.c
index be05dcf..0d0d3f0 100644
--- a/gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin.c
+++ b/gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin.c
@@ -1,8 +1,6 @@
 /* { dg-do run { target { powerpc*-*-* } } } */
 /* { dg-options "-O2 -std=c99" } */
 
-#include 
-
 #ifdef DEBUG
 #include 
 #endif
diff --git a/gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin_error.c 
b/gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin_error.c
index a529ce6..aea6509 100644
--- a/gcc/testsuite/gcc.target/powerpc/test_fpscr_rn_builtin_e

Re: [PATCH][RFC][i386] Change sminmax reduction patterns

2018-10-05 Thread Uros Bizjak
On Thu, Oct 4, 2018 at 2:05 PM Richard Biener  wrote:
>
>
> This tries to apply the same trick to sminmax reduction patterns
> as for the reduc_plus_scal ones, namely reduce %zmm -> %ymm -> %xmm
> first.  On a microbenchmark this improves performance on Zen
> by ~30% for AVX2 and on Skylake-SP by ~10% for AVX512 (for AVX2
> there's no measurable difference).
>
> I guess I'm mostly looking for feedback on the approach I took
> in not rewriting ix86_expand_reduc but instead "recurse" on the
> expanders as well as the need to define recursion stops for
> SSE modes previously not covered.
>
> I'll throw this on a bootstrap & regtest on x86_64-unknown-linux-gnu
> later.
>
> Any comments so far?  Writing .md patterns is new for me ;)

LGTM for the implementation.

Uros.

> Thanks,
> Richard.
>
> 2018-10-04  Richard Biener  
>
> * config/i386/sse.md (reduc_<code>_scal_<mode>): Split
> into part reducing to half width and recursing and
> SSE2 vector variant doing the final reduction with
> ix86_expand_reduc.
>
> Index: gcc/config/i386/sse.md
> ===
> --- gcc/config/i386/sse.md  (revision 264837)
> +++ gcc/config/i386/sse.md  (working copy)
> @@ -2544,11 +2544,29 @@ (define_expand "reduc_plus_scal_v4sf"
>  })
>
>  ;; Modes handled by reduc_sm{in,ax}* patterns.
> +(define_mode_iterator REDUC_SSE_SMINMAX_MODE
> +  [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE")
> +   (V2DI "TARGET_SSE") (V4SI "TARGET_SSE") (V8HI "TARGET_SSE")
> +   (V16QI "TARGET_SSE")])
> +
> +(define_expand "reduc__scal_"
> +  [(smaxmin:REDUC_SSE_SMINMAX_MODE
> + (match_operand: 0 "register_operand")
> + (match_operand:REDUC_SSE_SMINMAX_MODE 1 "register_operand"))]
> +  ""
> +{
> +  rtx tmp = gen_reg_rtx (mode);
> +  ix86_expand_reduc (gen_3, tmp, operands[1]);
> +  emit_insn (gen_vec_extract (operands[0], tmp,
> +   const0_rtx));
> +  DONE;
> +})
> +
>  (define_mode_iterator REDUC_SMINMAX_MODE
>[(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
> (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
> (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
> -   (V4SF "TARGET_SSE") (V64QI "TARGET_AVX512BW")
> +   (V64QI "TARGET_AVX512BW")
> (V32HI "TARGET_AVX512BW") (V16SI "TARGET_AVX512F")
> (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
> (V8DF "TARGET_AVX512F")])
> @@ -2559,10 +2577,12 @@ (define_expand "reduc__scal_
>   (match_operand:REDUC_SMINMAX_MODE 1 "register_operand"))]
>""
>  {
> -  rtx tmp = gen_reg_rtx (mode);
> -  ix86_expand_reduc (gen_3, tmp, operands[1]);
> -  emit_insn (gen_vec_extract (operands[0], tmp,
> -   const0_rtx));
> +  rtx tmp = gen_reg_rtx (mode);
> +  emit_insn (gen_vec_extract_hi_ (tmp, operands[1]));
> +  rtx tmp2 = gen_reg_rtx (mode);
> +  emit_insn (gen_3
> +(tmp2, tmp, gen_lowpart (mode, operands[1])));
> +  emit_insn (gen_reduc__scal_ (operands[0], 
> tmp2));
>DONE;
>  })
>


Re: [patch] tighten toplevel guard on ibm-ldouble.c

2018-10-05 Thread Segher Boessenkool
Hi Olivier,

On Wed, Oct 03, 2018 at 09:35:35PM +0200, Olivier Hainque wrote:
> With a forthcoming micro cleanup patch to the 32bits powerpc-vxworks
> port (arranging to define __powerpc__ in addition to a few others), 
> the port fails to build with:
> 
>   In file included from 
> ../../../../src/libgcc/config/rs6000/ibm-ldouble.c:384:
>   ../../../../src/libgcc/soft-fp/quad.h:72:1: error: unable to emulate 'TF'
> 
> The port doesn't support 128bit floats so the attempt to compile
> ibm-ldouble.c is surprising.
> 
> The whole source is guarded with a number of conditions already:
> 
> #if (defined (__MACH__) || defined (__powerpc__) || defined (_AIX)) \
>   && !defined (__rtems__)
> 
> The file starts with:
> 
> /* 128-bit long double support routines for Darwin.
> 
> so presumably none of this is needed when the target
> doesn't have support for 128bit FP types at all.
> 
> rs6000_init_libfuncs tells us when we get to initialize
> float128 libcalls:
> 
> rs6000_init_libfuncs (void)
> {
> /* __float128 support.  */
> if (TARGET_FLOAT128_TYPE)
>   ...
> 
> /* AIX/Darwin/64-bit Linux quad floating point routines.  */
> if (TARGET_LONG_DOUBLE_128)
>   ...
> }
> 
> Then rs6000_cpu_cpp_builtins tells what macros get
> defined when the two target attributes above are true:
> 
> if (TARGET_FLOAT128_TYPE)
>   builtin_define ("__FLOAT128_TYPE__");
> 
> if (TARGET_LONG_DOUBLE_128)
>   {
> builtin_define ("__LONG_DOUBLE_128__");
> 
> This suggests the attached patchlet, which cures the VxWorks
> build issue.
> 
> I'd appreciate feedback on the idea before going on with
> further testing deemed appropriate.
> 
> How does that look ?

I think it looks fine.  Okay for trunk after some testing.

(Please don't use application/octet-stream for patches btw).


Segher


Re: [PATCH, rs6000] 2/2 Add x86 SSE3 intrinsics to GCC PPC64LE target

2018-10-05 Thread Segher Boessenkool
Hi!

On Tue, Oct 02, 2018 at 09:12:07AM -0500, Paul Clarke wrote:
> This is part 2/2 for contributing PPC64LE support for X86 SSE3
> intrinsics. This patch includes testsuite/gcc.target tests for the
> intrinsics defined in pmmintrin.h. 
> 
> Tested on POWER8 ppc64le and ppc64 (-m64 and -m32, the latter only reporting
> 10 new unsupported tests.)
> 
> [gcc/testsuite]
> 
> 2018-10-01  Paul A. Clarke  
> 
>   * sse3-check.h: New file.
>   * sse3-addsubps.h: New file.
>   * sse3-addsubpd.h: New file.
>   * sse3-haddps.h: New file.
>   * sse3-hsubps.h: New file.
>   * sse3-haddpd.h: New file.
>   * sse3-hsubpd.h: New file.
>   * sse3-lddqu.h: New file.
>   * sse3-movsldup.h: New file.
>   * sse3-movshdup.h: New file.
>   * sse3-movddup.h: New file.

All these entries should have gcc.target/powerpc/ in the file name.

> --- gcc/testsuite/gcc.target/powerpc/pr37191.c(nonexistent)
> +++ gcc/testsuite/gcc.target/powerpc/pr37191.c(working copy)

You need to mention this file in the changelog, too.

> @@ -0,0 +1,49 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
> +/* { dg-options "-O3 -mdirect-move" } */

-mdirect-move is deprecated and doesn't do anything.  You want -mcpu=power8
if you want to enable power8 instructions.  (Or -mpower8-vector also works,
for the time being anyway, but it is not preferred).

Have you tested this with -mcpu= an older cpu?  Did that work?  (It won't
_do_ much of course, but are there extra unexpected errors, etc.)

> +/* { dg-require-effective-target lp64 } */

Do these tests actually need this?  For what, then?


Segher


Re: [PATCH, rs6000] 1/2 Add x86 SSE3 intrinsics to GCC PPC64LE target

2018-10-05 Thread Segher Boessenkool
Hi Paul,

On Tue, Oct 02, 2018 at 09:11:57AM -0500, Paul Clarke wrote:
> This is a follow-on to earlier commits for adding compatibility
> implementations of x86 intrinsics for PPC64LE.  This is the first of
> two patches.  This patch adds 11 of the 13 x86 intrinsics from
> <pmmintrin.h> ("SSE3").  (Patch 2/2 adds tests for these intrinsics,
> and briefly describes the tests performed.)
> 
> Implementations are relatively straightforward, with occasional
> extra effort for vector element ordering.

Looks fine.

> Not implemented are _mm_mwait and _mm_monitor, as there are no
> direct or trivial analogs in the POWER ISA.


>   * config.gcc (powerpc*-*-*): Add pmmintrin.h.

"... to extra_headers"?


> +#if 0
> +/* POWER8 / POWER9 have no equivalent.  */
> +
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
> +_mm_monitor (void const * __P, unsigned int __E, unsigned int __H)
> +{
> +  __builtin_ia32_monitor (__P, __E, __H);
> +#error "_mm_monitor is not supported on this architecture."
> +}
> +
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
> +_mm_mwait (unsigned int __E, unsigned int __H)
> +{
> +#error "_mm_mwait is not supported on this architecture."
> +}
> +#endif
> +
> +#endif /* _PMMINTRIN_H_INCLUDED */

Don't #if 0 the x86 implementation please; just say e.g.

/* POWER8 / POWER9 have no equivalent for __builtin_ia32_monitor and
   _mm_mwait.  */


Okay for trunk.  Thanks!


Segher


Re: [patch] Fix PR tree-optimization/86659

2018-10-05 Thread Eric Botcazou
> So I wonder why it is necessary to track 'reverse' in gimple_match_op
> at all given we bail out without optimizing as far as I can see?

Because of the valueization?  If it is done, gimple_simplify returns true so 
the result will be synthesized from res_op by means of maybe_build_generic_op.
That's what I was referring to in the opening message by "the underlying issue 
of the missing propagation of the flag during GIMPLE folding".

-- 
Eric Botcazou


Re: #999 spurious error on derived record passed as Out parameter

2018-10-05 Thread Eric Botcazou
Sorry, wrong list...

-- 
Eric Botcazou


#999 spurious error on derived record passed as Out parameter

2018-10-05 Thread Eric Botcazou
The error message is:

p.adb:7:03: unconstrained subtype not allowed (need initialization)
p.adb:7:03: provide initial value or explicit discriminant values
p.adb:7:03: or give default discriminant values for type "Rec"

and is issued by every compiler I tried.  This doesn't happen if the parameter 
is In Out instead of Out.  The problem is that the compiler doesn't initialize 
the temporary it creates around the call in the Out case.  This blocks the 2nd 
part of the fix for R914-006.

-- 
Eric Botcazou
with Q; use Q;  --  p.adb: reproducer for the spurious error quoted above

procedure P is
  D : Derived;  --  Derived is Rec with its discriminant constrained to 0
begin
  Proc1 (Rec (D));  --  In Out parameter: reported not to trigger the error
  Proc2 (Rec (D));  --  Out parameter: line 7, where the error is reported
end;
--  Support package for the reproducer: a discriminated record, a derived
--  type that constrains the discriminant and carries its own representation
--  clause, and two procedures that differ only in their parameter mode.
package Q is

   type T_Unsigned8  is new Natural range 0 .. (2 ** 8 - 1);

   --  Variant record: Optional_Field is present only when Discriminant is 0
   type Rec (Discriminant : T_Unsigned8) is record
      Fixed_Field : T_Unsigned8;
      case Discriminant is
         when 0 =>
            Optional_Field : T_unsigned8;
         when others =>
            null;
      end case;
   end record;

   --  Derived type with the discriminant constrained to 0
   type Derived is new Rec (0);

   --  Explicit layout for Derived: Fixed_Field in bits 0 .. 7, Discriminant
   --  in bits 8 .. 15 and Optional_Field in bits 16 .. 23 of storage unit 0
   for Derived use record
      Fixed_Field    at 0 range 0  .. 7;
      Discriminant   at 0 range 8  .. 15;
      Optional_Field at 0 range 16 .. 23;
   end record;

   --  Same profile except for the mode of R (in out versus out)
   procedure Proc1 (R : in out Rec);

   procedure Proc2 (R : out Rec);

end Q;


Re: [PATCH] i386: Don't pass -msse2avx to assembler for -mavx

2018-10-05 Thread Uros Bizjak
On Fri, Oct 5, 2018 at 1:01 AM H.J. Lu  wrote:
>
>
> With
>
> gcc -O2 -fPIC -flto -g -c -o a.o a.c
> gcc -O2 -fPIC -flto -g -mavx   -c -o b.o b.c
> gcc -shared -O2 -fPIC -flto -g -o lib1.so a.o b.o
>
> LTO correctly generates AVX for b.o and SSE for a.o.  But the GCC driver
> passes -msse2avx to assembler, which encodes SSE instructions as AVX
> instructions.  We shouldn't pass -msse2avx to assembler for -mavx.
>
> Tested on x86-64.  OK for trunk?
>
> Thanks.
>
> H.J.
> ---
> PR target/87522
> * config/i386/gnu-user.h (ASM_SPEC): Don't pass -msse2avx to
> assembler for -mavx.
> * config/i386/gnu-user64.h (ASM_SPEC): Likewise.

OK for mainline and backports.

Thanks,
Uros.

>  gcc/config/i386/gnu-user.h   | 2 +-
>  gcc/config/i386/gnu-user64.h | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h
> index a922c9b93fa..baed87aa54f 100644
> --- a/gcc/config/i386/gnu-user.h
> +++ b/gcc/config/i386/gnu-user.h
> @@ -67,7 +67,7 @@ along with GCC; see the file COPYING3.  If not see
>
>  #undef  ASM_SPEC
>  #define ASM_SPEC \
> -  "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
> +  "--32 %{msse2avx:%{!mavx:-msse2avx}}"
>
>  #undef  SUBTARGET_EXTRA_SPECS
>  #define SUBTARGET_EXTRA_SPECS \
> diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h
> index f7a68fdecf0..09141ce3508 100644
> --- a/gcc/config/i386/gnu-user64.h
> +++ b/gcc/config/i386/gnu-user64.h
> @@ -50,7 +50,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
> If not, see
>  #define ASM_SPEC "%{" SPEC_32 ":--32} \
>   %{" SPEC_64 ":--64} \
>   %{" SPEC_X32 ":--x32} \
> - %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
> + %{msse2avx:%{!mavx:-msse2avx}}"
>
>  #define GNU_USER_TARGET_LINK_SPEC \
>"%{" SPEC_64 ":-m " GNU_USER_LINK_EMULATION64 "} \
> --
> 2.17.1
>