[AARCH64] Remove static variable all_extensions from aarch64.c

2016-05-16 Thread Kugan Vivekanandarajah
Hi,

static variable all_extensions in aarch64.c is not used and therefore
dead. I don’t see any reason why it should be there. Attached patch
removes this.


Bootstrapped on aarch64-linux-gnu. Regression testing is ongoing.

Is this OK for trunk?

Thanks,
Kugan

gcc/ChangeLog:

2016-05-17  Kugan Vivekanandarajah  

* config/aarch64/aarch64.c (all_extensions): Remove unused static variable.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e081b16..00ab85b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -663,16 +663,6 @@ struct aarch64_option_extension
   const unsigned long flags_off;
 };
 
-/* ISA extensions in AArch64.  */
-static const struct aarch64_option_extension all_extensions[] =
-{
-#define AARCH64_OPT_EXTENSION(NAME, X, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
-  {NAME, FLAGS_ON, FLAGS_OFF},
-#include "aarch64-option-extensions.def"
-#undef AARCH64_OPT_EXTENSION
-  {NULL, 0, 0}
-};
-
 typedef enum aarch64_cond_code
 {
   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,


[PATCH 3/3] function: Restructure *logue insertion

2016-05-16 Thread Segher Boessenkool
This patch restructures how the prologues/epilogues are inserted.  Sibcalls
that run without prologue are now handled in shrink-wrap.c; it communicates
what is already handled by setting the EDGE_IGNORE flag.  The
try_shrink_wrapping function then doesn't need to be passed the bb_flags
anymore.

Tested like the previous two patches; is this okay for trunk?


Segher


2016-05-16  Segher Boessenkool  

* function.c (make_epilogue_seq): Remove epilogue_end parameter.
(thread_prologue_and_epilogue_insns): Remove bb_flags.  Restructure
code.  Ignore sibcalls on EDGE_IGNORE edges.
* shrink-wrap.c (handle_simple_exit): New function.  Set EDGE_IGNORE
on edges for sibcalls that run without prologue.  The rest of the
function is combined from...
(fix_fake_fallthrough_edge): ... this, and ...
(try_shrink_wrapping): ... a part of this.  Remove the bb_with
function argument, make it a local variable.

---
 gcc/function.c| 168 ++
 gcc/shrink-wrap.c |  88 ++--
 gcc/shrink-wrap.h |   3 +-
 3 files changed, 113 insertions(+), 146 deletions(-)

diff --git a/gcc/function.c b/gcc/function.c
index 75d2ad4..278aaf6 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -5819,13 +5819,13 @@ make_prologue_seq (void)
 }
 
 static rtx_insn *
-make_epilogue_seq (rtx_insn **epilogue_end)
+make_epilogue_seq (void)
 {
   if (!targetm.have_epilogue ())
 return NULL;
 
   start_sequence ();
-  *epilogue_end = emit_note (NOTE_INSN_EPILOGUE_BEG);
+  emit_note (NOTE_INSN_EPILOGUE_BEG);
   rtx_insn *seq = targetm.gen_epilogue ();
   if (seq)
 emit_jump_insn (seq);
@@ -5897,66 +5897,29 @@ make_epilogue_seq (rtx_insn **epilogue_end)
 void
 thread_prologue_and_epilogue_insns (void)
 {
-  bool inserted;
-  bitmap_head bb_flags;
-  rtx_insn *epilogue_end ATTRIBUTE_UNUSED;
-  edge e, entry_edge, orig_entry_edge, exit_fallthru_edge;
-  edge_iterator ei;
-
   df_analyze ();
 
-  rtl_profile_for_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun));
-
-  inserted = false;
-  epilogue_end = NULL;
-
   /* Can't deal with multiple successors of the entry block at the
  moment.  Function should always have at least one entry
  point.  */
   gcc_assert (single_succ_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
-  entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
-  orig_entry_edge = entry_edge;
+
+  edge entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+  edge orig_entry_edge = entry_edge;
 
   rtx_insn *split_prologue_seq = make_split_prologue_seq ();
   rtx_insn *prologue_seq = make_prologue_seq ();
-  rtx_insn *epilogue_seq = make_epilogue_seq (&epilogue_end);
-
-  bitmap_initialize (&bb_flags, &bitmap_default_obstack);
+  rtx_insn *epilogue_seq = make_epilogue_seq ();
 
   /* Try to perform a kind of shrink-wrapping, making sure the
  prologue/epilogue is emitted only around those parts of the
  function that require it.  */
 
-  try_shrink_wrapping (&entry_edge, &bb_flags, prologue_seq);
+  try_shrink_wrapping (&entry_edge, prologue_seq);
 
-  if (split_prologue_seq != NULL_RTX)
-{
-  insert_insn_on_edge (split_prologue_seq, orig_entry_edge);
-  inserted = true;
-}
-  if (prologue_seq != NULL_RTX)
-{
-  insert_insn_on_edge (prologue_seq, entry_edge);
-  inserted = true;
-}
-
-  /* If the exit block has no non-fake predecessors, we don't need
- an epilogue.  */
-  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
-if ((e->flags & EDGE_FAKE) == 0)
-  break;
-  if (e == NULL)
-goto epilogue_done;
 
   rtl_profile_for_bb (EXIT_BLOCK_PTR_FOR_FN (cfun));
 
-  exit_fallthru_edge = find_fallthru_edge (EXIT_BLOCK_PTR_FOR_FN 
(cfun)->preds);
-
-  /* If nothing falls through into the exit block, we don't need an
- epilogue.  */
-  if (exit_fallthru_edge == NULL)
-goto epilogue_done;
-
   /* A small fib -- epilogue is not yet completed, but we wish to re-use
  this marker for the splits of EH_RETURN patterns, and nothing else
  uses the flag in the meantime.  */
@@ -5967,6 +5930,8 @@ thread_prologue_and_epilogue_insns (void)
  code.  In order to be able to properly annotate these with unwind
  info, try to split them now.  If we get a valid split, drop an
  EPILOGUE_BEG note and mark the insns as epilogue insns.  */
+  edge e;
+  edge_iterator ei;
   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
 {
   rtx_insn *prev, *last, *trial;
@@ -5986,78 +5951,84 @@ thread_prologue_and_epilogue_insns (void)
   emit_note_after (NOTE_INSN_EPILOGUE_BEG, prev);
 }
 
-  if (epilogue_seq)
-{
-  insert_insn_on_edge (epilogue_seq, exit_fallthru_edge);
-  inserted = true;
-}
-  else
-{
-  basic_block cur_bb;
+  edge exit_fallthru_edge = find_fallthru_edge (EXIT_BLOCK_PTR_FOR_FN 
(cfun)->preds);
 
-  if (! next_active_insn (BB_END (exit_fallthru_edge->src)))
-   goto epil

[PATCH 2/3] function: Factor out make_*logue_seq

2016-05-16 Thread Segher Boessenkool
Make new functions make_split_prologue_seq, make_prologue_seq, and
make_epilogue_seq.

Tested as in the previous patch; is this okay for trunk?


Segher


2016-05-16  Segher Boessenkool  

* function.c (make_split_prologue_seq, make_prologue_seq,
make_epilogue_seq): New functions, factored out from...
(thread_prologue_and_epilogue_insns): Here.

---
 gcc/function.c | 154 +++--
 1 file changed, 85 insertions(+), 69 deletions(-)

diff --git a/gcc/function.c b/gcc/function.c
index b9a6338..75d2ad4 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -5768,6 +5768,83 @@ set_return_jump_label (rtx_insn *returnjump)
 JUMP_LABEL (returnjump) = ret_rtx;
 }
 
+static rtx_insn *
+make_split_prologue_seq (void)
+{
+  if (!flag_split_stack
+  || lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl)))
+return NULL;
+
+  start_sequence ();
+  emit_insn (targetm.gen_split_stack_prologue ());
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  record_insns (seq, NULL, &prologue_insn_hash);
+  set_insn_locations (seq, prologue_location);
+
+  return seq;
+}
+
+static rtx_insn *
+make_prologue_seq (void)
+{
+  if (!targetm.have_prologue ())
+return NULL;
+
+  start_sequence ();
+  rtx_insn *seq = targetm.gen_prologue ();
+  emit_insn (seq);
+
+  /* Insert an explicit USE for the frame pointer
+ if the profiling is on and the frame pointer is required.  */
+  if (crtl->profile && frame_pointer_needed)
+emit_use (hard_frame_pointer_rtx);
+
+  /* Retain a map of the prologue insns.  */
+  record_insns (seq, NULL, &prologue_insn_hash);
+  emit_note (NOTE_INSN_PROLOGUE_END);
+
+  /* Ensure that instructions are not moved into the prologue when
+ profiling is on.  The call to the profiling routine can be
+ emitted within the live range of a call-clobbered register.  */
+  if (!targetm.profile_before_prologue () && crtl->profile)
+emit_insn (gen_blockage ());
+
+  seq = get_insns ();
+  end_sequence ();
+  set_insn_locations (seq, prologue_location);
+
+  return seq;
+}
+
+static rtx_insn *
+make_epilogue_seq (rtx_insn **epilogue_end)
+{
+  if (!targetm.have_epilogue ())
+return NULL;
+
+  start_sequence ();
+  *epilogue_end = emit_note (NOTE_INSN_EPILOGUE_BEG);
+  rtx_insn *seq = targetm.gen_epilogue ();
+  if (seq)
+emit_jump_insn (seq);
+
+  /* Retain a map of the epilogue insns.  */
+  record_insns (seq, NULL, &epilogue_insn_hash);
+  set_insn_locations (seq, epilogue_location);
+
+  seq = get_insns ();
+  rtx_insn *returnjump = get_last_insn ();
+  end_sequence ();
+
+  if (JUMP_P (returnjump))
+set_return_jump_label (returnjump);
+
+  return seq;
+}
+
+
 
 /* Generate the prologue and epilogue RTL if the machine supports it.  Thread
this into place with notes indicating where the prologue ends and where
@@ -5822,9 +5899,7 @@ thread_prologue_and_epilogue_insns (void)
 {
   bool inserted;
   bitmap_head bb_flags;
-  rtx_insn *returnjump;
   rtx_insn *epilogue_end ATTRIBUTE_UNUSED;
-  rtx_insn *prologue_seq ATTRIBUTE_UNUSED, *split_prologue_seq 
ATTRIBUTE_UNUSED;
   edge e, entry_edge, orig_entry_edge, exit_fallthru_edge;
   edge_iterator ei;
 
@@ -5834,7 +5909,6 @@ thread_prologue_and_epilogue_insns (void)
 
   inserted = false;
   epilogue_end = NULL;
-  returnjump = NULL;
 
   /* Can't deal with multiple successors of the entry block at the
  moment.  Function should always have at least one entry
@@ -5843,46 +5917,9 @@ thread_prologue_and_epilogue_insns (void)
   entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
   orig_entry_edge = entry_edge;
 
-  split_prologue_seq = NULL;
-  if (flag_split_stack
-  && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
- == NULL))
-{
-  start_sequence ();
-  emit_insn (targetm.gen_split_stack_prologue ());
-  split_prologue_seq = get_insns ();
-  end_sequence ();
-
-  record_insns (split_prologue_seq, NULL, &prologue_insn_hash);
-  set_insn_locations (split_prologue_seq, prologue_location);
-}
-
-  prologue_seq = NULL;
-  if (targetm.have_prologue ())
-{
-  start_sequence ();
-  rtx_insn *seq = targetm.gen_prologue ();
-  emit_insn (seq);
-
-  /* Insert an explicit USE for the frame pointer
- if the profiling is on and the frame pointer is required.  */
-  if (crtl->profile && frame_pointer_needed)
-   emit_use (hard_frame_pointer_rtx);
-
-  /* Retain a map of the prologue insns.  */
-  record_insns (seq, NULL, &prologue_insn_hash);
-  emit_note (NOTE_INSN_PROLOGUE_END);
-
-  /* Ensure that instructions are not moved into the prologue when
-profiling is on.  The call to the profiling routine can be
-emitted within the live range of a call-clobbered register.  */
-  if (!targetm.profile_before_prologue () && crtl->profile)
-emit_insn (gen_blockage ());
-
-  prologue_seq = get_insns ();
-  

[PATCH 1/3] function: Do the CLEANUP_EXPENSIVE after shrink-wrapping, not before

2016-05-16 Thread Segher Boessenkool
We should do CLEANUP_EXPENSIVE after shrink-wrapping, because shrink-
wrapping creates constructs that CLEANUP_EXPENSIVE can optimise, and
nothing runs CLEANUP_EXPENSIVE later.  We don't need cleanup_cfg before
shrink-wrapping, nothing in shrink-wrapping (or the other *logue insertion
code) cares at all.

Tested this (and the other two patches in this series) on powerpc64-linux
(-m32/-m64, -mlra/-mno-lra); on powerpc64le-linux; and on x86_64-linux.
No regressions.

Is this okay for trunk?


Segher


2016-05-16  Segher Boessenkool  

* function.c (rest_of_handle_thread_prologue_and_epilogue): Call
cleanup_cfg with CLEANUP_EXPENSIVE after shrink-wrapping.  Don't
call cleanup_cfg before shrink-wrapping.

---
 gcc/function.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/gcc/function.c b/gcc/function.c
index 70584b9..b9a6338 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -6369,9 +6369,6 @@ make_pass_leaf_regs (gcc::context *ctxt)
 static unsigned int
 rest_of_handle_thread_prologue_and_epilogue (void)
 {
-  if (optimize)
-cleanup_cfg (CLEANUP_EXPENSIVE);
-
   /* On some machines, the prologue and epilogue code, or parts thereof,
  can be represented as RTL.  Doing so lets us schedule insns between
  it and the rest of the code and also allows delayed branch
@@ -6384,7 +6381,7 @@ rest_of_handle_thread_prologue_and_epilogue (void)
 
   /* Shrink-wrapping can result in unreachable edges in the epilogue,
  see PR57320.  */
-  cleanup_cfg (0);
+  cleanup_cfg (optimize ? CLEANUP_EXPENSIVE : 0);
 
   /* The stack usage info is finalized during prologue expansion.  */
   if (flag_stack_usage_info)
-- 
1.9.3



Re: [PATCH] misc minor doc fixes

2016-05-16 Thread Sandra Loosemore

On 05/16/2016 05:05 PM, Jim Wilson wrote:

Deletes text claiming that major version changes are rare, and fixes
two misspellings of signaling.

Tested with make info and make dvi.


This looks fine to me.

-Sandra



Re: PING^5 [PATCH, GCC 5] PR 70613, -fabi-version docs don't match implementation

2016-05-16 Thread Sandra Loosemore

On 05/16/2016 04:35 PM, Jim Wilson wrote:

This is my fifth ping.  I just need someone to rubber stamp it so I
can check it in.


The documentation change looks fine, but as a documentation maintainer 
only I don't think I can approve changes to a release branch.


-Sandra




[PATCH] misc minor doc fixes

2016-05-16 Thread Jim Wilson
Deletes text claiming that major version changes are rare, and fixes
two misspellings of signaling.

Tested with make info and make dvi.

Jim
2016-05-16  Jim Wilson  

	* doc/cpp.texi (__GNUC__): Major version changes are no longer rare.
	* doc/invoke.texi (-mnan=2008): Change signalling to signaling.
	* doc/md.texi (fmin@var{m}3): Likewise.

Index: cpp.texi
===
--- cpp.texi	(revision 236231)
+++ cpp.texi	(working copy)
@@ -1984,7 +1984,7 @@ by GCC, or a non-GCC compiler that claims to accep
 you can simply test @code{__GNUC__}.  If you need to write code
 which depends on a specific version, you must be more careful.  Each
 time the minor version is increased, the patch level is reset to zero;
-each time the major version is increased (which happens rarely), the
+each time the major version is increased, the
 minor version and patch level are reset.  If you wish to use the
 predefined macros directly in the conditional, you will need to write it
 like this:
Index: invoke.texi
===
--- invoke.texi	(revision 236231)
+++ invoke.texi	(working copy)
@@ -18130,7 +18130,7 @@ IEEE 754 floating-point data.
 
 The @option{-mnan=legacy} option selects the legacy encoding.  In this
 case quiet NaNs (qNaNs) are denoted by the first bit of their trailing
-significand field being 0, whereas signalling NaNs (sNaNs) are denoted
+significand field being 0, whereas signaling NaNs (sNaNs) are denoted
 by the first bit of their trailing significand field being 1.
 
 The @option{-mnan=2008} option selects the IEEE 754-2008 encoding.  In
Index: md.texi
===
--- md.texi	(revision 236231)
+++ md.texi	(working copy)
@@ -5018,7 +5018,7 @@ it is unspecified which of the two operands is ret
 IEEE-conformant minimum and maximum operations.  If one operand is a quiet
 @code{NaN}, then the other operand is returned.  If both operands are quiet
 @code{NaN}, then a quiet @code{NaN} is returned.  In the case when gcc supports
-signalling @code{NaN} (-fsignaling-nans) an invalid floating point exception is
+signaling @code{NaN} (-fsignaling-nans) an invalid floating point exception is
 raised and a quiet @code{NaN} is returned.
 
 All operands have mode @var{m}, which is a scalar or vector


Re: [PATCH 0/4] RFC: RTL frontend

2016-05-16 Thread Jeff Law

On 05/04/2016 02:49 PM, David Malcolm wrote:


* The existing RTL code is structured around a single function being
  optimized, so, as a simplification, the RTL frontend can only handle
  one function per input file.  Also, the dump format currently uses
  comments to separate functions::

;; Function test_1 (test_1, funcdef_no=0, decl_uid=1758, cgraph_uid=0, 
symbol_order=0)
ISTM we can fix this by adding more true structure to the RTL dump. 
IMHO we have the freedom to extend the RTL dumper to make it easier to 
read the RTL dumps in for this kind of work.





... various pass-specific things, sometimes expressed as comments,
sometimes not

Which seems like a bug to me.



;;
;; Full RTL generated for this function:
;;
(note 1 0 6 NOTE_INSN_DELETED)
;; etc, insns for function "test_1" go here
(insn 27 26 0 6 (use (reg/i:SI 0 ax)) 
../../src/gcc/testsuite/rtl.dg/test.c:7 -1
 (nil))

;; Function test_2 (test_2, funcdef_no=1, decl_uid=1765, cgraph_uid=1, 
symbol_order=1)
... various pass-specific things, sometimes expressed as comments,
sometimes not
;;
;; Full RTL generated for this function:
;;
(note 1 0 5 NOTE_INSN_DELETED)
;; etc, insns for function "test_2" go here
(insn 59 58 0 8 (use (reg/i:SF 21 xmm0)) 
../../src/gcc/testsuite/rtl.dg/test.c:31 -1
 (nil))

  so that there's no clear separation of the instructions between the
  two functions (and no metadata e.g. function names).

  This could be fixed by adding a new clause to the dump e.g.::

Which would seem like a good idea to me.



* Similarly, there are no types beyond the built-in ones; all expressions
  are treated as being of type int.  I suspect that this approach
  will be too simplistic when it comes to e.g. aliasing.
Well, we have pointers back to the tree IL for this kind of thing, but 
it's far from ideal because of the lack of separation that implies.


I wouldn't lose a ton of sleep if we punted this for a while, perhaps 
just dumping the alias set splay tree so we can at least carry that 
information around.




* There's no support for running more than one pass; fixing this would
  require being able to run passes from a certain point onwards.

I think that's OK at this stage.



* Roundtripping of recognized instructions may be an issue (i.e. those
  with INSN_CODE != -1), such as the "667 {jump}" in the following::

(jump_insn 50 49 51 10
  (set (pc)
   (label_ref:DI 59)) ../../src/test-switch.c:18 667 {jump}
   (nil) -> 59)

  since the integer ID can change when the .md files are changed
  (and the associated pattern name is very much target-specific).
  It may be best to reset them to -1 in the input files (and delete the
  operation name), giving::
Just ignore the index and the pretty name.  When you're done reading the 
file, call recog on each insn to get that information filled in.





(jump_insn 50 49 51 10
  (set (pc)
   (label_ref:DI 59)) ../../src/test-switch.c:18 -1
   (nil) -> 59)

* Currently there's no explicit CFG edge information in the dumps.
  The rtl1 frontend reconstructs the edges based on jump instructions.
  As I understand the distinction between cfgrtl and cfglayout modes
  https://gcc.gnu.org/wiki/cfglayout_mode , this is OK for "cfgrtl" mode,
  but isn't going to work for "cfglayout" mode - in the latter,
  unconditional jumps are represented purely by edges in the CFG, and this
  information isn't currently present in the dumps  (perhaps we could add it
  if it's an issue).
We could either add the CFG information or you could extract it from the 
guts of the RTL you read.  The former leads to the possibility of an 
inconsistent view of the CFG.  The latter is more up-front work and has 
to deal with the differences between cfgrtl and cfglayout modes.




Open Questions
**

* Register numbering: consider this fragment of RTL emitted during
  expansion::

(reg/f:DI 82 virtual-stack-vars)

  At the time of emission, register 82 is the VIRTUAL_STACK_VARS_REGNUM,
  and this value is effectively hardcoded into the dump.  Presumably this
  is baking in assumptions about the target into the test.  Also, how likely is
  this value to change?  When we reload the dump, should we notice that this
  is tagged with virtual-stack-vars and override the specific register
  number to use the current value of VIRTUAL_STACK_VARS_REGNUM on the
  target rtl1 was built for?
Those change semi-regularly.  Essentially anytime a new version of the 
ISA shows up with new register #s.


My instinct is to drop raw numbers and just output them symbolicly.  We 
can map them back into the hard register numbers easy enough.  We would 
want to use some magic to identify pseudo regs.  P1...PN in the dumps 
which we'd map to FIRST_PSEUDO_REGISTER+N when we read the file in.



Jeff


Re: match.pd: ~X & Y to X ^ Y in some cases

2016-05-16 Thread Jeff Law

On 05/16/2016 04:31 PM, Marc Glisse wrote:

On Mon, 16 May 2016, Jeff Law wrote:


Please use if (GIMPLE
   && ((get_nonzero_bits ...)

Rather than #if GIMPLE


Richard asked for the reverse in some previous patch:
https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01617.html

I don't really care which one we settle on...

If Richi wanted the reverse, then go with it.  I'm not going to object 
over something like that.


jeff


Re: PING^5 [PATCH, GCC 5] PR 70613, -fabi-version docs don't match implementation

2016-05-16 Thread Jim Wilson
This is my fifth ping.  I just need someone to rubber stamp it so I
can check it in.

Maybe it would be easier if I volunteered to be a doc maintainer so I
can self approve it?

Jim

On Mon, May 9, 2016 at 4:21 PM, Jim Wilson  wrote:
> On Mon, May 2, 2016 at 12:13 PM, Jim Wilson  wrote:
>> Here is a patch to correct the -fabi-version docs on the GCC 5 branch.
>> https://gcc.gnu.org/ml/gcc-patches/2016-04/msg00480.html
>
> You can see the default -fabi-version in gcc/c-family/c-opts.c on the
> gcc-5 branch which has
>
>   /* Change flag_abi_version to be the actual current ABI level for the
>  benefit of c_cpp_builtins.  */
>   if (flag_abi_version == 0)
> flag_abi_version = 9;
>
> You can see in the docs that -fabi-version only goes up to 8.
> 
> https://gcc.gnu.org/onlinedocs/gcc-5.3.0/gcc/C_002b_002b-Dialect-Options.html#C_002b_002b-Dialect-Options
>
> As for how we got here...
> I see that the patch for bug 65945 was back ported to the gcc-5
> branch, which required a partial backport of the patch for bug 44282,
> which added abi version 9.  The original patch for 44282 is missing
> the doc change.
>
> The missing doc change was then added here
> https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=228017
> which has the invoke.texi hunk we need, but is missing a ChangeLog
> entry for it.  So it appears all we need is a partial backport of this
> invoke.texi hunk.  This is mostly documenting a change to -Wabi, so we
> only need parts of two hunks that document -fabi-version=9 and mention
> gcc-5.2.
>
> The patch is attached again.
>
> Jim
Index: ChangeLog
===
--- ChangeLog	(revision 234867)
+++ ChangeLog	(working copy)
@@ -1,3 +1,12 @@
+2016-04-11  Jim Wilson  
+
+	Partial backport from trunk r228017.
+	2015-09-22  Jason Merrill  
+
+	PR c++/70613
+	* doc/invoke.texi (-fabi-version): Document version 9.
+	(-Wabi): Document version 9.  Mention version 8 is default for GCC 5.1.
+
 2016-04-09  Oleg Endo  
 
 	Backport from mainline
Index: doc/invoke.texi
===
--- doc/invoke.texi	(revision 234867)
+++ doc/invoke.texi	(working copy)
@@ -2118,6 +2118,9 @@ scope.
 Version 8, which first appeared in G++ 4.9, corrects the substitution
 behavior of function types with function-cv-qualifiers.
 
+Version 9, which first appeared in G++ 5.2, corrects the alignment of
+@code{nullptr_t}.
+
 See also @option{-Wabi}.
 
 @item -fabi-compat-version=@var{n}
@@ -2619,7 +2622,15 @@ When mangling a function type with function-cv-qua
 un-qualified function type was incorrectly treated as a substitution
 candidate.
 
-This was fixed in @option{-fabi-version=8}.
+This was fixed in @option{-fabi-version=8}, the default for GCC 5.1.
+
+@item
+@code{decltype(nullptr)} incorrectly had an alignment of 1, leading to
+unaligned accesses.  Note that this did not affect the ABI of a
+function with a @code{nullptr_t} parameter, as parameters have a
+minimum alignment.
+
+This was fixed in @option{-fabi-version=9}, the default for GCC 5.2.
 @end itemize
 
 It also warns about psABI-related changes.  The known psABI changes at this


Re: match.pd: ~X & Y to X ^ Y in some cases

2016-05-16 Thread Marc Glisse

On Mon, 16 May 2016, Jeff Law wrote:


Please use if (GIMPLE
   && ((get_nonzero_bits ...)

Rather than #if GIMPLE


Richard asked for the reverse in some previous patch:
https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01617.html

I don't really care which one we settle on...

--
Marc Glisse


Re: [PATCH 4/4] Initial version of RTL frontend

2016-05-16 Thread Jeff Law

On 05/10/2016 08:13 AM, David Malcolm wrote:

On Wed, 2016-05-04 at 16:49 -0400, David Malcolm wrote:
[...snip...]


I wrote this by compiling a test.c with -fdump-rtl-all on
x86_64-pc-linux-gnu, and then attempting to load the state at each
pass, and then trying to run just one pass, and fixing bugs until the
dump output from each pass was the same as when running the pass from
cc1.  I've only done this with a small subset of passes, and with a
very
simple test case, so I'm sure there are plenty of bugs and "x86_64
-isms"
remaining.


[...snip...]

On the subject of "x86_64-isms", a couple of issues I ran into when
testing with --target=aarch64-linux-gnu:

* roundtrip.exp fails due to different register names so e.g. this from
x86_64:

 (reg:SI 5 di [ i ]))

  becomes this after roundtripping on aarch64:

 (reg:SI 5 x5 [ i ]))

  (i.e. "di" vs "x5" for register 5).

* unknown modes e.g. this from x86_64:

(reg:CCGC 17 flags)

  fails on aarch64 due to the lack of a "CCGC" mode.
Fundamentally all this stuff is target dependent.  Trying to read in an 
x86-64 generated dump into an aarch64 targeted compiler is a waste of 
time/effort.  THe number of things you're going to stumble over are 
innumerable.


I think we lay down the law that RTL dumps are target specific.  There 
probably should be some kind of metadata emitted in the dump file which 
identifies the target and if we try to read in the wrong stuff, we get a 
nice error.


jeff


Re: [PATCH 2/3] Add profiling support for IVOPTS

2016-05-16 Thread Bin.Cheng
> As profile-guided optimization can provide very useful information
> about basic block frequencies within a loop, following patch set leverages
> that information. It speeds up a single benchmark from upcoming SPECv6
> suite by 20% (-O2 -profile-generate/-fprofile use) and I think it can
> also improve others (currently measuring numbers for PGO).
Hi,
Is this 20% improvement from this patch, or does it include the
existing PGO's improvement?

For the patch:
> +
> +  /* Return true if the frequency has a valid value.  */
> +  bool has_frequency ();
> +
>/* Return infinite comp_cost.  */
>static comp_cost get_infinite ();
>
> @@ -249,6 +272,9 @@ private:
>   complexity field should be larger for more
>   complex expressions and addressing modes).  */
>int m_scratch;  /* Scratch used during cost computation.  */
> +  sreal m_frequency;  /* Frequency of the basic block this comp_cost
> + belongs to.  */
> +  sreal m_cost_scaled;  /* Scalled runtime cost.  */
IMHO we shouldn't embed frequency in comp_cost, neither record scaled
cost in it.  I would suggest we compute cost and amortize the cost
over frequency in get_computation_cost_at before storing it into
comp_cost.  That is, once cost is computed/stored in comp_cost, it is
already scaled with frequency.  One argument is frequency info is only
valid for use's statement/basic_block, it really doesn't have clear
meaning in comp_cost structure.  Outside of function
get_computation_cost_at, I found it's hard to understand/remember
what's the meaning of comp_cost.m_frequency and where it came from.
There are other reasons embedded in below comments.
>
>
>  comp_cost&
> @@ -257,6 +283,8 @@ comp_cost::operator= (const comp_cost& other)
>m_cost = other.m_cost;
>m_complexity = other.m_complexity;
>m_scratch = other.m_scratch;
> +  m_frequency = other.m_frequency;
> +  m_cost_scaled = other.m_cost_scaled;
>
>return *this;
>  }
> @@ -275,6 +303,7 @@ operator+ (comp_cost cost1, comp_cost cost2)
>
>cost1.m_cost += cost2.m_cost;
>cost1.m_complexity += cost2.m_complexity;
> +  cost1.m_cost_scaled += cost2.m_cost_scaled;
>
>return cost1;
>  }
> @@ -290,6 +319,8 @@ comp_cost
>  comp_cost::operator+= (HOST_WIDE_INT c)
This and below operators need check for infinite cost first and return
immediately.
>  {
>this->m_cost += c;
> +  if (has_frequency ())
> +this->m_cost_scaled += scale_cost (c);
>
>return *this;
>  }
> @@ -5047,18 +5128,21 @@ get_computation_cost_at (struct ivopts_data *data,
>   (symbol/var1/const parts may be omitted).  If we are looking for an
>   address, find the cost of addressing this.  */
>if (address_p)
> -return cost + get_address_cost (symbol_present, var_present,
> -offset, ratio, cstepi,
> -mem_mode,
> -TYPE_ADDR_SPACE (TREE_TYPE (utype)),
> -speed, stmt_is_after_inc, can_autoinc);
> +{
> +  cost += get_address_cost (symbol_present, var_present,
> + offset, ratio, cstepi,
> + mem_mode,
> + TYPE_ADDR_SPACE (TREE_TYPE (utype)),
> + speed, stmt_is_after_inc, can_autoinc);
> +  goto ret;
> +}
>
>/* Otherwise estimate the costs for computing the expression.  */
>if (!symbol_present && !var_present && !offset)
>  {
>if (ratio != 1)
>   cost += mult_by_coeff_cost (ratio, TYPE_MODE (ctype), speed);
> -  return cost;
> +  goto ret;
>  }
>
>/* Symbol + offset should be compile-time computable so consider that they
> @@ -5077,7 +5161,8 @@ get_computation_cost_at (struct ivopts_data *data,
>aratio = ratio > 0 ? ratio : -ratio;
>if (aratio != 1)
>  cost += mult_by_coeff_cost (aratio, TYPE_MODE (ctype), speed);
> -  return cost;
> +
> +  goto ret;
>
>  fallback:
>if (can_autoinc)
> @@ -5093,8 +5178,13 @@ fallback:
>  if (address_p)
>comp = build_simple_mem_ref (comp);
>
> -return comp_cost (computation_cost (comp, speed), 0);
> +cost = comp_cost (computation_cost (comp, speed), 0);
>}
> +
> +ret:
> +  cost.calculate_scaled_cost (at->bb->frequency,
> +  data->current_loop->header->frequency);
Here cost consists of two parts.  One is for loop invariant
computation, we amortize is against avg_loop_niter and record register
pressure (either via invriant variables or invariant expressions) for
it;  the other is loop variant part.  For the first part, we should
not scaled it using frequency, since we have already assumed it would
be hoisted out of loop.  No matter where the use is, hoisted loop
invariant has the same frequency as loop header.  This is the second
reason I want to factor frequency out of comp_cost.  It's easier to
scale with frequency only it's necessary.

> +  return cost;
>  }
>
>  /* Determines the cost of the computation by that USE is expressed
> @@ -5922,16 +6012,19 @@ determine_group_iv_costs (struct ivopts_data *data)
>group = data->vgroups[i];
>
>fprintf (dump_file, "Group %d:\n", i);
> -  fprintf (dump_file, "  cand\tcost\tcompl.\tinv.ex.\tdepends

Re: [PATCH] Import config.sub and config.guess from upstream.

2016-05-16 Thread Jeff Law

On 05/13/2016 01:55 AM, Jakub Sejdak wrote:

+2016-05-13  Jakub Sejdak  

+* config.guess: Import version 2016-04-02 (newest).
+* config.sub: Import version 2016-05-10 (newest).


Installed on the trunk.  I think this was NAK'd for the release branches.

jeff


Re: [PATCH 3/3] Allow constant global VAR_DECLs in constant jump functions

2016-05-16 Thread Jeff Law

On 05/12/2016 10:09 AM, Martin Jambor wrote:

Hi,

the following patch adds the final step necessary to perform
optimization requested in PR 69708, i.e do indirect inlining of a
function passed by value in a structure.  It allows jump functions to
be aggregate global constant VAR_DECLs, which enables the
constructor-walking code introduced in the first patch of the series
to deduce aggregate contents from it.  IPA-CP expects jump-functions
to be scalars, and they indeed need be for processing arithmetic
jump-functions, but this patch allows any tree for the simple ones.

Bootstrapped, lto-bootstrapped tested on x86_64.  OK for trunk?

Thanks,

Martin


2016-05-11  Martin Jambor  

PR ipa/69708
* ipa-cp.c (ipa_get_jf_pass_through_result): Allow non-ip constant
input for NOP_EXPR pass-through functions.
* ipa-prop.c (ipa_compute_jump_functions_for_edge): Allow
aggregate global constant VAR_DECLs in constant jump functions.

testsuite/
* gcc.dg/ipa/iinline-cstagg-2.c: New test.
* gcc.dg/ipa/ipcp-cstagg-5.c: Likewise.
* gcc.dg/ipa/ipcp-cstagg-6.c: Likewise.
* gcc.dg/ipa/ipcp-cstagg-7.c: Likewise.

LGTM.
jeff



Re: [PATCH 1/3] Indirect inlining of targets from references of global constants

2016-05-16 Thread Jeff Law

On 05/12/2016 10:08 AM, Martin Jambor wrote:

Hi,

the patch below implements deducing aggregate contents from pointers
to constant variables for inlining and IPA-CP, which finally makes us
perform the optimization requested in
https://gcc.gnu.org/ml/gcc/2014-07/msg00240.html. It also lays down
the basis for doing optimization requested in PR 69708 but two
additional small patches are required for that.

This means we do not give up if we can't use AA to prove that
the memory in question has not been clobbered since invocation of the
function but only mark this fact in the indirect_call_info.  Later on
we still use this information if we know that the parameter in
question points to a constant variable.

If this is deemed a good approach, we will probably want to add a
similar bit to inlining conditions.

Bootstrapped, lto-bootstrapped and tested on x86_64-linux. OK for
trunk?

Thanks,

Martin


2016-05-11  Martin Jambor  

PR ipa/69708
* cgraph.h (cgraph_indirect_call_info): New field
guaranteed_unmodified.
* ipa-cp.c (ipa_get_indirect_edge_target_1): Also pass parameter value
to ipa_find_agg_cst_for_param, check guaranteed_unmodified when
appropriate.
* ipa-inline-analysis.c (evaluate_conditions_for_known_args): Also
pass the parameter value to ipa_find_agg_cst_for_param.
* ipa-prop.c (ipa_load_from_parm_agg): New parameter
guaranteed_unmodified, store AA results there instead of bailing out
if present.
(ipa_note_param_call): Also initialize guaranteed_unmodified flag.
(ipa_analyze_indirect_call_uses): Also set guaranteed_unmodified flag.
(find_constructor_constant_at_offset): New function.
(ipa_find_agg_cst_from_init): Likewise.
(ipa_find_agg_cst_for_param): Also seearch for aggregate values in
static initializers of contants, report back through a new paameter
from_global_constant if that was the case.
(try_make_edge_direct_simple_call): Also pass parameter value to
ipa_find_agg_cst_for_param, check guaranteed_unmodified when
appropriate.
(ipa_write_indirect_edge_info): Stream new flag guaranteed_unmodified.
(ipa_read_indirect_edge_info): Likewise.
* ipa-prop.h (ipa_find_agg_cst_for_param): Update declaration.
(ipa_load_from_parm_agg): Likewise.

testsuite/
* gcc.dg/ipa/iinline-cstagg-1.c: New test.
* gcc.dg/ipa/ipcp-cstagg-1.c: Likewise.
* gcc.dg/ipa/ipcp-cstagg-2.c: Likewise.
* gcc.dg/ipa/ipcp-cstagg-3.c: Likewise.
* gcc.dg/ipa/ipcp-cstagg-4.c: Likewise.
---

LGTM.
jeff




[C++ Patch] PR 70466 ("ICE on invalid code in tree check: expected constructor, have parm_decl in convert_like_real...")

2016-05-16 Thread Paolo Carlini

Hi,

in this ICE during error recovery, the check in convert_like_real:

if (CONSTRUCTOR_NELTS (expr) == 0
&& FUNCTION_FIRST_USER_PARMTYPE (convfn) != void_list_node)

is reached for a PARM_DECL as expr. I think that the correct way to 
avoid such problems in general is adding (here too, as elsewhere) a check 
that BRACE_ENCLOSED_INITIALIZER_P (expr) is true to the outer 
conditional, for sure because talking about "converting to %qT from 
initializer list would use explicit constructor %qD", which happens 
anyway in the above conditional, otherwise doesn't make sense. Tested 
x86_64-linux.


Thanks,
Paolo.

/
/cp
2016-05-16  Paolo Carlini  

PR c++/70466
* call.c (convert_like_real): Check that we are actually converting
from an init list.

/testsuite
2016-05-16  Paolo Carlini  

PR c++/70466
* g++.dg/template/crash122.C: New.
Index: cp/call.c
===
--- cp/call.c   (revision 236300)
+++ cp/call.c   (working copy)
@@ -6377,6 +6377,7 @@ convert_like_real (conversion *convs, tree expr, t
/* When converting from an init list we consider explicit
   constructors, but actually trying to call one is an error.  */
if (DECL_NONCONVERTING_P (convfn) && DECL_CONSTRUCTOR_P (convfn)
+   && BRACE_ENCLOSED_INITIALIZER_P (expr)
/* Unless this is for direct-list-initialization.  */
&& !DIRECT_LIST_INIT_P (expr)
/* And in C++98 a default constructor can't be explicit.  */
Index: testsuite/g++.dg/template/crash122.C
===
--- testsuite/g++.dg/template/crash122.C(revision 0)
+++ testsuite/g++.dg/template/crash122.C(working copy)
@@ -0,0 +1,27 @@
+// PR c++/70466
+
+template < class T, class T >  // { dg-error "conflicting" }
+class A
+{
+public:
+  explicit A (T (S::*f) ()) {}  // { dg-error "expected" }
+};
+
+template < class T, class S > 
+A < T, S > foo (T (S::*f) ())
+{
+  return A < T, S > (f);
+}
+
+class B
+{
+public:
+  void bar () {}
+};
+
+int
+main ()
+{
+  foo (&B::bar);
+  return 0;
+}


Re: [PATCH] Enable libgloss support for ARC in top-level configure.ac

2016-05-16 Thread Jeff Law

On 05/13/2016 06:35 AM, Anton Kolesov wrote:

2016-05-13  Anton Kolesov  

* configure.ac: Add ARC support to libgloss.
* configure: Regenerate
Thanks.  Installed on the trunk after moving a mis-placed ChangeLog 
entry from Wilco.


Jeff


Re: match.pd: ~X & Y to X ^ Y in some cases

2016-05-16 Thread Jeff Law

On 05/13/2016 01:07 PM, Marc Glisse wrote:

Hello,

maybe this would fit better in VRP, but it is easier (and not completely
useless) to put it in match.pd.

Since the transformation is restricted to GIMPLE, I think I don't need
to check that @0 is SSA_NAME. I didn't test if @0 has pointer type
before calling get_range_info because we are doing bit_not on it, but it
looks like I should because we can do bitops on pointers?

Adjustment for pr69270.c is exactly the same as in the previous patch
from today :-)

Bootstrap+regtest on powerpc64le-unknown-linux-gnu.


2016-05-16  Marc Glisse  

gcc/
* match.pd (~X & Y): New transformation.

gcc/testsuite/
* gcc.dg/tree-ssa/pr69270.c: Adjust.
* gcc.dg/tree-ssa/andnot-1.c: New testcase.



Please use if (GIMPLE
&& ((get_nonzero_bits ...)

Rather than #if GIMPLE

With that, OK.

jeff


Re: RFA: Generate normal DWARF DW_LOC descriptors for non integer mode pointers

2016-05-16 Thread Jeff Law

On 05/16/2016 08:12 AM, Nick Clifton wrote:

Hi Guys,

  Currently dwarf2out.c:mem_loc_descriptor() has some special case
  code to handle the situation where an address is held in a register
  whose mode is not of type MODE_INT.  It generates a
  DW_OP_GNU_regval_type expression which may later on be converted into
  a frame pointer based expression.  This is a problem for targets which
  use a partial integer mode for their pointers (eg the msp430).  In
  such cases the conversion to a frame pointer based expression could
  be wrong if the frame pointer is not being used.

  For example the GDB testfile gdb/testsuite/gdb.base/advance.c contains
  this code fragment:

int
main ()
{
  int result;
  int b, c;
  c = 5;
  b = 3;/* advance this location */

  func (c); /* stop here after leaving current frame */

  which compiles to these instructions:

suba#6, r1
mov #5, 4(r1)   
mov #3, 2(r1)   
mov 4(r1),  r12 
calla   #0  ;

  (Note that only r1 - the stack pointer - is used.  r4 - the frame
  pointer - is not).

  The debug information produced for the "c" local variable looks like
  this:

Abbrev Number: 3 (DW_TAG_variable)
 DW_AT_name: c
 DW_AT_decl_file   : 1
 DW_AT_decl_line   : 40
 DW_AT_type: <0x37>
 DW_AT_location: 5 byte block: f5 4 21 32 1c(DW_OP_GNU_regval_type: 4 
(r4) <0x21>; DW_OP_lit2; DW_OP_minus)

  ie it says that "c" is stored in memory location "r4 - 2", which is
  wrong since register r4 is not even used in this function.

  The patch below addresses this problem by allowing the normal,
  register based descriptor to be produced when the mode is Pmode.

  With this patch applied the unexpected failure count in the GDB
  testsuite for the MSP430's -mlarge multilib changes from 2253 to 367.
  There are no regressions, for MSP430 or x86_64, and no changes to
  the GCC testsuite results for either target.

  OK to apply ?

Cheers
  Nick

gcc/ChangeLog
2016-05-16  Nick Clifton  

* dwarf2out.c (mem_loc_descriptor): Convert REG based addresses
whose mode is Pmode into basereg descriptors even if Pmode is
not an integer mode.
I'm not real familiar with dwarf, so if one of other maintainers steps 
in and says this is OK, then ignore my comments/questions.


I may be missing something, but isn't it the transition to an FP 
relative address rather than a SP relative address that's the problem 
here?  Where does that happen?  Is it possible we've got the wrong 
DECL_RTL or somesuch?


Jeff


Avoid inlining into instrumetnation thunks

2016-05-16 Thread Jan Hubicka
Hi,
this patch fixes a chkp ICE when we try to inline into an instrumentation thunk.
This is not really a thunk and thus can't be handled as such.

Bootstrapped/regtested x86_64-linux

Honza

2016-05-16  Jan Hubicka  

* ipa-inline-analysis.c (compute_inline_parameters): Disable inlinig
into instrumentation thunks.
* cif-code.def (CIF_CHKP): New.

Index: ipa-inline-analysis.c
===
--- ipa-inline-analysis.c   (revision 236275)
+++ ipa-inline-analysis.c   (working copy)
@@ -2943,7 +2943,13 @@ compute_inline_parameters (struct cgraph
   info->self_size = info->size;
   info->self_time = info->time;
   /* We can not inline instrumetnation clones.  */
-  info->inlinable = !node->thunk.add_pointer_bounds_args;
+  if (node->thunk.add_pointer_bounds_args)
+   {
+  info->inlinable = false;
+  node->callees->inline_failed = CIF_CHKP;
+   }
+  else
+info->inlinable = true;
 }
   else
 {
Index: cif-code.def
===
--- cif-code.def(revision 236275)
+++ cif-code.def(working copy)
@@ -135,3 +135,7 @@ DEFCIFCODE(CILK_SPAWN, CIF_FINAL_ERROR,
 /* We proved that the call is unreachable.  */
 DEFCIFCODE(UNREACHABLE, CIF_FINAL_ERROR,
   N_("unreachable"))
+
+/* We can't inline because of instrumentation thunk.  */
+DEFCIFCODE(CHKP, CIF_FINAL_ERROR,
+  N_("caller is instrumetnation thunk"))


PING 2 [PATCH] integer overflow checking builtins in constant expressions

2016-05-16 Thread Martin Sebor

Ping 2 of the following patch:
  https://gcc.gnu.org/ml/gcc-patches/2016-05/msg00013.html

On 05/09/2016 10:38 AM, Martin Sebor wrote:

Pinging the following patch:
   https://gcc.gnu.org/ml/gcc-patches/2016-05/msg00013.html

On 05/01/2016 10:39 AM, Martin Sebor wrote:

c/68120 - can't easily deal with integer overflow at compile time,
is an enhancement request to make the integer overflow intrinsics
usable in constant expressions in C (in addition to letting them
be invoked with just two arguments).

The inability to use the built-ins in constant expressions also
limited to non-constexpr the contexts in which the patch for c++/
69517 - SEGV on a VLA with excess initializer elements, was able
to prevent the SEGV.  This limitation is noted in c++/70507 -
integer overflow builtins not constant expressions.

The attached patch implements the request in c/68120 for both
the C and C++ front-ends.  It stops short of providing the new
__builtin_add_wrapv function requested in c/68120.  It doesn't
seem necessary since the same functionality is available with
the patch via the existing built-ins.

With this enhancement in place it will be possible to add the
C++ VLA checking to constexpr functions and fully resolve c++/
69517 (which I plan to do next).

While testing the patch, I also noticed an minor inconsistency
in the text of the diagnostic GCC issues for invalid calls to
the built-ins with insufficient numbers of arguments:  for one
set of built-ins the error says: "not enough arguments," while
for another it says: "too few arguments."  I raised PR c/70883
- inconsistent error message for calls to __builtin_add_overflow
with too few arguments, for this and include a fix in this patch
as well.

Martin

PS The enhancement to call the type-generic built-ins with a null
pointer is not available in C++ 98 mode because GCC doesn't allow
null pointers in constant expressions.  Since C and later versions
of C++ do, it seems that it might be worthwhile to relax the rules
and accept them in C++ 98 as well so that the built-ins can be used
portably across all versions of C++.







Re: [PATCH 2/3] Const parameters are always unmodified

2016-05-16 Thread Jeff Law

On 05/12/2016 10:08 AM, Martin Jambor wrote:

Hi,

this patch simply makes parm_preserved_before_stmt_p consider all
const PARM_DECLs constant and does not invoke AA walking on them
(really the DECLs themselves, not the memory they might point to).

Bootstrapped and lto-bootstrapped and tested on x86_64-linux.  OK for
trunk?

Thanks,

Martin


2016-05-10  Martin Jambor  

* ipa-prop.c (parm_preserved_before_stmt_p): Return true for loads
from TREE_READONLY parameters.

Seems like a good idea irrespective of the other patches in this series.

OK for the trunk.  Is this related to 69708, if so you might want to add 
that marker to the ChangeLog entry.



jeff



Re: [PATCH vs] Take known zero bits into account when checking extraction.

2016-05-16 Thread Jeff Law

On 05/11/2016 02:52 AM, Dominik Vogt wrote:

On Wed, May 11, 2016 at 10:40:11AM +0200, Bernd Schmidt wrote:

On 05/11/2016 09:42 AM, Dominik Vogt wrote:

On Tue, May 10, 2016 at 05:05:06PM +0200, Bernd Schmidt wrote:

Earlier in the discussion you mentioned the intention to remove
these costs. Nothing else in the function does cost calculations -
maybe you can try placing a gcc_unreachable into the case where the
costs would prevent the transformation to see if it ever triggers.


You mean to try it out locally or as part of the patch?


I meant try it out locally. I'm almost certain the patch shouldn't
be trying to use costs here.


That's what I mentioned somewhere during the discussion.  The s390
backend just uses COSTS_N_INSNS(1) for AND as well as ZERO_EXTEND,
so this won't ever trigger.  I just left the rtx_cost call in the
patch for further discussion as Jeff said he liked the approach.
We don't need it to achieve the behaviour we want for s390.
I liked it, just based on the general theory that we should be comparing 
costs of a transform to the original much more often than we currently do.


If Bernd prefers it gone and you don't need it to achieve your goals, 
then I won't object to the costing stuff going away.


jeff



Re: [PATCH] Take known zero bits into account when checking extraction.

2016-05-16 Thread Jeff Law

On 04/29/2016 03:35 AM, Dominik Vogt wrote:

On Wed, Apr 27, 2016 at 10:24:21PM -0600, Jeff Law wrote:

Instead you want insn 12 to use a zero-extend to extend (reg:SI 64)
into (reg:DI 2)?


Yes, because we get the zero extend for free in this case (through
the constant in the AND or because the input value is a function
argument that is already zero extended).


Can't you achieve this in this clause:

 /* If the constant is one less than a power of two, this might be
 representable by an extraction even if no shift is present.
 If it doesn't end up being a ZERO_EXTEND, we will ignore it unless
 we are in a COMPARE.  */

You extract the constant via UINTVAL (XEXP (x, 1)), then munge it
based on nonzero_bits and pass the result to exact_log2?


That's what we tried first, but it resulted in worse code in many
places (saved about 250 instructions in the SPEC2006 testsuite but
added about 42000 elsewhere).  It was so bad that I didn't even
bother to check what's going on.  Probably this was triggered all
over the place by small constants like 1, 3, 7 and the like where
s390 has no cheap way for zero extension.  So I limited this to
constants that are actually mode masks, implicitly assuming that
there are zero extend instructions only for known modes (which is
true for s390 but may not for some other targets).  Being
conservative here shouldn't hurt; but I wonder whether there are
targets where this condition still allows too much.
You're probably right.  FWIW, I do believe a variety of targets can do 
these kind of zero extensions.  The PA for example has extru which can 
extract a field from a general register zero extend it, then place the 
result, right justified into another register.



We don't get them "for free", except as a component of a larger sequence 
of statements for bitfield extraction/manipulation.


I believe PPC has similar capabilities.
jeff


[PATCH, i386]: No singing in the compiler!

2016-05-16 Thread Uros Bizjak
2016-05-16  Uros Bizjak  

* config/i386/xopintrin.h: Correct "unsinged" typo in the comments.

Committed as obvious.

Uros.
Index: config/i386/xopintrin.h
===
--- config/i386/xopintrin.h (revision 236296)
+++ config/i386/xopintrin.h (working copy)
@@ -330,7 +330,7 @@
 }
 
 /* Compare and Predicate Generation
-   pcom (integer, unsinged bytes) */
+   pcom (integer, unsigned bytes) */
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_comlt_epu8(__m128i __A, __m128i __B)
@@ -380,7 +380,7 @@
   return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B);
 }
 
-/*pcom (integer, unsinged words) */
+/*pcom (integer, unsigned words) */
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_comlt_epu16(__m128i __A, __m128i __B)
@@ -430,7 +430,7 @@
   return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B);
 }
 
-/*pcom (integer, unsinged double words) */
+/*pcom (integer, unsigned double words) */
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_comlt_epu32(__m128i __A, __m128i __B)
@@ -480,7 +480,7 @@
   return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B);
 }
 
-/*pcom (integer, unsinged quad words) */
+/*pcom (integer, unsigned quad words) */
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_comlt_epu64(__m128i __A, __m128i __B)


Re: [PATCH] Respect --param ipa-max-agg-items=0

2016-05-16 Thread Jeff Law

On 05/11/2016 09:46 AM, Martin Jambor wrote:

Hi,

when analyzing PR 70646, I found out that --param ipa-max-agg-items=0
does not prevent creation of aggregate jump functions because it is
checked only after the first such jump function is created.  The
following patch fixes that by checking the parameter before starting
the whole analysis.

Bootstrapped and lto-bootstrapped on x86_64-linux.  OK for trunk?  OK
for all active release branches?

Yes and yes.



jeff



Re: [PATCH] [rtlfe] Barebones implementation of "__RTL"; next steps?

2016-05-16 Thread Jeff Law

On 05/12/2016 08:29 AM, David Malcolm wrote:


One wart I ran into is that system.h has this:

/* Front ends should never have to include middle-end headers.  Enforce
   this by poisoning the header double-include protection defines.  */
#ifdef IN_GCC_FRONTEND
#pragma GCC poison GCC_RTL_H GCC_EXCEPT_H GCC_EXPR_H
#endif

i.e. the idea of running RTL code from inside the C frontend seems to
be banned.
Yea, we really don't want the front-ends to know about the guts of RTL. 
This work would seem to violate that guiding principle.


I'd be more in favor of a true RTL front-end rather than bolting it onto 
the side of the C front-end.


jeff



Re: VRP: range info of new variables

2016-05-16 Thread Jeff Law

On 05/13/2016 12:50 PM, Marc Glisse wrote:

Hello,

when VRP does some transforms, it may create new SSA_NAMEs, but doesn't
give them range information. This can prevent cascading transformations
in a single VRP pass. With this patch, I assign range information to the
variable introduced by one transformation, and in another
transformation, I get range information through get_range_info instead
of get_value_range in order to have access to the new information.

Some notes:
- get_range_info only applies to integers, not pointers. I hope we are
not losing much by restricting this transformation. I could also call
get_value_range and only fall back to get_range_info if that failed (and
we don't have a pointer), but it doesn't seem worth it.

It probably isn't worth it.


- Now that I think of it, maybe I should check that the variable is not
a pointer before calling set_range_info? Having range [0, 1] makes it
unlikely, but who knows...

Maybe using an assert would be better.



Index: gcc/tree-vrp.c
===
--- gcc/tree-vrp.c  (revision 236194)
+++ gcc/tree-vrp.c  (working copy)
@@ -8933,20 +8933,24 @@ simplify_truth_ops_using_ranges (gimple_
 gimple_assign_set_rhs_with_ops (gsi,
need_conversion
? NOP_EXPR : TREE_CODE (op0), op0);
   /* For A != B we substitute A ^ B.  Either with conversion.  */
   else if (need_conversion)
 {
   tree tem = make_ssa_name (TREE_TYPE (op0));
   gassign *newop
= gimple_build_assign (tem, BIT_XOR_EXPR, op0, op1);
   gsi_insert_before (gsi, newop, GSI_SAME_STMT);
+  if (TYPE_PRECISION (TREE_TYPE (tem)) > 1)
+   set_range_info (tem, VR_RANGE,
+   wi::zero (TYPE_PRECISION (TREE_TYPE (tem))),
+   wi::one (TYPE_PRECISION (TREE_TYPE (tem;
Is there actually a case where TYPE_PRECISION (TREE_TYPE (tem)) > 1 is 
ever false?  Would an assert make more sense here?





 /* Simplify an integral conversion from an SSA name in STMT.  */

 static bool
 simplify_conversion_using_ranges (gimple *stmt)
Your ChangeLog mentions simplify_switch_using_ranges, not 
simplify_conversion_using_ranges.


This is OK for the trunk -- your call on asserting the variable is not a 
pointer before calling set_range_info.  Similarly on the check that the 
TYPE_PRECISION (TREE_TYPE (tem)) > 1.


Jeff


Re: Fix for PR68159 in Libiberty Demangler (6)

2016-05-16 Thread Jeff Law

On 05/16/2016 12:19 PM, Jakub Jelinek wrote:

On Mon, May 16, 2016 at 12:12:38PM -0600, Jeff Law wrote:

On 05/06/2016 09:19 AM, Jakub Jelinek wrote:

On Fri, May 06, 2016 at 11:11:29PM +0800, Marcel Böhme wrote:

+  dpi.copy_templates
+= (struct d_print_template *) malloc (((size_t) dpi.num_copy_templates)
+ * sizeof (*dpi.copy_templates));
+  if (! dpi.copy_templates)
+{
+  d_print_error (&dpi);
+  return 0;
+}


Another thing to consider is if the common values of dpi.num_*
and similarly in the other block are small enough, it might be desirable
to just use an automatic fixed size array (or even alloca) and only
fall back to malloc if it is too large.

Please, no, don't fall back to alloca like this.  That coding idiom has been
the source of numerous security exploits in glibc.  Experience shows us that
we are not capable of doing that correctly on a consistent basis.


Falling back to fixed size buffer is something we use heavily in gcc, and
are able to get it right, there is nothing hard in it.
Conceptually I agree, it ought not be that hard, in practice, it's been 
an absolute disaster in glibc.


I've often wondered if the right model is to to use escape analysis 
along with the size of the object, loop analysis, etc and let the 
compiler figure this stuff out rather than leaving it to humans.





For the cases where we can't use malloc at all and we'd need too much memory
that it won't fit into the static buffer, I think all we can do is fall back
into increasing the time complexity in the demangler by processing the
string multiple times.

Probably true.

jeff


Re: Fix for PR68159 in Libiberty Demangler (6)

2016-05-16 Thread Jakub Jelinek
On Mon, May 16, 2016 at 12:12:38PM -0600, Jeff Law wrote:
> On 05/06/2016 09:19 AM, Jakub Jelinek wrote:
> >On Fri, May 06, 2016 at 11:11:29PM +0800, Marcel Böhme wrote:
> >>+  dpi.copy_templates
> >>+= (struct d_print_template *) malloc (((size_t) dpi.num_copy_templates)
> >>+ * sizeof (*dpi.copy_templates));
> >>+  if (! dpi.copy_templates)
> >>+{
> >>+  d_print_error (&dpi);
> >>+  return 0;
> >>+}
> >
> >Another thing to consider is if the common values of dpi.num_*
> >and similarly in the other block are small enough, it might be desirable
> >to just use an automatic fixed size array (or even alloca) and only
> >fall back to malloc if it is too large.
> Please, no, don't fall back to alloca like this.  That coding idiom has been
> the source of numerous security exploits in glibc.  Experience shows us that
> we are not capable of doing that correctly on a consistent basis.

Falling back to fixed size buffer is something we use heavily in gcc, and
are able to get it right, there is nothing hard in it.

For the cases where we can't use malloc at all and we'd need too much memory
that it won't fit into the static buffer, I think all we can do is fall back
into increasing the time complexity in the demangler by processing the
string multiple times.

Jakub


Re: Fix for PR68159 in Libiberty Demangler (6)

2016-05-16 Thread Jeff Law

On 05/06/2016 09:19 AM, Jakub Jelinek wrote:

On Fri, May 06, 2016 at 11:11:29PM +0800, Marcel Böhme wrote:

+  dpi.copy_templates
+= (struct d_print_template *) malloc (((size_t) dpi.num_copy_templates)
+ * sizeof (*dpi.copy_templates));
+  if (! dpi.copy_templates)
+{
+  d_print_error (&dpi);
+  return 0;
+}


Another thing to consider is if the common values of dpi.num_*
and similarly in the other block are small enough, it might be desirable
to just use an automatic fixed size array (or even alloca) and only
fall back to malloc if it is too large.
Please, no, don't fall back to alloca like this.  That coding idiom has 
been the source of numerous security exploits in glibc.  Experience 
shows us that we are not capable of doing that correctly on a consistent 
basis.


Jeff



Re: [PATCH] Add PowerPC ISA 3.0 word splat and byte immediate splat support

2016-05-16 Thread Michael Meissner
On Fri, May 13, 2016 at 08:23:16PM -0500, Segher Boessenkool wrote:
> On Fri, May 13, 2016 at 07:25:43PM -0400, Michael Meissner wrote:
> > This patch adds support for the 32-bit word splat instructions, the byte
> > immediate splat instructions, and the vector sign extend instructions to GCC
> > 7.0.
> > 
> > In addition to the various splat instructions, since I was modifying the 
> > vector
> > moves, I took the opportunity to reorganize the vector move instructions 
> > with
> > several changes I've wanted to do:
> 
> It is much easier to review, and for regression searches later, if one
> patch does one thing.  No need to change this patch, but please keep
> it in mind for later patches.

Yes and no.  Sometimes, you need to make larger changes.

> > Are these patches ok to apply to the GCC 7.0 trunk?
> 
> Changelog is missing.

Sorry about that.  I was so focused on including the patches this time that I
forgot to include the ChangeLog entries.

Unfortunately, I'm finding some regressions that seem to show up with more
recent trunk changes, particularly with reload rather than LRA.  I'm going to have to
spend some time debugging why reload is failing on big endian systems, and
resubmit the patches when it is debugged.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797



[hsa] Increase hsa symbol alignment to a natural one

2016-05-16 Thread Martin Jambor
Hi,

in the last round of alignment fixes, we have forgotten to make sure that
all symbols are at least naturally aligned, which is a hard HSAIL
requirement.  This caused problems when emitting a symbol for a
private complex number, as the natural alignment as defined by HSAIL
is twice the one of the component, which was selected by gcc.

The following patch addresses this in two ways.  First, it simply
increases the alignment of symbols that are only accessible from
within HSAIL.  If however a symbol that is shared in between host and
an HSA accelerator is under-aligned (in my experience it only happens
if the user uses the aligned attribute), we have no option but to
abort HSAIL generation because even if we did generate it, it would
not finalize.

Bootstrapped and tested on x86_64-linux with hsa enabled.  I will
commit it to trunk and the gcc-6 branch shortly.

Martin


2016-05-16  Martin Jambor  

* hsa-gen.c (fillup_for_decl): Increase alignment to natural one.
(get_symbol_for_decl): Sorry if a global symbol in under-aligned.

libgomp/
* testsuite/libgomp.hsa.c/complex-align-2.c: New test.
---
 gcc/hsa-gen.c | 19 
 libgomp/testsuite/libgomp.hsa.c/complex-align-2.c | 27 +++
 2 files changed, 42 insertions(+), 4 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.hsa.c/complex-align-2.c

diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c
index 5baf607..697d599 100644
--- a/gcc/hsa-gen.c
+++ b/gcc/hsa-gen.c
@@ -203,9 +203,13 @@ hsa_symbol::fillup_for_decl (tree decl)
 {
   m_decl = decl;
   m_type = hsa_type_for_tree_type (TREE_TYPE (decl), &m_dim, false);
-
   if (hsa_seen_error ())
-m_seen_error = true;
+{
+  m_seen_error = true;
+  return;
+}
+
+  m_align = MAX (m_align, hsa_natural_alignment (m_type));
 }
 
 /* Constructor of class representing global HSA function/kernel information and
@@ -929,6 +933,14 @@ get_symbol_for_decl (tree decl)
BRIG_LINKAGE_PROGRAM, true,
BRIG_ALLOCATION_PROGRAM, align);
  hsa_cfun->m_global_symbols.safe_push (sym);
+ sym->fillup_for_decl (decl);
+ if (sym->m_align > align)
+   {
+ sym->m_seen_error = true;
+ HSA_SORRY_ATV (EXPR_LOCATION (decl),
+"HSA specification requires that %E is at least "
+"naturally aligned", decl);
+   }
}
   else
{
@@ -944,12 +956,11 @@ get_symbol_for_decl (tree decl)
  sym = new hsa_symbol (BRIG_TYPE_NONE, BRIG_SEGMENT_PRIVATE,
BRIG_LINKAGE_FUNCTION);
  sym->m_align = align;
+ sym->fillup_for_decl (decl);
  hsa_cfun->m_private_variables.safe_push (sym);
}
 
-  sym->fillup_for_decl (decl);
   sym->m_name = hsa_get_declaration_name (decl);
-
   *slot = sym;
   return sym;
 }
diff --git a/libgomp/testsuite/libgomp.hsa.c/complex-align-2.c 
b/libgomp/testsuite/libgomp.hsa.c/complex-align-2.c
new file mode 100644
index 000..b2d7acf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.hsa.c/complex-align-2.c
@@ -0,0 +1,27 @@
+#pragma omp declare target
+_Complex int *g;
+#pragma omp end declare target
+
+
+
+_Complex float f(void);
+
+int
+main ()
+{
+  _Complex int y;
+#pragma omp target map(from:y)
+  {
+_Complex int x;
+g = &x;
+__imag__ x = 1;
+__real__ x = 2;
+y = x;
+  }
+
+  if ((__imag__ y != 1)
+  || (__real__ y != 2))
+__builtin_abort ();
+  return 0;
+}
+
-- 
2.8.2



[committed] Fix some typos in gimple.c

2016-05-16 Thread Marek Polacek
While debugging PR71146 I noticed these.  These typos are perfidious because
they make grepping for '__builtin_unreachable' harder.

Applying to trunk as obvious.

2016-05-16  Marek Polacek  

* gimple.c (maybe_remove_unused_call_args): Fix typos in the
commentary.

diff --git gcc/gimple.c gcc/gimple.c
index 1a22e82..d822fab 100644
--- gcc/gimple.c
+++ gcc/gimple.c
@@ -3002,7 +3002,7 @@ gimple_seq_discard (gimple_seq seq)
 
 /* See if STMT now calls function that takes no parameters and if so, drop
call arguments.  This is used when devirtualization machinery redirects
-   to __builtiln_unreacahble or __cxa_pure_virutal.  */
+   to __builtin_unreachable or __cxa_pure_virtual.  */
 
 void
 maybe_remove_unused_call_args (struct function *fn, gimple *stmt)

Marek


[PATCH 4/4] BRIG (HSAIL) frontend: smoke test suite

2016-05-16 Thread Pekka Jääskeläinen

A smoke test suite. The patch has been tested more thoroughly with the
proprietary HSA PRM conformance suite.

Requires the HSAILasm tool to first compile the .hsail to .brig.

--
Pekka Jääskeläinen
Parmance
A smoke test suite. The patch has been tested more thoroughly with the
proprietary HSA PRM conformance suite.

Requires the HSAILasm tool to first compile the .hsail to .brig.
diff --git a/gcc/testsuite/brig.dg/README b/gcc/testsuite/brig.dg/README
new file mode 100644
index 000..cc313c4
--- /dev/null
+++ b/gcc/testsuite/brig.dg/README
@@ -0,0 +1,10 @@
+BRIG (HSAIL) frontend test cases
+
+
+The suite consists of "smoke tests" that test several features of
+the compilation and regression tests, but is not an exhaustive test
+suite for all HSAIL instructions. The HSA PRM conformance suite
+is supposed to be used for that.
+
+HSAILasm is required for converting the text HSAIL files to BRIGs
+which the compiler consumes.
diff --git a/gcc/testsuite/brig.dg/dg.exp b/gcc/testsuite/brig.dg/dg.exp
new file mode 100644
index 000..fd75cae
--- /dev/null
+++ b/gcc/testsuite/brig.dg/dg.exp
@@ -0,0 +1,27 @@
+#   Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# .
+
+# GCC testsuite that uses the `dg.exp' driver.
+
+load_lib brig-dg.exp
+
+# Initialize `dg'.
+dg-init
+
+dg-runtest [find $srcdir/$subdir *.hsail] "" ""
+
+# All done.
+dg-finish
diff --git a/gcc/testsuite/brig.dg/test/gimple/alloca.hsail b/gcc/testsuite/brig.dg/test/gimple/alloca.hsail
new file mode 100644
index 000..73c2f93
--- /dev/null
+++ b/gcc/testsuite/brig.dg/test/gimple/alloca.hsail
@@ -0,0 +1,37 @@
+module &module:1:0:$full:$large:$default;
+
+/* Tests for alloca. */
+
+/* { dg-do compile } */
+/* { dg-options "-fdump-tree-gimple" } */
+
+prog function &subfunction(arg_u32 %return_value)() {
+ alloca_align(1)_u32 $s2, 256;
+ st_arg_u32 $s2, [%return_value];
+ ret;
+};
+
+prog kernel &kernel(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
+{
+ld_kernarg_u64 $d0, [%input_ptr];
+ld_global_u32 $s0, [$d0];
+
+	alloca_align(256)_u32 $s1, 16;
+	{
+		arg_u32 %return_value;
+		call &subfunction(%return_value)();
+		ld_arg_u32 $s1, [%return_value];
+	}
+ld_kernarg_u64 $d1, [%output_ptr];
+st_global_u32 $s1, [$d0];
+};
+
+/* { dg-final { scan-tree-dump "s2 = __phsa_builtin_alloca \\\(256, 1, __context\\\);" "gimple" } } */
+
+/* { dg-final { scan-tree-dump "s1 = __phsa_builtin_alloca \\\(16, 256, __context\\\);" "gimple" } } */
+
+
+/* Both functions should have an alloca frame push and pop. */
+/* { dg-final { scan-tree-dump-times "__phsa_builtin_alloca_push_frame \\\(__context\\\);" 2 "gimple" } } */
+
+/* { dg-final { scan-tree-dump-times "__phsa_builtin_alloca_pop_frame \\\(__context\\\);" 2 "gimple" } } */
diff --git a/gcc/testsuite/brig.dg/test/gimple/atomics.hsail b/gcc/testsuite/brig.dg/test/gimple/atomics.hsail
new file mode 100644
index 000..a0b2f85
--- /dev/null
+++ b/gcc/testsuite/brig.dg/test/gimple/atomics.hsail
@@ -0,0 +1,33 @@
+module &module:1:0:$full:$large:$default;
+
+/* Test for atomic instructions. */
+
+/* { dg-do compile } */
+/* { dg-options "-fdump-tree-original" } */
+
+prog kernel &Kernel(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
+{
+ld_kernarg_u64 $d0, [%input_ptr];
+
+	atomic_ld_global_rlx_system_b32 $s0, [$d0];
+	atomic_add_global_rlx_system_u32 $s1, [$d0 + 4], $s0;
+
+ld_kernarg_u64 $d0, [%output_ptr];
+atomicnoret_st_global_rlx_system_b32 [$d0], $s2;
+
+	atomicnoret_min_global_rlx_system_u32 [$d0 + 4], $s1;
+
+ret;
+};
+
+/* The atomic loads are implemented by casting to an atomic pointer. */
+/* { dg-final { scan-tree-dump "s0 = VIEW_CONVERT_EXPR\\\(\\\*\\\(atomic unsigned int \\\*\\\)" "original"} } */
+
+/* The atomic add should call a gcc builtin. */
+/* { dg-final { scan-tree-dump "= __sync_fetch_and_add_4 \\\(" "original"} } */
+
+/* The atomic stores are implemented by casting to an atomic pointer. */
+/* { dg-final { scan-tree-dump "\\\*\\\(atomic unsigned int \\\*\\\) VIEW_CONVERT_EXPR\\\(VIEW_CONVERT_EXPR\\\(d0\\\)\\\) = s2;" "original"} } */
+
+/* The atomic min is implemented by a custom builtin. */
+/* { dg-final { scan-tree-dump "builtin_out.\[0-9\]+ = __phsa_builtin_atomic_min_u32 \\\(" "original"} } */
diff --git a/gcc/te

[PATCH 2/4] BRIG (HSAIL) frontend: The FE itself.

2016-05-16 Thread Pekka Jääskeläinen

The BRIG frontend itself.

--
Pekka Jääskeläinen
Parmance


002-brig-fe-new-files.patch.gz
Description: application/gzip


[PATCH 1/4] BRIG (HSAIL) frontend: configuration file changes and misc

2016-05-16 Thread Pekka Jääskeläinen

The configuration file changes and misc. updates required
by the BRIG frontend.

Also, added include/hsa-interface.h which is hsa.h taken from libgomp
and will be shared by it (agreed with Martin Liška / SUSE).

--
Pekka Jääskeläinen
Parmance
The configuration file changes and misc. updates required
by the BRIG frontend.

Also, added include/hsa-interface.h which is hsa.h taken from libgomp
and will be shared by it (agreed with Martin Liška / SUSE).

diff --git a/Makefile.def b/Makefile.def
index ec5f31e..2c1668b 100644
--- a/Makefile.def
+++ b/Makefile.def
@@ -157,6 +157,7 @@ target_modules = { module= libquadmath; };
 target_modules = { module= libgfortran; };
 target_modules = { module= libobjc; };
 target_modules = { module= libgo; };
+target_modules = { module= libhsail-rt; };
 target_modules = { module= libtermcap; no_check=true;
missing=mostlyclean;
missing=clean;
@@ -619,6 +620,8 @@ languages = { language=objc;	gcc-check-target=check-objc;
 languages = { language=obj-c++;	gcc-check-target=check-obj-c++; };
 languages = { language=go;	gcc-check-target=check-go;
 lib-check-target=check-target-libgo; };
+languages = { language=brig;	gcc-check-target=check-brig;
+lib-check-target=check-target-libhsail-rt; };
 
 // Toplevel bootstrap
 bootstrap_stage = { id=1 ; };
diff --git a/Makefile.in b/Makefile.in
index f778d03..fcac74c 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -966,6 +966,7 @@ configure-target:  \
 maybe-configure-target-libgfortran \
 maybe-configure-target-libobjc \
 maybe-configure-target-libgo \
+maybe-configure-target-libhsail-rt \
 maybe-configure-target-libtermcap \
 maybe-configure-target-winsup \
 maybe-configure-target-libgloss \
@@ -1133,6 +1134,7 @@ all-target: maybe-all-target-libquadmath
 all-target: maybe-all-target-libgfortran
 all-target: maybe-all-target-libobjc
 all-target: maybe-all-target-libgo
+all-target: maybe-all-target-libhsail-rt
 all-target: maybe-all-target-libtermcap
 all-target: maybe-all-target-winsup
 all-target: maybe-all-target-libgloss
@@ -1227,6 +1229,7 @@ info-target: maybe-info-target-libquadmath
 info-target: maybe-info-target-libgfortran
 info-target: maybe-info-target-libobjc
 info-target: maybe-info-target-libgo
+info-target: maybe-info-target-libhsail-rt
 info-target: maybe-info-target-libtermcap
 info-target: maybe-info-target-winsup
 info-target: maybe-info-target-libgloss
@@ -1314,6 +1317,7 @@ dvi-target: maybe-dvi-target-libquadmath
 dvi-target: maybe-dvi-target-libgfortran
 dvi-target: maybe-dvi-target-libobjc
 dvi-target: maybe-dvi-target-libgo
+dvi-target: maybe-dvi-target-libhsail-rt
 dvi-target: maybe-dvi-target-libtermcap
 dvi-target: maybe-dvi-target-winsup
 dvi-target: maybe-dvi-target-libgloss
@@ -1401,6 +1405,7 @@ pdf-target: maybe-pdf-target-libquadmath
 pdf-target: maybe-pdf-target-libgfortran
 pdf-target: maybe-pdf-target-libobjc
 pdf-target: maybe-pdf-target-libgo
+pdf-target: maybe-pdf-target-libhsail-rt
 pdf-target: maybe-pdf-target-libtermcap
 pdf-target: maybe-pdf-target-winsup
 pdf-target: maybe-pdf-target-libgloss
@@ -1488,6 +1493,7 @@ html-target: maybe-html-target-libquadmath
 html-target: maybe-html-target-libgfortran
 html-target: maybe-html-target-libobjc
 html-target: maybe-html-target-libgo
+html-target: maybe-html-target-libhsail-rt
 html-target: maybe-html-target-libtermcap
 html-target: maybe-html-target-winsup
 html-target: maybe-html-target-libgloss
@@ -1575,6 +1581,7 @@ TAGS-target: maybe-TAGS-target-libquadmath
 TAGS-target: maybe-TAGS-target-libgfortran
 TAGS-target: maybe-TAGS-target-libobjc
 TAGS-target: maybe-TAGS-target-libgo
+TAGS-target: maybe-TAGS-target-libhsail-rt
 TAGS-target: maybe-TAGS-target-libtermcap
 TAGS-target: maybe-TAGS-target-winsup
 TAGS-target: maybe-TAGS-target-libgloss
@@ -1662,6 +1669,7 @@ install-info-target: maybe-install-info-target-libquadmath
 install-info-target: maybe-install-info-target-libgfortran
 install-info-target: maybe-install-info-target-libobjc
 install-info-target: maybe-install-info-target-libgo
+install-info-target: maybe-install-info-target-libhsail-rt
 install-info-target: maybe-install-info-target-libtermcap
 install-info-target: maybe-install-info-target-winsup
 install-info-target: maybe-install-info-target-libgloss
@@ -1749,6 +1757,7 @@ install-pdf-target: maybe-install-pdf-target-libquadmath
 install-pdf-target: maybe-install-pdf-target-libgfortran
 install-pdf-target: maybe-install-pdf-target-libobjc
 install-pdf-target: maybe-install-pdf-target-libgo
+install-pdf-target: maybe-install-pdf-target-libhsail-rt
 install-pdf-target: maybe-install-pdf-target-libtermcap
 install-pdf-target: maybe-install-pdf-target-winsup
 install-pdf-target: maybe-install-pdf-target-libgloss
@@ -1836,6 +1845,7 @@ install-html-target: maybe-install-html-target-libquadmath
 install-html-target: maybe-install-html-target-libgfortran
 install-html-target: maybe-install-html-target-libobjc
 install-html-targe

[PATCH 0/4] BRIG (HSAIL) frontend

2016-05-16 Thread Pekka Jääskeläinen
This patch set adds a BRIG (HSAIL) frontend. It can be used as a core
for an HSAIL finalizer implementation for processors with gcc backends.

It is a bit of an unusual frontend, as the consumed format is a binary
representation.  The textual HSAIL can be compiled to it with a 
separate assembler.

The frontend has been mostly tested with the HSA PRM conformance suite which
it now passes. The accompanied GENERIC-scanning test suite is supposed to be
only a smoke test. 

libhsail-rt implements HSAIL specific builtins and includes a simple runtime
that implements SPMD execution via either Pth-based fibers or loops to 
execute multiple work-item work groups without SPMD/SIMD-default hardware.

I've split it to 4 patches:

001 - the configuration file changes and misc.
002 - the frontend itself
003 - libhsail-rt
004 - the smoke test suite

The diffstat is as follows:

 Makefile.def  | 3 +
 Makefile.in   |   489 +
 configure | 1 +
 configure.ac  | 1 +
 gcc/brig/Make-lang.in |   246 +
 gcc/brig/brig-c.h |68 +
 gcc/brig/brig-lang.c  |   461 +
 gcc/brig/brigfrontend/brig-arg-block-handler.cc   |67 +
 gcc/brig/brigfrontend/brig-atomic-inst-handler.cc |   377 +
 gcc/brig/brigfrontend/brig-basic-inst-handler.cc  |   732 +
 gcc/brig/brigfrontend/brig-branch-inst-handler.cc |   217 +
 gcc/brig/brigfrontend/brig-cmp-inst-handler.cc|   212 +
 gcc/brig/brigfrontend/brig-code-entry-handler.cc  |  2319 +++
 gcc/brig/brigfrontend/brig-code-entry-handler.h   |   449 +
 gcc/brig/brigfrontend/brig-comment-handler.cc |39 +
 gcc/brig/brigfrontend/brig-control-handler.cc |29 +
 .../brigfrontend/brig-copy-move-inst-handler.cc   |56 +
 gcc/brig/brigfrontend/brig-cvt-inst-handler.cc|   249 +
 gcc/brig/brigfrontend/brig-fbarrier-handler.cc|44 +
 gcc/brig/brigfrontend/brig-function-handler.cc|   373 +
 gcc/brig/brigfrontend/brig-function.cc|   698 +
 gcc/brig/brigfrontend/brig-function.h |   216 +
 gcc/brig/brigfrontend/brig-inst-mod-handler.cc|   168 +
 gcc/brig/brigfrontend/brig-label-handler.cc   |37 +
 gcc/brig/brigfrontend/brig-lane-inst-handler.cc   |82 +
 gcc/brig/brigfrontend/brig-machine.c  |37 +
 gcc/brig/brigfrontend/brig-machine.h  |35 +
 gcc/brig/brigfrontend/brig-mem-inst-handler.cc|   180 +
 gcc/brig/brigfrontend/brig-module-handler.cc  |30 +
 gcc/brig/brigfrontend/brig-queue-inst-handler.cc  |92 +
 gcc/brig/brigfrontend/brig-seg-inst-handler.cc|   133 +
 gcc/brig/brigfrontend/brig-signal-inst-handler.cc |42 +
 gcc/brig/brigfrontend/brig-util.cc|   347 +
 gcc/brig/brigfrontend/brig-util.h |49 +
 gcc/brig/brigfrontend/brig-variable-handler.cc|   255 +
 gcc/brig/brigfrontend/brig_to_generic.cc  |   773 +
 gcc/brig/brigfrontend/brig_to_generic.h   |   245 +
 gcc/brig/brigfrontend/phsa.h  |40 +
 gcc/brig/brigspec.c   |   193 +
 gcc/brig/config-lang.in   |41 +
 gcc/brig/lang-specs.h |28 +
 gcc/brig/lang.opt |41 +
 gcc/testsuite/brig.dg/README  |10 +
 gcc/testsuite/brig.dg/dg.exp  |27 +
 gcc/testsuite/brig.dg/test/gimple/alloca.hsail|37 +
 gcc/testsuite/brig.dg/test/gimple/atomics.hsail   |33 +
 gcc/testsuite/brig.dg/test/gimple/branches.hsail  |58 +
 gcc/testsuite/brig.dg/test/gimple/fbarrier.hsail  |74 +
 .../brig.dg/test/gimple/function_calls.hsail  |59 +
 gcc/testsuite/brig.dg/test/gimple/mem.hsail   |39 +
 gcc/testsuite/brig.dg/test/gimple/mulhi.hsail |33 +
 gcc/testsuite/brig.dg/test/gimple/packed.hsail|78 +
 .../brig.dg/test/gimple/smoke_test.hsail  |91 +
 gcc/testsuite/brig.dg/test/gimple/variables.hsail |   124 +
 gcc/testsuite/brig.dg/test/gimple/vector.hsail|57 +
 gcc/testsuite/lib/brig-dg.exp |29 +
 gcc/testsuite/lib/brig.exp|40 +
 include/hsa-interface.h   |   630 +
 libhsail-rt/Makefile.am   |   123 +
 libhsail-rt/Makefile.in   |   721 +
 libhsail-rt/README| 4 +
 libhsail-rt/aclocal.m4|   979 +
 libhsail-rt/config.h.in   |   217 +
 libhsail-rt/configure | 17162 ++
 libhsail-rt/configure.ac  |   150 +
 libhsail-rt/include/internal/phsa-rt.h|97 +
 .../include/internal/phsa_queue_interface.h   |60 +
 libhsail-rt/include/in

[gomp4.5] Some OpenMP 4.5 resolving and translation changes

2016-05-16 Thread Jakub Jelinek
Hi!

This patch tweaks various spots, including being able to compile taskloop
construct.  I'll be adding more testcases and tweaking the code further
later on.

2016-05-16  Jakub Jelinek  

* trans.c (trans_code): Handle new OpenMP 4.5 constructs.
* resolve.c (gfc_resolve_blocks): Likewise.
(gfc_resolve_code): Likewise.
* trans-openmp.c (gfc_trans_omp_clauses): Handle new OpenMP 4.5
clauses and new clause modifiers.
(gfc_trans_omp_do): Handle EXEC_OMP_TASKLOOP.
(GFC_OMP_SPLIT_TASKLOOP, GFC_OMP_MASK_TASKLOOP): New enum constants.
(gfc_split_omp_clauses): Handle EXEC_OMP_TARGET_PARALLEL{,_DO,_DO_SIMD}
and EXEC_OMP_TASKLOOP{,_SIMD}.  Add handling for new OpenMP 4.5
clauses and clause modifiers and handle if clause without/with
modifiers.
(gfc_trans_omp_target): Handle EXEC_OMP_TARGET_PARALLEL{,_DO,_DO_SIMD}
and EXEC_OMP_TARGET_SIMD.
(gfc_trans_omp_taskloop): New function.
(gfc_trans_omp_directive): Handle EXEC_OMP_TASKLOOP{,_SIMD},
EXEC_OMP_TARGET_PARALLEL{,_DO,_DO_SIMD} and EXEC_OMP_TARGET_SIMD.
* openmp.c (resolve_oacc_scalar_int_expr): Renamed to ...
(resolve_scalar_int_expr): ... this.  Fix up formatting.
(resolve_oacc_positive_int_expr): Renamed to ...
(resolve_positive_int_expr): ... this.  Fix up formatting.
(resolve_nonnegative_int_expr): New function.
(resolve_omp_clauses): Adjust callers, use the above functions
even for OpenMP clauses, add handling of new OpenMP 4.5 clauses.
(gfc_resolve_omp_parallel_blocks): Handle new OpenMP 4.5 constructs,
replace underscores with spaces in a few construct names.
(resolve_omp_do): Handle new OpenMP 4.5 constructs.
(resolve_oacc_loop_blocks): Call resolve_positive_int_expr instead
of resolve_oacc_positive_int_expr.
(gfc_resolve_omp_directive): Handle new OpenMP 4.5 constructs.

* testsuite/libgomp.fortran/taskloop-1.f90: New test.

--- gcc/fortran/trans.c.jj  2016-05-04 18:37:30.0 +0200
+++ gcc/fortran/trans.c 2016-05-16 15:48:07.489838631 +0200
@@ -1916,6 +1916,12 @@ trans_code (gfc_code * code, tree cond)
case EXEC_OMP_SINGLE:
case EXEC_OMP_TARGET:
case EXEC_OMP_TARGET_DATA:
+   case EXEC_OMP_TARGET_ENTER_DATA:
+   case EXEC_OMP_TARGET_EXIT_DATA:
+   case EXEC_OMP_TARGET_PARALLEL:
+   case EXEC_OMP_TARGET_PARALLEL_DO:
+   case EXEC_OMP_TARGET_PARALLEL_DO_SIMD:
+   case EXEC_OMP_TARGET_SIMD:
case EXEC_OMP_TARGET_TEAMS:
case EXEC_OMP_TARGET_TEAMS_DISTRIBUTE:
case EXEC_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_DO:
@@ -1924,6 +1930,8 @@ trans_code (gfc_code * code, tree cond)
case EXEC_OMP_TARGET_UPDATE:
case EXEC_OMP_TASK:
case EXEC_OMP_TASKGROUP:
+   case EXEC_OMP_TASKLOOP:
+   case EXEC_OMP_TASKLOOP_SIMD:
case EXEC_OMP_TASKWAIT:
case EXEC_OMP_TASKYIELD:
case EXEC_OMP_TEAMS:
--- gcc/fortran/resolve.c.jj2016-05-04 18:37:32.0 +0200
+++ gcc/fortran/resolve.c   2016-05-16 15:35:33.220026681 +0200
@@ -9459,6 +9459,12 @@ gfc_resolve_blocks (gfc_code *b, gfc_nam
case EXEC_OMP_SINGLE:
case EXEC_OMP_TARGET:
case EXEC_OMP_TARGET_DATA:
+   case EXEC_OMP_TARGET_ENTER_DATA:
+   case EXEC_OMP_TARGET_EXIT_DATA:
+   case EXEC_OMP_TARGET_PARALLEL:
+   case EXEC_OMP_TARGET_PARALLEL_DO:
+   case EXEC_OMP_TARGET_PARALLEL_DO_SIMD:
+   case EXEC_OMP_TARGET_SIMD:
case EXEC_OMP_TARGET_TEAMS:
case EXEC_OMP_TARGET_TEAMS_DISTRIBUTE:
case EXEC_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_DO:
@@ -9467,6 +9473,8 @@ gfc_resolve_blocks (gfc_code *b, gfc_nam
case EXEC_OMP_TARGET_UPDATE:
case EXEC_OMP_TASK:
case EXEC_OMP_TASKGROUP:
+   case EXEC_OMP_TASKLOOP:
+   case EXEC_OMP_TASKLOOP_SIMD:
case EXEC_OMP_TASKWAIT:
case EXEC_OMP_TASKYIELD:
case EXEC_OMP_TEAMS:
@@ -10384,6 +10392,9 @@ gfc_resolve_code (gfc_code *code, gfc_na
case EXEC_OMP_PARALLEL_DO:
case EXEC_OMP_PARALLEL_DO_SIMD:
case EXEC_OMP_PARALLEL_SECTIONS:
+   case EXEC_OMP_TARGET_PARALLEL:
+   case EXEC_OMP_TARGET_PARALLEL_DO:
+   case EXEC_OMP_TARGET_PARALLEL_DO_SIMD:
case EXEC_OMP_TARGET_TEAMS:
case EXEC_OMP_TARGET_TEAMS_DISTRIBUTE:
case EXEC_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_DO:
@@ -10404,6 +10415,9 @@ gfc_resolve_code (gfc_code *code, gfc_na
case EXEC_OMP_DO:
case EXEC_OMP_DO_SIMD:
case EXEC_OMP_SIMD:
+   case EXEC_OMP_TARGET_SIMD:
+   case EXEC_OMP_TASKLOOP:
+   case EXEC_OMP_TASKLOOP_SIMD:
  gfc_resolve_omp_do_blocks (code, ns);
  break;
case EXEC_SELECT_TYPE:
@@ -10786,6 +10800,12 @@ start:
case EXEC_OMP_SINGLE:
case EXEC

[PATCH][AArch64][tests] Skip cpu-diagnostics tests when overriding -mcpu

2016-05-16 Thread Kyrill Tkachov

Hi all,

The gcc.target/aarch64/cpu-diagnostics* tests specify invalid -mcpu options and 
look for the expected error.
However, if the user overrides the -mcpu option when testing, the tests start 
FAILing because they don't get
the expected bad -mcpu option.

This patch skips those tests when that happens.
That way when testing with <board>/-mcpu=<cpu> the tests appear UNSUPPORTED 
rather than FAIL.

Ok for trunk?

Thanks,
Kyrill

2016-05-16  Kyrylo Tkachov  

* gcc.target/aarch64/cpu-diagnostics-1.c: Skip if -mcpu is overriden.
* gcc.target/aarch64/cpu-diagnostics-2.c: Likewise.
* gcc.target/aarch64/cpu-diagnostics-3.c: Likewise.
* gcc.target/aarch64/cpu-diagnostics-4.c: Likewise.
diff --git a/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c b/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c
index de6b8a7da4d4d2500e5191dbbd925fab2d0afdb8..ddba65544710d35510cd73bcab083fc088148526 100644
--- a/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-1.c
@@ -1,4 +1,5 @@
 /* { dg-error "unknown" "" {target "aarch64*-*-*" } } */
+/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
 /* { dg-options "-O2 -mcpu=dummy" } */
 
 void f ()
diff --git a/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c b/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c
index 2ca006598ff84cf0b92f229e72f83750d4c5e91f..ae42436031b07af709f754f24ef63fff1ce9b34c 100644
--- a/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-2.c
@@ -1,4 +1,5 @@
 /* { dg-error "missing" "" {target "aarch64*-*-*" } } */
+/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
 /* { dg-options "-O2 -mcpu=cortex-a53+no" } */
 
 void f ()
diff --git a/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c b/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c
index 807e3253e30637f73613cdd184dc79bb6f24e7dc..8bc6e2fe0492e4518cdbea47fa63315a2fd83bac 100644
--- a/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-3.c
@@ -1,4 +1,5 @@
 /* { dg-error "invalid feature" "" {target "aarch64*-*-*" } } */
+/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
 /* { dg-options "-O2 -mcpu=cortex-a53+dummy" } */
 
 void f ()
diff --git a/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c b/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c
index 4c246eb0172b16f9bad8b914b0bd0addd44edfe4..58355b42f84040386eef97dbe09ed99a0091923c 100644
--- a/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c
+++ b/gcc/testsuite/gcc.target/aarch64/cpu-diagnostics-4.c
@@ -1,4 +1,5 @@
 /* { dg-error "missing" "" {target "aarch64*-*-*" } } */
+/* { dg-skip-if "do not override -mcpu" { *-*-* } { "-mcpu=*" } { "" } } */
 /* { dg-options "-O2 -mcpu=+dummy" } */
 
 void f ()


Re: [PATCH GCC]Document vect_cond_mixed in sourcebuild.texi

2016-05-16 Thread Sandra Loosemore

On 05/16/2016 05:01 AM, Bin Cheng wrote:

Hi,
This is an obvious patch documenting vect_cond_mixed in sourcebuild.texi.  OK?

Thanks,
bin

2016-05-13  bin cheng  

* doc/sourcebuild.texi (@item vect_cond_mixed): New item.


Assuming the information is technically correct, the patch looks OK from 
a documentation perspective.


-Sandra




RE: [PATCH][MIPS] Enable LSA/DLSA for MSA

2016-05-16 Thread Robert Suchanek
Hi Matthew,
> > Ok to commit?
> 
> OK.

Done as r236289.
 
> There is a corresponding testsuite change needed for this
> as some code quality tests change if LSA is available.  This
> is the HAS_LSA 'ghost' option in mips.exp.  I'm happy to leave
> this to be dealt with as part of the overall MSA testsuite
> patch though.

It's on my TODO list and will update the patch with MSA tests
to put all tests in one go.

Regards,
Robert


Re: [PR 70857] Copy RESULT_DECL of HSA outlined kernel function

2016-05-16 Thread Jakub Jelinek
On Mon, May 16, 2016 at 04:25:10PM +0200, Martin Jambor wrote:
> the patch below fixes PR 70857.  When the HSA gridification code
> copies the declaration of the function for outlining the target
> construct, it left the old RESULT_DECL dangling to it.  I did not
> notice because it has VOID_TYPE but it needs to be done nevertheless,
> not least because ipa-pta chokes on it.
> 
> Bootstrapped and tested on x86_64 with hsa enabled.  OK for trunk and
> the gcc-6 branch?
> 
> Thanks,
> 
> Martin
> 
> 2016-05-12  Martin Jambor  
> 
>   PR hsa/70857
>   * omp-low.c (grid_expand_target_grid_body): Copy RESULT_DECL of
>   the outlined kernel function.

Ok.

> diff --git a/gcc/omp-low.c b/gcc/omp-low.c
> index c9600fb..a11f44b 100644
> --- a/gcc/omp-low.c
> +++ b/gcc/omp-low.c
> @@ -13681,6 +13681,9 @@ grid_expand_target_grid_body (struct omp_region 
> *target)
>tree new_parm_decl = copy_node (DECL_ARGUMENTS (kern_fndecl));
>DECL_CONTEXT (new_parm_decl) = kern_fndecl;
>DECL_ARGUMENTS (kern_fndecl) = new_parm_decl;
> +  gcc_assert (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (kern_fndecl))));
> +  DECL_RESULT (kern_fndecl) = copy_node (DECL_RESULT (kern_fndecl));
> +  DECL_CONTEXT (DECL_RESULT (kern_fndecl)) = kern_fndecl;
>struct function *kern_cfun = DECL_STRUCT_FUNCTION (kern_fndecl);
>kern_cfun->curr_properties = cfun->curr_properties;
>  
> -- 
> 2.8.2

Jakub


RE: [PATCH][MIPS] Correct latency of loads in M5100

2016-05-16 Thread Robert Suchanek
> > Ok to commit?
> 
> > * config/mips/m5100.md (m51_int_load): Update the latency to 2.
> 
> OK.

Committed - r236288

Robert


[PR 70857] Copy RESULT_DECL of HSA outlined kernel function

2016-05-16 Thread Martin Jambor
Hi,

the patch below fixes PR 70857.  When the HSA gridification code
copies the declaration of the function for outlining the target
construct, it left the old RESULT_DECL dangling to it.  I did not
notice because it has VOID_TYPE but it needs to be done nevertheless,
not least because ipa-pta chokes on it.

Bootstrapped and tested on x86_64 with hsa enabled.  OK for trunk and
the gcc-6 branch?

Thanks,

Martin

2016-05-12  Martin Jambor  

PR hsa/70857
* omp-low.c (grid_expand_target_grid_body): Copy RESULT_DECL of
the outlined kernel function.
---
 gcc/omp-low.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index c9600fb..a11f44b 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -13681,6 +13681,9 @@ grid_expand_target_grid_body (struct omp_region *target)
   tree new_parm_decl = copy_node (DECL_ARGUMENTS (kern_fndecl));
   DECL_CONTEXT (new_parm_decl) = kern_fndecl;
   DECL_ARGUMENTS (kern_fndecl) = new_parm_decl;
+  gcc_assert (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (kern_fndecl))));
+  DECL_RESULT (kern_fndecl) = copy_node (DECL_RESULT (kern_fndecl));
+  DECL_CONTEXT (DECL_RESULT (kern_fndecl)) = kern_fndecl;
   struct function *kern_cfun = DECL_STRUCT_FUNCTION (kern_fndecl);
   kern_cfun->curr_properties = cfun->curr_properties;
 
-- 
2.8.2



RFA: Generate normal DWARF DW_LOC descriptors for non integer mode pointers

2016-05-16 Thread Nick Clifton
Hi Guys,

  Currently dwarf2out.c:mem_loc_descriptor() has some special case
  code to handle the situation where an address is held in a register
  whose mode is not of type MODE_INT.  It generates a
  DW_OP_GNU_regval_type expression which may later on be converted into
  a frame pointer based expression.  This is a problem for targets which
  use a partial integer mode for their pointers (eg the msp430).  In
  such cases the conversion to a frame pointer based expression could
  be wrong if the frame pointer is not being used.

  For example the GDB testfile gdb/testsuite/gdb.base/advance.c contains
  this code fragment:

int
main ()
{
  int result;
  int b, c;
  c = 5;
  b = 3;/* advance this location */

  func (c); /* stop here after leaving current frame */

  which compiles to these instructions:
  
suba#6, r1
mov #5, 4(r1)   
mov #3, 2(r1)   
mov 4(r1),  r12 
calla   #0  ;

  (Note that only r1 - the stack pointer - is used.  r4 - the frame
  pointer - is not).

  The debug information produced for the "c" local variable looks like
  this:

Abbrev Number: 3 (DW_TAG_variable)
 DW_AT_name: c
 DW_AT_decl_file   : 1
 DW_AT_decl_line   : 40
 DW_AT_type: <0x37>
 DW_AT_location: 5 byte block: f5 4 21 32 1c(DW_OP_GNU_regval_type: 
4 (r4) <0x21>; DW_OP_lit2; DW_OP_minus)

  ie it says that "c" is stored in memory location "r4 - 2", which is
  wrong since register r4 is not even used in this function.

  The patch below addresses this problem by allowing the normal,
  register based descriptor to be produced when the mode is Pmode.

  With this patch applied the unexpected failure count in the GDB
  testsuite for the MSP430's -mlarge multilib changes from 2253 to 367.
  There are no regressions, for MSP430 or x86_64, and no changes to
  the GCC testsuite results for either target.

  OK to apply ?

Cheers
  Nick

gcc/ChangeLog
2016-05-16  Nick Clifton  

* dwarf2out.c (mem_loc_descriptor): Convert REG based addresses
whose mode is Pmode into basereg descriptors even if Pmode is
not an integer mode.

Index: gcc/dwarf2out.c
===
--- gcc/dwarf2out.c (revision 236283)
+++ gcc/dwarf2out.c (working copy)
@@ -13396,7 +13396,11 @@
   break;
 
 case REG:
-  if (GET_MODE_CLASS (mode) != MODE_INT
+  if ((GET_MODE_CLASS (mode) != MODE_INT
+ /* Targets which have pointers that use a partial integer mode
+(eg the msp430x) still want their debug information to be
+based on the normal DWARF base register notation.  */
+  && mode != Pmode)
  || (GET_MODE_SIZE (mode) > DWARF2_ADDR_SIZE
  && rtl != arg_pointer_rtx
  && rtl != frame_pointer_rtx



Re: [PATCH 1/3] Encapsulate comp_cost within a class with methods.

2016-05-16 Thread Martin Liška
On 05/16/2016 12:13 PM, Bin.Cheng wrote:
> Hi Martin,
> Could you please rebase this patch and the profiling one against
> latest trunk?  The third patch was applied before these two now.
> 
> Thanks,
> bin

Hello.

Sending the rebased version of the patch.

Martin
>From a91b1578f3907e05543b2acea0081b6e4744ade9 Mon Sep 17 00:00:00 2001
From: marxin 
Date: Mon, 16 May 2016 15:52:56 +0200
Subject: [PATCH 2/2] Add profiling support for IVOPTS

gcc/ChangeLog:

2016-04-25  Martin Liska  

	* tree-ssa-loop-ivopts.c (struct comp_cost): Introduce
	m_cost_scaled and m_frequency fields.
	(comp_cost::operator=): Assign to m_cost_scaled.
	(operator+): Likewise.
	(comp_cost::operator+=): Likewise.
	(comp_cost::operator-=): Likewise.
	(comp_cost::operator/=): Likewise.
	(comp_cost::operator*=): Likewise.
	(operator-): Likewise.
	(comp_cost::set_cost): Likewise.
	(comp_cost::get_cost_scaled): New function.
	(comp_cost::calculate_scaled_cost): Likewise.
	(comp_cost::propagate_scaled_cost): Likewise.
	(comp_cost::get_frequency): Likewise.
	(comp_cost::scale_cost): Likewise.
	(comp_cost::has_frequency): Likewise.
	(get_computation_cost_at): Propagate ratio of frequencies
	of loop header and another basic block.
	(determine_group_iv_costs): Dump new fields.
---
 gcc/tree-ssa-loop-ivopts.c | 130 -
 1 file changed, 118 insertions(+), 12 deletions(-)

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 876e6ed..3a80a23 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -107,6 +107,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-address.h"
 #include "builtins.h"
 #include "tree-vectorizer.h"
+#include "sreal.h"
 
 /* FIXME: Expressions are expanded to RTL in this pass to determine the
cost of different addressing modes.  This should be moved to a TBD
@@ -173,11 +174,13 @@ enum use_type
 /* Cost of a computation.  */
 struct comp_cost
 {
-  comp_cost (): m_cost (0), m_complexity (0), m_scratch (0)
+  comp_cost (): m_cost (0), m_complexity (0), m_scratch (0),
+m_frequency (sreal (0)), m_cost_scaled (sreal (0))
   {}
 
   comp_cost (int cost, unsigned complexity)
-: m_cost (cost), m_complexity (complexity), m_scratch (0)
+: m_cost (cost), m_complexity (complexity), m_scratch (0),
+  m_frequency (sreal (0)), m_cost_scaled (sreal (0))
   {}
 
   comp_cost& operator= (const comp_cost& other);
@@ -236,6 +239,26 @@ struct comp_cost
   /* Set the scratch to S.  */
   void set_scratch (unsigned s);
 
+  /* Return scaled cost.  */
+  double get_cost_scaled ();
+
+  /* Calculate scaled cost based on frequency of a basic block with
+ frequency equal to NOMINATOR / DENOMINATOR.  */
+  void calculate_scaled_cost (int nominator, int denominator);
+
+  /* Propagate scaled cost which is based on frequency of basic block
+ the cost belongs to.  */
+  void propagate_scaled_cost ();
+
+  /* Return frequency of the cost.  */
+  double get_frequency ();
+
+  /* Scale COST by frequency of the cost.  */
+  const sreal scale_cost (int cost);
+
+  /* Return true if the frequency has a valid value.  */
+  bool has_frequency ();
+
   /* Return infinite comp_cost.  */
   static comp_cost get_infinite ();
 
@@ -249,6 +272,9 @@ private:
 			 complexity field should be larger for more
 			 complex expressions and addressing modes).  */
   int m_scratch;	  /* Scratch used during cost computation.  */
+  sreal m_frequency;	  /* Frequency of the basic block this comp_cost
+			 belongs to.  */
+  sreal m_cost_scaled;	  /* Scalled runtime cost.  */
 };
 
 comp_cost&
@@ -257,6 +283,8 @@ comp_cost::operator= (const comp_cost& other)
   m_cost = other.m_cost;
   m_complexity = other.m_complexity;
   m_scratch = other.m_scratch;
+  m_frequency = other.m_frequency;
+  m_cost_scaled = other.m_cost_scaled;
 
   return *this;
 }
@@ -275,6 +303,7 @@ operator+ (comp_cost cost1, comp_cost cost2)
 
   cost1.m_cost += cost2.m_cost;
   cost1.m_complexity += cost2.m_complexity;
+  cost1.m_cost_scaled += cost2.m_cost_scaled;
 
   return cost1;
 }
@@ -290,6 +319,8 @@ comp_cost
 comp_cost::operator+= (HOST_WIDE_INT c)
 {
   this->m_cost += c;
+  if (has_frequency ())
+this->m_cost_scaled += scale_cost (c);
 
   return *this;
 }
@@ -298,6 +329,8 @@ comp_cost
 comp_cost::operator-= (HOST_WIDE_INT c)
 {
   this->m_cost -= c;
+  if (has_frequency ())
+this->m_cost_scaled -= scale_cost (c);
 
   return *this;
 }
@@ -306,6 +339,8 @@ comp_cost
 comp_cost::operator/= (HOST_WIDE_INT c)
 {
   this->m_cost /= c;
+  if (has_frequency ())
+this->m_cost_scaled /= scale_cost (c);
 
   return *this;
 }
@@ -314,6 +349,8 @@ comp_cost
 comp_cost::operator*= (HOST_WIDE_INT c)
 {
   this->m_cost *= c;
+  if (has_frequency ())
+this->m_cost_scaled *= scale_cost (c);
 
   return *this;
 }
@@ -323,6 +360,7 @@ operator- (comp_cost cost1, comp_cost cost2)
 {
   cost1.m_cost -= cost2.m_cost;
   cost1.m_complexity -= cost2.m_complexity;
+

Re: [PATCH 2/3] Add profiling support for IVOPTS

2016-05-16 Thread Martin Liška
Hello.

Sending the rebased version of the patch.

Martin
>From a91b1578f3907e05543b2acea0081b6e4744ade9 Mon Sep 17 00:00:00 2001
From: marxin 
Date: Mon, 16 May 2016 15:52:56 +0200
Subject: [PATCH 2/2] Add profiling support for IVOPTS

gcc/ChangeLog:

2016-04-25  Martin Liska  

	* tree-ssa-loop-ivopts.c (struct comp_cost): Introduce
	m_cost_scaled and m_frequency fields.
	(comp_cost::operator=): Assign to m_cost_scaled.
	(operator+): Likewise.
	(comp_cost::operator+=): Likewise.
	(comp_cost::operator-=): Likewise.
	(comp_cost::operator/=): Likewise.
	(comp_cost::operator*=): Likewise.
	(operator-): Likewise.
	(comp_cost::set_cost): Likewise.
	(comp_cost::get_cost_scaled): New function.
	(comp_cost::calculate_scaled_cost): Likewise.
	(comp_cost::propagate_scaled_cost): Likewise.
	(comp_cost::get_frequency): Likewise.
	(comp_cost::scale_cost): Likewise.
	(comp_cost::has_frequency): Likewise.
	(get_computation_cost_at): Propagate ratio of frequencies
	of loop header and another basic block.
	(determine_group_iv_costs): Dump new fields.
---
 gcc/tree-ssa-loop-ivopts.c | 130 -
 1 file changed, 118 insertions(+), 12 deletions(-)

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 876e6ed..3a80a23 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -107,6 +107,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-address.h"
 #include "builtins.h"
 #include "tree-vectorizer.h"
+#include "sreal.h"
 
 /* FIXME: Expressions are expanded to RTL in this pass to determine the
cost of different addressing modes.  This should be moved to a TBD
@@ -173,11 +174,13 @@ enum use_type
 /* Cost of a computation.  */
 struct comp_cost
 {
-  comp_cost (): m_cost (0), m_complexity (0), m_scratch (0)
+  comp_cost (): m_cost (0), m_complexity (0), m_scratch (0),
+m_frequency (sreal (0)), m_cost_scaled (sreal (0))
   {}
 
   comp_cost (int cost, unsigned complexity)
-: m_cost (cost), m_complexity (complexity), m_scratch (0)
+: m_cost (cost), m_complexity (complexity), m_scratch (0),
+  m_frequency (sreal (0)), m_cost_scaled (sreal (0))
   {}
 
   comp_cost& operator= (const comp_cost& other);
@@ -236,6 +239,26 @@ struct comp_cost
   /* Set the scratch to S.  */
   void set_scratch (unsigned s);
 
+  /* Return scaled cost.  */
+  double get_cost_scaled ();
+
+  /* Calculate scaled cost based on frequency of a basic block with
+ frequency equal to NOMINATOR / DENOMINATOR.  */
+  void calculate_scaled_cost (int nominator, int denominator);
+
+  /* Propagate scaled cost which is based on frequency of basic block
+ the cost belongs to.  */
+  void propagate_scaled_cost ();
+
+  /* Return frequency of the cost.  */
+  double get_frequency ();
+
+  /* Scale COST by frequency of the cost.  */
+  const sreal scale_cost (int cost);
+
+  /* Return true if the frequency has a valid value.  */
+  bool has_frequency ();
+
   /* Return infinite comp_cost.  */
   static comp_cost get_infinite ();
 
@@ -249,6 +272,9 @@ private:
 			 complexity field should be larger for more
 			 complex expressions and addressing modes).  */
   int m_scratch;	  /* Scratch used during cost computation.  */
+  sreal m_frequency;	  /* Frequency of the basic block this comp_cost
+			 belongs to.  */
+  sreal m_cost_scaled;	  /* Scalled runtime cost.  */
 };
 
 comp_cost&
@@ -257,6 +283,8 @@ comp_cost::operator= (const comp_cost& other)
   m_cost = other.m_cost;
   m_complexity = other.m_complexity;
   m_scratch = other.m_scratch;
+  m_frequency = other.m_frequency;
+  m_cost_scaled = other.m_cost_scaled;
 
   return *this;
 }
@@ -275,6 +303,7 @@ operator+ (comp_cost cost1, comp_cost cost2)
 
   cost1.m_cost += cost2.m_cost;
   cost1.m_complexity += cost2.m_complexity;
+  cost1.m_cost_scaled += cost2.m_cost_scaled;
 
   return cost1;
 }
@@ -290,6 +319,8 @@ comp_cost
 comp_cost::operator+= (HOST_WIDE_INT c)
 {
   this->m_cost += c;
+  if (has_frequency ())
+this->m_cost_scaled += scale_cost (c);
 
   return *this;
 }
@@ -298,6 +329,8 @@ comp_cost
 comp_cost::operator-= (HOST_WIDE_INT c)
 {
   this->m_cost -= c;
+  if (has_frequency ())
+this->m_cost_scaled -= scale_cost (c);
 
   return *this;
 }
@@ -306,6 +339,8 @@ comp_cost
 comp_cost::operator/= (HOST_WIDE_INT c)
 {
   this->m_cost /= c;
+  if (has_frequency ())
+this->m_cost_scaled /= scale_cost (c);
 
   return *this;
 }
@@ -314,6 +349,8 @@ comp_cost
 comp_cost::operator*= (HOST_WIDE_INT c)
 {
   this->m_cost *= c;
+  if (has_frequency ())
+this->m_cost_scaled *= scale_cost (c);
 
   return *this;
 }
@@ -323,6 +360,7 @@ operator- (comp_cost cost1, comp_cost cost2)
 {
   cost1.m_cost -= cost2.m_cost;
   cost1.m_complexity -= cost2.m_complexity;
+  cost1.m_cost_scaled -= cost2.m_cost_scaled;
 
   return cost1;
 }
@@ -366,6 +404,7 @@ void
 comp_cost::set_cost (int c)
 {
   m_cost = c;
+  m_cost_scaled = scale_cost (c);
 }
 
 unsigned
@@ -392,6 +431,48 

Re: [PATCH][AArch64] Improve aarch64_case_values_threshold setting

2016-05-16 Thread Wilco Dijkstra
James Greenhalgh wrote:
> As this change will change code generation for all cores (except
> Exynos-M1), I'd like to hear from those with more detailed knowledge of
> ThunderX, X-Gene and qdf24xx before I take this patch.
>
> Let's give it another week or so for comments, and expand the CC list.

Note it affects Exynos-M1 as well with -O2. 

>> GCC is ~1.2% faster on Cortex-A53 built for generic, but there is no
>> difference in perlbench.
>
> Where were these changes if not perlbench?

In the GCC test.

Wilco



Re: [Patch ARM/AArch64 07/11] Add missing vget_lane fp16 tests.

2016-05-16 Thread Kyrill Tkachov


On 11/05/16 14:23, Christophe Lyon wrote:

2016-05-02  Christophe Lyon  

* gcc.target/aarch64/advsimd-intrinsics/vget_lane.c: Add fp16 tests.


I thought for a bit about wrapping the expected_f16, expectedq_f16 and other 
declarations
in an "#if defined (__ARM_FP16_FORMAT_IEEE) || defined 
(__ARM_FP16_FORMAT_ALTERNATIVE)"
as well, but I don't think there's much to gain in that.

So ok for trunk.

Thanks,
Kyrill


Change-Id: I5fafd1e90baf09588ab9f5444817c74e7d865a20

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
index 5806050..fe41c5f 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
@@ -13,6 +13,7 @@ uint32_t   expected_u32  = 0xfff1;
  uint64_t   expected_u64  = 0xfff0;
  poly8_texpected_p8   = 0xf6;
  poly16_t   expected_p16  = 0xfff2;
+hfloat16_t expected_f16  = 0xcb80;
  hfloat32_t expected_f32  = 0xc170;
  
  int8_t expectedq_s8  = 0xff;

@@ -25,6 +26,7 @@ uint32_t   expectedq_u32 = 0xfff2;
  uint64_t   expectedq_u64 = 0xfff1;
  poly8_texpectedq_p8  = 0xfe;
  poly16_t   expectedq_p16 = 0xfff6;
+hfloat16_t expectedq_f16 = 0xca80;
  hfloat32_t expectedq_f32 = 0xc150;
  
  int error_found = 0;

@@ -52,6 +54,10 @@ void exec_vget_lane (void)
  uint32_t var_int32;
  float32_t var_float32;
} var_int32_float32;
+  union {
+uint16_t var_int16;
+float16_t var_float16;
+  } var_int16_float16;
  
  #define TEST_VGET_LANE_FP(Q, T1, T2, W, N, L)   \

VAR(var, T1, W) = vget##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), L); \
@@ -81,10 +87,17 @@ void exec_vget_lane (void)
VAR_DECL(var, uint, 64);
VAR_DECL(var, poly, 8);
VAR_DECL(var, poly, 16);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  VAR_DECL(var, float, 16);
+#endif
VAR_DECL(var, float, 32);
  
/* Initialize input values.  */

TEST_MACRO_ALL_VARIANTS_2_5(VLOAD, vector, buffer);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  VLOAD(vector, buffer, , float, f, 16, 4);
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
VLOAD(vector, buffer, , float, f, 32, 2);
VLOAD(vector, buffer, q, float, f, 32, 4);
  
@@ -99,6 +112,9 @@ void exec_vget_lane (void)

TEST_VGET_LANE(, uint, u, 64, 1, 0);
TEST_VGET_LANE(, poly, p, 8, 8, 6);
TEST_VGET_LANE(, poly, p, 16, 4, 2);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VGET_LANE_FP(, float, f, 16, 4, 1);
+#endif
TEST_VGET_LANE_FP(, float, f, 32, 2, 1);
  
TEST_VGET_LANE(q, int, s, 8, 16, 15);

@@ -111,6 +127,9 @@ void exec_vget_lane (void)
TEST_VGET_LANE(q, uint, u, 64, 2, 1);
TEST_VGET_LANE(q, poly, p, 8, 16, 14);
TEST_VGET_LANE(q, poly, p, 16, 8, 6);
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+  TEST_VGET_LANE_FP(q, float, f, 16, 8, 3);
+#endif
TEST_VGET_LANE_FP(q, float, f, 32, 4, 3);
  }
  




Re: [Patch ARM/AArch64 05/11] Add missing vreinterpretq_p{8,16} tests.

2016-05-16 Thread Kyrill Tkachov


On 11/05/16 14:23, Christophe Lyon wrote:

2016-05-02  Christophe Lyon  

* gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c: Add
missing tests for vreinterpretq_p{8,16}.


Ok.
Thanks,
Kyrill


Change-Id: I7e9bb18c668c34685f12aa578868d7752232a96c

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
index d4e5768..2570f73 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
@@ -371,6 +371,83 @@ VECT_VAR_DECL(expected_q_u64_8,uint,64,2) [] = { 
0xf7f6f5f4f3f2f1f0,
  VECT_VAR_DECL(expected_q_u64_9,uint,64,2) [] = { 0xfff3fff2fff1fff0,
 0xfff7fff6fff5fff4 };
  
+

+/* Expected results for vreinterpretq_p8_xx.  */
+VECT_VAR_DECL(expected_q_p8_1,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+   0xf4, 0xf5, 0xf6, 0xf7,
+   0xf8, 0xf9, 0xfa, 0xfb,
+   0xfc, 0xfd, 0xfe, 0xff };
+VECT_VAR_DECL(expected_q_p8_2,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+   0xf2, 0xff, 0xf3, 0xff,
+   0xf4, 0xff, 0xf5, 0xff,
+   0xf6, 0xff, 0xf7, 0xff };
+VECT_VAR_DECL(expected_q_p8_3,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+   0xf1, 0xff, 0xff, 0xff,
+   0xf2, 0xff, 0xff, 0xff,
+   0xf3, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_q_p8_4,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff,
+   0xf1, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_q_p8_5,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
+   0xf4, 0xf5, 0xf6, 0xf7,
+   0xf8, 0xf9, 0xfa, 0xfb,
+   0xfc, 0xfd, 0xfe, 0xff };
+VECT_VAR_DECL(expected_q_p8_6,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+   0xf2, 0xff, 0xf3, 0xff,
+   0xf4, 0xff, 0xf5, 0xff,
+   0xf6, 0xff, 0xf7, 0xff };
+VECT_VAR_DECL(expected_q_p8_7,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+   0xf1, 0xff, 0xff, 0xff,
+   0xf2, 0xff, 0xff, 0xff,
+   0xf3, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_q_p8_8,poly,8,16) [] = { 0xf0, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff,
+   0xf1, 0xff, 0xff, 0xff,
+   0xff, 0xff, 0xff, 0xff };
+VECT_VAR_DECL(expected_q_p8_9,poly,8,16) [] = { 0xf0, 0xff, 0xf1, 0xff,
+   0xf2, 0xff, 0xf3, 0xff,
+   0xf4, 0xff, 0xf5, 0xff,
+   0xf6, 0xff, 0xf7, 0xff };
+
+/* Expected results for vreinterpretq_p16_xx.  */
+VECT_VAR_DECL(expected_q_p16_1,poly,16,8) [] = { 0xf1f0, 0xf3f2,
+0xf5f4, 0xf7f6,
+0xf9f8, 0xfbfa,
+0xfdfc, 0xfffe };
+VECT_VAR_DECL(expected_q_p16_2,poly,16,8) [] = { 0xfff0, 0xfff1,
+0xfff2, 0xfff3,
+0xfff4, 0xfff5,
+0xfff6, 0xfff7 };
+VECT_VAR_DECL(expected_q_p16_3,poly,16,8) [] = { 0xfff0, 0x,
+0xfff1, 0x,
+0xfff2, 0x,
+0xfff3, 0x };
+VECT_VAR_DECL(expected_q_p16_4,poly,16,8) [] = { 0xfff0, 0x,
+0x, 0x,
+0xfff1, 0x,
+0x, 0x };
+VECT_VAR_DECL(expected_q_p16_5,poly,16,8) [] = { 0xf1f0, 0xf3f2,
+0xf5f4, 0xf7f6,
+0xf9f8, 0xfbfa,
+0xfdfc, 0xfffe };
+VECT_VAR_DECL(expected_q_p16_6,poly,16,8) [] = { 0xfff0, 0xfff1,
+0xfff2, 0xff

Re: [Patch ARM/AArch64 04/11] Add forgotten vsliq_n_u64 test.

2016-05-16 Thread Kyrill Tkachov


On 13/05/16 15:22, Christophe Lyon wrote:

On 13 May 2016 at 16:08, James Greenhalgh  wrote:

On Wed, May 11, 2016 at 03:23:54PM +0200, Christophe Lyon wrote:

2016-05-02  Christophe Lyon  

   * gcc.target/aarch64/advsimd-intrinsics/vsli_n.c: Add check for 
vsliq_n_u64.


And vsliq_n_s64 ?


Damn! You are right, I missed that one.


OK with that change.

OK thanks



Ok by me too with that change.

Thanks,
Kyrill


Thanks,
James


Change-Id: I90bb2b225ffd7bfd54a0827a0264ac20271f54f2

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
index 0285083..e5f78d0 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vsli_n.c
@@ -169,6 +169,7 @@ void vsli_extra(void)
CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected_max_shift, COMMENT);
CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected_max_shift, COMMENT);
CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected_max_shift, COMMENT);
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected_max_shift, COMMENT);
CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected_max_shift, COMMENT);
CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected_max_shift, COMMENT);
  }
--
1.9.1





Re: [Patch ARM/AArch64 03/11] AdvSIMD tests: be more verbose.

2016-05-16 Thread Kyrill Tkachov


On 11/05/16 14:23, Christophe Lyon wrote:

It is useful to have more detailed information in the logs when checking
validation results: instead of repeating the intrinsic name, we now print
its return type too.


Ok.
Thanks,
Kyrill


2016-05-02  Christophe Lyon  

* gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h (CHECK,
CHECK_FP, CHECK_CUMULATIVE_SAT): Print which type was checked.

Change-Id: I74759d6a211cf52962f860fe77653a6f6edc1848

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
index 49fbd84..a2c160c 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -81,7 +81,7 @@ extern size_t strlen(const char *);
  abort();  \
}   \
}   
\
-fprintf(stderr, "CHECKED %s\n", MSG);\
+fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG);\
}
  
  /* Floating-point variant.  */

@@ -110,7 +110,7 @@ extern size_t strlen(const char *);
  abort();  \
}   \
}   
\
-fprintf(stderr, "CHECKED %s\n", MSG);\
+fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG);\
}
  
  /* Clean buffer with a non-zero pattern to help diagnose buffer

@@ -335,7 +335,8 @@ extern int VECT_VAR(expected_cumulative_sat, uint, 64, 2);
  strlen(COMMENT) > 0 ? " " COMMENT : "");   \
abort();
\
  } \
-fprintf(stderr, "CHECKED CUMULATIVE SAT %s\n", MSG); \
+fprintf(stderr, "CHECKED CUMULATIVE SAT %s %s\n",\
+   STR(VECT_TYPE(T, W, N)), MSG);  \
}
  
  #define CHECK_CUMULATIVE_SAT_NAMED(test_name,EXPECTED,comment)		\




Re: [Patch ARM/AArch64 02/11] We can remove useless #ifdefs from these tests: vmul, vshl and vtst.

2016-05-16 Thread Kyrill Tkachov


On 11/05/16 14:23, Christophe Lyon wrote:

2016-05-02  Christophe Lyon  

* gcc.target/aarch64/advsimd-intrinsics/vmul.c: Remove useless #ifdef.
* gcc.target/aarch64/advsimd-intrinsics/vshl.c: Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vtst.c: Likewise.


Ok.
Thanks,
Kyrill


Change-Id: I1b00b8edc4db6e6457be5bc1f92e8b6e218da644

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
index 0cbb656..63f0d8d 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmul.c
@@ -37,10 +37,8 @@ VECT_VAR_DECL(expected,poly,8,16) [] = { 0x60, 0xca, 0x34, 
0x9e,
  VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc4c7, 0xc4bac000,
   0xc4ae4ccd, 0xc4a1d999 };
  
-#ifndef INSN_NAME

  #define INSN_NAME vmul
  #define TEST_MSG "VMUL"
-#endif
  
  #define FNNAME1(NAME) exec_ ## NAME

  #define FNNAME(NAME) FNNAME1(NAME)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
index 821c11e..e8a57a4 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl.c
@@ -101,10 +101,8 @@ VECT_VAR_DECL(expected_negative_shift,uint,64,2) [] = { 
0x7ff,
0x7ff };
  
  
-#ifndef INSN_NAME

  #define INSN_NAME vshl
  #define TEST_MSG "VSHL/VSHLQ"
-#endif
  
  #define FNNAME1(NAME) exec_ ## NAME

  #define FNNAME(NAME) FNNAME1(NAME)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c
index 7f96540..9e74ffb 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vtst.c
@@ -32,10 +32,8 @@ VECT_VAR_DECL(expected_unsigned,uint,16,8) [] = { 0x0, 
0x,
  VECT_VAR_DECL(expected_unsigned,uint,32,4) [] = { 0x0, 0x,
  0x0, 0x };
  
-#ifndef INSN_NAME

  #define INSN_NAME vtst
  #define TEST_MSG "VTST/VTSTQ"
-#endif
  
  /* We can't use the standard ref_v_binary_op.c template because vtst

 has no 64 bits variant, and outputs are always of uint type.  */




Re: [Patch ARM/AArch64 01/11] Fix typo in vreinterpret.c test comment.

2016-05-16 Thread Kyrill Tkachov


On 11/05/16 14:23, Christophe Lyon wrote:

2016-05-02  Christophe Lyon  

* gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c: Fix typo in 
comment.


Ok (I agree it's obvious)

Thanks,
Kyrill


Change-Id: I7244c0dc0a5ab2dbcec65b40c050f72f92707139

diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
index 9e45e25..d4e5768 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vreinterpret.c
@@ -405,7 +405,7 @@ VECT_VAR_DECL(expected_q_f32_9,hfloat,32,4) [] = { 
0xf3f2f1f0, 0xf7f6f5f4,
  VECT_VAR_DECL(expected_q_f32_10,hfloat,32,4) [] = { 0xfff1fff0, 0xfff3fff2,
0xfff5fff4, 0xfff7fff6 };
  
-/* Expected results for vreinterpretq_xx_f32.  */

+/* Expected results for vreinterpret_xx_f32.  */
  VECT_VAR_DECL(expected_xx_f32_1,int,8,8) [] = { 0x0, 0x0, 0x80, 0xc1,
0x0, 0x0, 0x70, 0xc1 };
  VECT_VAR_DECL(expected_xx_f32_2,int,16,4) [] = { 0x0, 0xc180, 0x0, 0xc170 };




Re: [ARM] Enable __fp16 as a function parameter and return type.

2016-05-16 Thread Tejas Belagod

On 11/05/16 16:46, Joseph Myers wrote:

On Wed, 11 May 2016, Tejas Belagod wrote:


AFAICS, I don't think it mandates a double-rounding behavior for double to
__fp16 conversions and I don't see a change in stand between the two versions
of ACLE on the behavior of __fp16.


It's not a change between the two versions of ACLE.  It's a change
relative to the early (pre-ACLE) __fp16 specification (or, at least, a
clarification thereto in email on 12 Aug 2008) that was used as a basis
for the original implementation of __fp16 in GCC (and that thus is what's
currently implemented by GCC and tested for in the testsuite).



Hi Joseph,

Sorry for the delay in responding.

I've had a conversation with Al and I now have some context. You're right - the 
2008 mail you are referring to is the pre-ACLE behavior. By the time the first 
draft of the first version of ACLE was reviewed by CodeSourcery(circa 2011), it 
already mandated single rounding. No published ACLE has ever allowed double 
rounding.


This meant that when the first draft of ACLE was published in 2011, its pre-ACLE 
implementations in gcc and armcc were already non-conformant, in other words, 
'bug-compatible'.


We do have plans to fix pre-ACLE behavior of fp16 to conform to current ACLE 
spec, but can't say when exactly.


Thanks,
Tejas.


Re: [PTX] assembler name mangling

2016-05-16 Thread Nathan Sidwell

On 05/13/16 12:39, Alexander Monakov wrote:


This regresses offloading compilation: the new hook isn't applied during LTO
stream-in, so target functions named 'call' won't be remapped.


foop :(  I've restored the old behaviour.

nathan


RE: [MIPS,committed] Update MIPS P5600 processor definition to avoid IMADD

2016-05-16 Thread Matthew Fortune
Maciej Rozycki  writes:
> On Wed, 4 May 2016, Matthew Fortune wrote:
> 
> > diff --git a/gcc/config/mips/mips-cpus.def
> > b/gcc/config/mips/mips-cpus.def index 17034f2..5df9807 100644
> > --- a/gcc/config/mips/mips-cpus.def
> > +++ b/gcc/config/mips/mips-cpus.def
> > @@ -44,10 +44,7 @@ MIPS_CPU ("mips4", PROCESSOR_R1, 4, 0)
> > isn't tuned to a specific processor.  */  MIPS_CPU ("mips32",
> > PROCESSOR_4KC, 32, PTF_AVOID_BRANCHLIKELY)  MIPS_CPU ("mips32r2",
> > PROCESSOR_74KF2_1, 33, PTF_AVOID_BRANCHLIKELY)
> > -/* mips32r3 is micromips hense why it uses the M4K processor.
> > -   mips32r5 should use the p5600 processor, but there is no
> definition
> > -   for this yet, so in the short term we will use the same processor
> entry
> > -   as mips32r2.  */
> > +/* mips32r3 is micromips hense why it uses the M4K processor.  */
> 
>  Typo here -> s/hense/hence/ -- since you've reworked the comment and
> changed the line in the course anyway, you may have well taken the
> opportunity and fixed it.

I saw it but chose not to fix it, on the basis that one change leads to another.

> > @@ -150,7 +147,8 @@ MIPS_CPU ("1004kf1_1", PROCESSOR_24KF1_1, 33, 0)
> > MIPS_CPU ("interaptiv", PROCESSOR_24KF2_1, 33, 0)
> >
> >  /* MIPS32 Release 5 processors.  */
> > -MIPS_CPU ("p5600", PROCESSOR_P5600, 36, PTF_AVOID_BRANCHLIKELY)
> > +MIPS_CPU ("p5600", PROCESSOR_P5600, 36, PTF_AVOID_BRANCHLIKELY
> > +   | PTF_AVOID_IMADD)
> 
>  Not:
> 
> MIPS_CPU ("p5600", PROCESSOR_P5600, 36, (PTF_AVOID_BRANCHLIKELY
>| PTF_AVOID_IMADD))

Good spot, brackets added. I almost didn't split the line for fear
of breaking the awk parsing.

Thanks,
Matthew

diff --git a/gcc/config/mips/mips-cpus.def b/gcc/config/mips/mips-cpus.def
index 5df9807..b46c86f 100644
--- a/gcc/config/mips/mips-cpus.def
+++ b/gcc/config/mips/mips-cpus.def
@@ -147,8 +147,8 @@ MIPS_CPU ("1004kf1_1", PROCESSOR_24KF1_1, 33, 0)
 MIPS_CPU ("interaptiv", PROCESSOR_24KF2_1, 33, 0)
 
 /* MIPS32 Release 5 processors.  */
-MIPS_CPU ("p5600", PROCESSOR_P5600, 36, PTF_AVOID_BRANCHLIKELY
-   | PTF_AVOID_IMADD)
+MIPS_CPU ("p5600", PROCESSOR_P5600, 36, (PTF_AVOID_BRANCHLIKELY
+| PTF_AVOID_IMADD))
 MIPS_CPU ("m5100", PROCESSOR_M5100, 36, PTF_AVOID_BRANCHLIKELY)
 MIPS_CPU ("m5101", PROCESSOR_M5100, 36, PTF_AVOID_BRANCHLIKELY)
 
-- 
2.2.1


Re: [PATCH][RFC] Introduce BIT_FIELD_INSERT

2016-05-16 Thread Bill Schmidt
Sorry, that was the wrong vector-6.c — should have realized.  In any case, for 
each of the vector tests, we get appropriate use of element-wise loads, and no 
load-hit-store bitfield assignments, so the code generation is what we want to 
see.  Sorry for the misleading information.

Bill

> On May 15, 2016, at 7:55 PM, Bill Schmidt  wrote:
> 
> Hi Richard,
> 
> (Sorry for duplication to your personal email, I had new-mailer issues.)
> 
> The new vector-6 test produces very good code for powerpc64le with this patch:
> 
>addis 9,2,.LC0@toc@ha
>sldi 3,3,32
>addi 9,9,.LC0@toc@l
>rldicl 9,9,0,32
>or 3,9,3
>blr
> 
> I did run into some ICEs with bootstrap/regtest, though:
> 
> 26c26
> < /home/wschmidt/gcc/build/gcc-mainline-base/gcc/testsuite/g++/../../xg++  
> version 7.0.0 20160515 (experimental) [trunk revision 236259] (GCC) 
> ---
>> /home/wschmidt/gcc/build/gcc-mainline-test/gcc/testsuite/g++/../../xg++  
>> version 7.0.0 20160515 (experimental) [trunk revision 236259] (GCC) 
> 31a32,39
>> FAIL: gcc.c-torture/compile/pr70240.c   -O1  (internal compiler error)
>> FAIL: gcc.c-torture/compile/pr70240.c   -O1  (test for excess errors)
>> FAIL: gcc.c-torture/compile/pr70240.c   -O2  (internal compiler error)
>> FAIL: gcc.c-torture/compile/pr70240.c   -O2  (test for excess errors)
>> FAIL: gcc.c-torture/compile/pr70240.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  (internal compiler error)
>> FAIL: gcc.c-torture/compile/pr70240.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  (test for excess errors)
>> FAIL: gcc.c-torture/compile/pr70240.c   -Os  (internal compiler error)
>> FAIL: gcc.c-torture/compile/pr70240.c   -Os  (test for excess errors)
> 53a62,66
>> FAIL: gcc.dg/pr69896.c (internal compiler error)
>> FAIL: gcc.dg/pr69896.c (test for excess errors)
>> UNRESOLVED: gcc.dg/pr69896.c compilation failed to produce executable
>> FAIL: gcc.dg/pr70326.c (internal compiler error)
>> FAIL: gcc.dg/pr70326.c (test for excess errors)
> 281a295,353
>> FAIL: gcc.dg/torture/pr69613.c   -O1  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69613.c   -O1  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69613.c   -O1  compilation failed to produce 
>> executable
>> FAIL: gcc.dg/torture/pr69613.c   -O2  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69613.c   -O2  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69613.c   -O2  compilation failed to produce 
>> executable
>> FAIL: gcc.dg/torture/pr69613.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69613.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69613.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  compilation failed to produce executable
>> FAIL: gcc.dg/torture/pr69613.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69613.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69613.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  compilation failed to produce executable
>> FAIL: gcc.dg/torture/pr69613.c   -O3 -g  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69613.c   -O3 -g  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69613.c   -O3 -g  compilation failed to produce 
>> executable
>> FAIL: gcc.dg/torture/pr69613.c   -Os  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69613.c   -Os  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69613.c   -Os  compilation failed to produce 
>> executable
>> FAIL: gcc.dg/torture/pr69909.c   -O1  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69909.c   -O1  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69909.c   -O1  compilation failed to produce 
>> executable
>> FAIL: gcc.dg/torture/pr69909.c   -O2  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69909.c   -O2  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69909.c   -O2  compilation failed to produce 
>> executable
>> FAIL: gcc.dg/torture/pr69909.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69909.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69909.c   -O2 -flto -fno-use-linker-plugin 
>> -flto-partition=none  compilation failed to produce executable
>> FAIL: gcc.dg/torture/pr69909.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  (internal compiler error)
>> FAIL: gcc.dg/torture/pr69909.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  (test for excess errors)
>> UNRESOLVED: gcc.dg/torture/pr69909.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  compilation failed to produce executable
>> FAIL: gcc.dg/torture/pr69909.c   -O3 -g  (internal compiler error)
>> FAI

New Danish PO file for 'gcc' (version 6.1.0)

2016-05-16 Thread Translation Project Robot
Hello, gentle maintainer.

This is a message from the Translation Project robot.

A revised PO file for textual domain 'gcc' has been submitted
by the Danish team of translators.  The file is available at:

http://translationproject.org/latest/gcc/da.po

(This file, 'gcc-6.1.0.da.po', has just now been sent to you in
a separate email.)

All other PO files for your package are available in:

http://translationproject.org/latest/gcc/

Please consider including all of these in your next release, whether
official or a pretest.

Whenever you have a new distribution with a new version number ready,
containing a newer POT file, please send the URL of that distribution
tarball to the address below.  The tarball may be just a pretest or a
snapshot, it does not even have to compile.  It is just used by the
translators when they need some extra translation context.

The following HTML page has been updated:

http://translationproject.org/domain/gcc.html

If any question arises, please contact the translation coordinator.

Thank you for all your work,

The Translation Project robot, in the
name of your translation coordinator.




[Ada] Remove useless type transformation

2016-05-16 Thread Eric Botcazou
The transformation is either useless or counter-productive or even wrong.

Tested on x86_64-suse-linux, applied on the mainline.


2016-05-16  Eric Botcazou  

* gcc-interface/decl.c (gnat_to_gnu_entity) : Do not
build a specific type for the object if it is deemed a constant.

-- 
Eric BotcazouIndex: gcc-interface/decl.c
===
--- gcc-interface/decl.c	(revision 236279)
+++ gcc-interface/decl.c	(working copy)
@@ -1437,9 +1437,6 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 	  = build_reference_type (TYPE_OBJECT_RECORD_TYPE (gnu_array));
 	  }
 
-	if (const_flag)
-	  gnu_type = change_qualified_type (gnu_type, TYPE_QUAL_CONST);
-
 	/* Convert the expression to the type of the object if need be.  */
 	if (gnu_expr && initial_value_needs_conversion (gnu_type, gnu_expr))
 	  gnu_expr = convert (gnu_type, gnu_expr);


Re: [PATCH][AArch64] Improve aarch64_case_values_threshold setting

2016-05-16 Thread James Greenhalgh
On Mon, May 16, 2016 at 11:38:04AM +0100, Wilco Dijkstra wrote:
> ping

As this change will change code generation for all cores (except
Exynos-M1), I'd like to hear from those with more detailed knowledge of
ThunderX, X-Gene and qdf24xx before I take this patch.

Let's give it another week or so for comments, and expand the CC list.

I wasn't quite convinced by the Cortex-A53 numbers you gave upthread,
you said:

> >  Evandro Menezes wrote:
> >
> > True, but the results when running on A53 could be quite different.
>
> GCC is ~1.2% faster on Cortex-A53 built for generic, but there is no
> difference in perlbench.

Where were these changes if not perlbench?

Thanks,
James

> 
> From: Wilco Dijkstra
> Sent: 22 April 2016 17:15
> To: gcc-patches@gcc.gnu.org
> Cc: nd
> Subject: [PATCH][AArch64] Improve aarch64_case_values_threshold setting
> 
> GCC expands switch statements in a very simplistic way and tries to use a 
> table
> expansion even when it is a bad idea for performance or codesize.
> GCC typically emits extremely sparse tables that contain mostly default 
> entries
> (something which currently cannot be tuned by backends).  Additionally the
> computation of the minimum/maximum label offsets is too simplistic so the 
> tables
> are often twice as large as necessary.
> 
> The cost of a table switch is significant due to the setup overhead, the table
> lookup (which due to being sparse and large adds unnecessary cache misses)
> and hard to predict indirect jump.  Therefore it is best to avoid using a 
> table
> unless there are many real case labels.
> 
> This patch fixes that by setting the default aarch64_case_values_threshold to
> 16 when the per-CPU tuning is not set.  On SPEC2006 this improves the switch
> heavy benchmarks GCC and perlbench both in performance (1-2%) as well as size
> (0.5-1% smaller).
> 
> OK for trunk?
> 
> ChangeLog:
> 2016-04-22  Wilco Dijkstra  
> 
> gcc/
> * config/aarch64/aarch64.c (aarch64_case_values_threshold):
> Return a better case_values_threshold when optimizing.
> 
> --
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 0620f1e..a240635 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -3546,7 +3546,12 @@ aarch64_cannot_force_const_mem (machine_mode mode 
> ATTRIBUTE_UNUSED, rtx x)
>return aarch64_tls_referenced_p (x);
>  }
> 
> -/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
> +/* Implement TARGET_CASE_VALUES_THRESHOLD.
> +   The expansion for a table switch is quite expensive due to the number
> +   of instructions, the table lookup and hard to predict indirect jump.
> +   When optimizing for speed, with -O3 use the per-core tuning if set,
> +   otherwise use tables for > 16 cases as a tradeoff between size and
> +   performance.  */
> 
>  static unsigned int
>  aarch64_case_values_threshold (void)
> @@ -3557,7 +3562,7 @@ aarch64_case_values_threshold (void)
>&& selected_cpu->tune->max_case_values != 0)
>  return selected_cpu->tune->max_case_values;
>else
> -return default_case_values_threshold ();
> +return optimize_size ? default_case_values_threshold () : 17;
>  }
> 
> 



Re: [PATCH][AArch64] print_operand should not fallthrough from register operand into generic operand

2016-05-16 Thread James Greenhalgh
On Wed, Apr 27, 2016 at 05:39:33PM +0100, Wilco Dijkstra wrote:
> James Greenhalgh wrote:
> > So the part of this patch removing the fallthrough to general operand
> > is not OK for trunk.
> >
> > The other parts look reasonable to me, please resubmit just those.
> 
> Right, I removed the removal of the fallthrough. Here is the revised version:

OK.

Thanks,
James

> 
> ChangeLog:
> 2016-04-27  Wilco Dijkstra  
> 
> gcc/
>   * config/aarch64/aarch64.md
>   (add3_compareC_cconly_imm): Remove use of %w.
>   (add3_compareC_imm): Likewise.
>   (si3_uxtw): Split into register and immediate variants.
>   (andsi3_compare0_uxtw): Likewise.
>   (and3_compare0): Likewise.
>   (and3nr_compare0): Likewise.
>   (stack_protect_test_): Don't use %x for memory operands.
> 



Contents of PO file 'cpplib-6.1.0.da.po'

2016-05-16 Thread Translation Project Robot


cpplib-6.1.0.da.po.gz
Description: Binary data
The Translation Project robot, in the
name of your translation coordinator.



New Danish PO file for 'cpplib' (version 6.1.0)

2016-05-16 Thread Translation Project Robot
Hello, gentle maintainer.

This is a message from the Translation Project robot.

A revised PO file for textual domain 'cpplib' has been submitted
by the Danish team of translators.  The file is available at:

http://translationproject.org/latest/cpplib/da.po

(This file, 'cpplib-6.1.0.da.po', has just now been sent to you in
a separate email.)

All other PO files for your package are available in:

http://translationproject.org/latest/cpplib/

Please consider including all of these in your next release, whether
official or a pretest.

Whenever you have a new distribution with a new version number ready,
containing a newer POT file, please send the URL of that distribution
tarball to the address below.  The tarball may be just a pretest or a
snapshot, it does not even have to compile.  It is just used by the
translators when they need some extra translation context.

The following HTML page has been updated:

http://translationproject.org/domain/cpplib.html

If any question arises, please contact the translation coordinator.

Thank you for all your work,

The Translation Project robot, in the
name of your translation coordinator.




[Ada] Extend pragma Implicit_Packing for records to composite components

2016-05-16 Thread Eric Botcazou
The implementation of pragma Implicit_Packing for record types was restricted
to scalar components, i.e. it gave up as soon as there was one non-scalar
component in the record type.  This change extends it to all the other kinds
of components, at least if they are small.

The following package must compile quietly:

pragma Implicit_Packing;

package P is

  type Rec1 is record
B : Boolean;
  end record;
  for Rec1'Size use 1;

  type Rec2 is record
B : Boolean;
R : Rec1;
  end record;
  for Rec2'Size use 2;

  type Rec3 is record
B : Boolean;
R : Rec2;
  end record;
  for Rec3'Size use 3;

end P;


Tested on x86_64-suse-linux, applied on the mainline.


2016-05-16  Eric Botcazou  

* freeze.adb (Freeze_Record_Type): Extend pragma Implicit_Packing to
components of any elementary types and of composite types.

-- 
Eric BotcazouIndex: freeze.adb
===
--- freeze.adb	(revision 236281)
+++ freeze.adb	(working copy)
@@ -3534,13 +3534,23 @@ package body Freeze is
  --  Set True if we find at least one component whose type has a
  --  Scalar_Storage_Order attribute definition clause.
 
- All_Scalar_Components : Boolean := True;
- --  Set False if we encounter a component of a non-scalar type
+ All_Elem_Components : Boolean := True;
+ --  Set False if we encounter a component of a composite type
 
- Scalar_Component_Total_RM_Size : Uint := Uint_0;
- Scalar_Component_Total_Esize   : Uint := Uint_0;
- --  Accumulates total RM_Size values and total Esize values of all
- --  scalar components. Used for processing of Implicit_Packing.
+ All_Sized_Components : Boolean := True;
+ --  Set False if we encounter a component with unknown RM_Size
+
+ All_Storage_Unit_Components : Boolean := True;
+ --  Set False if we encounter a component of a composite type whose
+ --  RM_Size is not a multiple of the storage unit.
+
+ Elem_Component_Total_Esize : Uint := Uint_0;
+ --  Accumulates total Esize values of all elementary components. Used
+ --  for processing of Implicit_Packing.
+
+ Sized_Component_Total_RM_Size : Uint := Uint_0;
+ --  Accumulates total RM_Size values of all sized components. Used
+ --  for processing of Implicit_Packing.
 
  function Check_Allocator (N : Node_Id) return Node_Id;
  --  If N is an allocator, possibly wrapped in one or more level of
@@ -3835,13 +3845,22 @@ package body Freeze is
 --  this stage we might be dealing with a real component, or with
 --  an implicit subtype declaration.
 
-if not Is_Scalar_Type (Etype (Comp)) then
-   All_Scalar_Components := False;
+if Known_Static_RM_Size (Etype (Comp)) then
+   Sized_Component_Total_RM_Size :=
+ Sized_Component_Total_RM_Size + RM_Size (Etype (Comp));
+
+   if Is_Elementary_Type (Etype (Comp)) then
+  Elem_Component_Total_Esize :=
+Elem_Component_Total_Esize + Esize (Etype (Comp));
+   else
+  All_Elem_Components := False;
+
+  if RM_Size (Etype (Comp)) mod System_Storage_Unit /= 0 then
+ All_Storage_Unit_Components := False;
+  end if;
+   end if;
 else
-   Scalar_Component_Total_RM_Size :=
- Scalar_Component_Total_RM_Size + RM_Size (Etype (Comp));
-   Scalar_Component_Total_Esize :=
- Scalar_Component_Total_Esize + Esize (Etype (Comp));
+   All_Sized_Components := False;
 end if;
 
 --  If the component is an Itype with Delayed_Freeze and is either
@@ -4312,26 +4331,33 @@ package body Freeze is
 
and then not Aliased_Component
 
-   --  Must have size clause and all scalar components
+   --  Must have size clause and all sized components
 
and then Has_Size_Clause (Rec)
-   and then All_Scalar_Components
+   and then All_Sized_Components
 
--  Do not try implicit packing on records with discriminants, too
--  complicated, especially in the variant record case.
 
and then not Has_Discriminants (Rec)
 
-   --  We can implicitly pack if the specified size of the record is
-   --  less than the sum of the object sizes (no point in packing if
-   --  this is not the case).
-
-   and then RM_Size (Rec) < Scalar_Component_Total_Esize
+   --  We want to implicitly pack if the specified size of the record
+   --  is less than the sum of the object sizes (no point in packing
+   --  if this is not the case) if we can compute it, i.e. if we have
+   --  only elementary components. Otherwise, we have at l

[Ada] Extend pragma Implicit_Packing to byte-packed array types

2016-05-16 Thread Eric Botcazou
This plugs a small loophole in the implementation of pragma Implicit_Packing
for array types, when the component type is scalar and has a size multiple of
the storage unit but not a power of 2.

The following package must compile quietly:

pragma Implicit_Packing;

package P is

  type Int24 is mod 2**24;
  for Int24'Size use 24;

  type Arr is array (1 .. 8) of Int24;
  for Arr'Size use 24 * 8;

end P;


Tested on x86_64-suse-linux, applied on the mainline.


2016-05-16  Eric Botcazou  

* freeze.adb (Freeze_Array_Type): Call Addressable predicate instead
of testing for individual sizes.
(Freeze_Entity): Rework implementation of pragma Implicit_Packing for
array types, in particular test for suitable sizes upfront and do not
mimic the processing that will be redone later in Freeze_Array_Type.

-- 
Eric BotcazouIndex: freeze.adb
===
--- freeze.adb	(revision 236279)
+++ freeze.adb	(working copy)
@@ -2453,11 +2453,7 @@ package body Freeze is
 
  --  Bit packing is never needed for 8, 16, 32, 64
 
- ifCsiz = 8
-   or else Csiz = 16
-   or else Csiz = 32
-   or else Csiz = 64
- then
+ if Addressable (Csiz) then
 --  If the Esize of the component is known and equal to
 --  the component size then even packing is not needed.
 
@@ -5295,20 +5291,20 @@ package body Freeze is
 
  if E /= Base_Type (E) then
 
---  Before we do anything else, a specialized test for the case of
---  a size given for an array where the array needs to be packed,
---  but was not so the size cannot be honored. This is the case
---  where implicit packing may apply. The reason we do this so
---  early is that if we have implicit packing, the layout of the
---  base type is affected, so we must do this before we freeze
---  the base type.
+--  Before we do anything else, a specific test for the case of a
+--  size given for an array where the array would need to be packed
+--  in order for the size to be honored, but is not. This is the
+--  case where implicit packing may apply. The reason we do this so
+--  early is that, if we have implicit packing, the layout of the
+--  base type is affected, so we must do this before we freeze the
+--  base type.
 
 --  We could do this processing only if implicit packing is enabled
 --  since in all other cases, the error would be caught by the back
 --  end. However, we choose to do the check even if we do not have
 --  implicit packing enabled, since this allows us to give a more
---  useful error message (advising use of pragmas Implicit_Packing
---  or Pack).
+--  useful error message (advising use of pragma Implicit_Packing
+--  or pragma Pack).
 
 if Is_Array_Type (E) then
declare
@@ -5321,7 +5317,8 @@ package body Freeze is
   Hi   : Node_Id;
   Indx : Node_Id;
 
-  Num_Elmts : Uint;
+  Dim   : Uint;
+  Num_Elmts : Uint := Uint_1;
   --  Number of elements in array
 
begin
@@ -5337,13 +5334,21 @@ package body Freeze is
   --  a chance to freeze the base type (and it is that freeze
   --  action that causes stuff to be inherited).
 
+  --  The conditions on the size are identical to those used in
+  --  Freeze_Array_Type to set the Is_Packed flag.
+
   if Has_Size_Clause (E)
 and then Known_Static_RM_Size (E)
 and then not Is_Packed (E)
 and then not Has_Pragma_Pack (E)
 and then not Has_Component_Size_Clause (E)
 and then Known_Static_RM_Size (Ctyp)
-and then RM_Size (Ctyp) < 64
+and then Rsiz <= 64
+and then not (Addressable (Rsiz)
+   and then Known_Static_Esize (Ctyp)
+   and then Esize (Ctyp) = Rsiz)
+and then not (Rsiz mod System_Storage_Unit = 0
+   and then Is_Composite_Type (Ctyp))
 and then not Is_Limited_Composite (E)
 and then not Is_Packed (Root_Type (E))
 and then not Has_Component_Size_Clause (Root_Type (E))
@@ -5351,7 +5356,6 @@ package body Freeze is
   then
  --  Compute number of elements in array
 
- Num_Elmt

[Ada] Cleanup in the implementation of packed array types

2016-05-16 Thread Eric Botcazou
This patch cleans up the implementation of packed array types, which is shared
between the front-end proper and gigi (because the middle-end doesn't support
bit packing for array types, unlike for record types through bit fields).

The two main changes are the reduced usage of bit packing (not necessary for
composite types whose size is multiple of a byte) and the reduced usage of
the internal implementation type built by the front-end (not necessary when
the packing is entirely done in gigi).  For example, on the package:

with Interfaces; use Interfaces;

package Q is

  type Rec is record
S : Unsigned_16;
B : Unsigned_8;
  end record;

  type Arr1 is array (1 .. 8) of Rec;
  pragma Pack (Arr1);

  type Arr2 is array (1 .. 8) of Rec;
  for Arr2'Component_Size use 24;

end Q;

the packing for both array types is now done entirely in gigi and implemented
more efficiently, i.e. there is no call to the runtime.

As a side effect, this also fixes an issue with misalignment clauses applied
to unconstrained byte-packed array types, which were silently ignored in some
cases.  For example, the following package must now compile silently:

with Interfaces; use Interfaces;

package P is

  type Rec is record
I : Unsigned_32;
S : Unsigned_16;
  end record;

  type Arr is array (Positive range <>) of Rec;
  pragma Pack (Arr);
  for Arr'Alignment use 1;

  type CArr is array (1 .. 4) of Rec;
  pragma Pack (CArr);
  for CArr'Alignment use 1;

  A : Arr (1 .. 4);
  for A'Alignment use 1;

end P;


Tested on x86_64-suse-linux, applied on the mainline.


2016-05-16  Eric Botcazou  

* doc/gnat_rm/implementation_defined_attributes.rst
(Scalar_Storage_Order): Adjust restriction for packed array types.
* einfo.ads (Is_Bit_Packed_Array): Adjust description.
(Is_Packed): Likewise.
(Is_Packed_Array_Impl_Type): Likewise.
(Packed_Array_Impl_Type): Likewise.
* exp_ch4.adb (Expand_N_Indexed_Component): Don't do anything special
if the prefix is not a packed array implemented specially.
* exp_ch6.adb (Expand_Actuals): Expand indexed components only for
bit-packed array types.
* exp_pakd.adb (Install_PAT): Set Is_Packed_Array_Impl_Type flag on
the PAT before analyzing its declaration.
(Create_Packed_Array_Impl_Type): Remove redundant statements.
* freeze.adb (Check_Component_Storage_Order): Reject packed array
components only if they are bit packed.
(Freeze_Array_Type): Fix logic detecting bit packing and do not bit
pack for composite types whose size is multiple of a byte.
Create the implementation type for packed array types only when it is
needed, i.e. bit packing or packing because of holes in index types.
Make sure the Has_Non_Standard_Rep and Is_Packed flags agree.
* gcc-interface/gigi.h (make_packable_type): Add MAX_ALIGN parameter.
* gcc-interface/decl.c (gnat_to_gnu_entity)
Call maybe_pad_type instead of building the padding type manually.
(gnat_to_gnu_entity) : Do not assert that
Packed_Array_Impl_Type is present for packed arrays.
(gnat_to_gnu_component_type): Also handle known alignment for packed
types by passing it to make_packable_type.
* gcc-interface/utils.c (make_packable_type): Add MAX_ALIGN parameter
and deal with it in the array case.  Adjust recursive call.  Simplify
computation of new size and cap the alignment to BIGGEST_ALIGNMENT.

-- 
Eric Botcazou
Index: doc/gnat_rm/implementation_defined_attributes.rst
===
--- doc/gnat_rm/implementation_defined_attributes.rst	(revision 236264)
+++ doc/gnat_rm/implementation_defined_attributes.rst	(working copy)
@@ -969,7 +969,7 @@ must have the same scalar storage order
 If a component of `T` is of a record or array type, then that type must
 also have a `Scalar_Storage_Order` attribute definition clause.
 
-A component of a record or array type that is a packed array, or that
+A component of a record or array type that is a bit-packed array, or that
 does not start on a byte boundary, must have the same scalar storage order
 as the enclosing record or array type.
 
Index: einfo.ads
===
--- einfo.ads	(revision 236264)
+++ einfo.ads	(working copy)
@@ -2268,9 +2268,9 @@ package Einfo is
 --   is bit packed (i.e. the component size is known by the front end and
 --   is in the range 1-7, 9-15, 17-31, or 33-63). Is_Packed is always set
 --   if Is_Bit_Packed_Array is set, but it is possible for Is_Packed to be
---   set without Is_Bit_Packed_Array for the case of an array having one or
---   more index types that are enumeration types with non-standard
---   enumeration representations.
+--   set without Is_Bit_Packed_Array if the component size is not known by
+--   t

[PATCH GCC]Document vect_cond_mixed in sourcebuild.texi

2016-05-16 Thread Bin Cheng
Hi,
This is an obvious patch documenting vect_cond_mixed in sourcebuild.texi.  OK?

Thanks,
bin

2016-05-13  bin cheng  

* doc/sourcebuild.texi (@item vect_cond_mixed): New item.diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 3142cd5..f1bd9be 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1342,6 +1342,10 @@ Target supports Fortran @code{real} kinds larger than 
@code{real(8)}.
 @item vect_condition
 Target supports vector conditional operations.
 
+@item vect_cond_mixed
+Target supports vector conditional operations where comparison operands
+have different type from the value operands.
+
 @item vect_double
 Target supports hardware vectors of @code{double}.
 


Re: [PATCH][AArch64] Improve aarch64_case_values_threshold setting

2016-05-16 Thread Wilco Dijkstra
ping


From: Wilco Dijkstra
Sent: 22 April 2016 17:15
To: gcc-patches@gcc.gnu.org
Cc: nd
Subject: [PATCH][AArch64] Improve aarch64_case_values_threshold setting

GCC expands switch statements in a very simplistic way and tries to use a table
expansion even when it is a bad idea for performance or codesize.
GCC typically emits extremely sparse tables that contain mostly default entries
(something which currently cannot be tuned by backends).  Additionally the
computation of the minimum/maximum label offsets is too simplistic so the tables
are often twice as large as necessary.

The cost of a table switch is significant due to the setup overhead, the table
lookup (which due to being sparse and large adds unnecessary cache misses)
and hard to predict indirect jump.  Therefore it is best to avoid using a table
unless there are many real case labels.

This patch fixes that by setting the default aarch64_case_values_threshold to
16 when the per-CPU tuning is not set.  On SPEC2006 this improves the switch
heavy benchmarks GCC and perlbench both in performance (1-2%) as well as size
(0.5-1% smaller).

OK for trunk?

ChangeLog:
2016-04-22  Wilco Dijkstra  

gcc/
* config/aarch64/aarch64.c (aarch64_case_values_threshold):
Return a better case_values_threshold when optimizing.

--
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0620f1e..a240635 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3546,7 +3546,12 @@ aarch64_cannot_force_const_mem (machine_mode mode 
ATTRIBUTE_UNUSED, rtx x)
   return aarch64_tls_referenced_p (x);
 }

-/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
+/* Implement TARGET_CASE_VALUES_THRESHOLD.
+   The expansion for a table switch is quite expensive due to the number
+   of instructions, the table lookup and hard to predict indirect jump.
+   When optimizing for speed, with -O3 use the per-core tuning if set,
+   otherwise use tables for > 16 cases as a tradeoff between size and
+   performance.  */

 static unsigned int
 aarch64_case_values_threshold (void)
@@ -3557,7 +3562,7 @@ aarch64_case_values_threshold (void)
   && selected_cpu->tune->max_case_values != 0)
 return selected_cpu->tune->max_case_values;
   else
-return default_case_values_threshold ();
+return optimize_size ? default_case_values_threshold () : 17;
 }




Re: [PATCH][AArch64] print_operand should not fallthrough from register operand into generic operand

2016-05-16 Thread Wilco Dijkstra
ping


From: Wilco Dijkstra
Sent: 27 April 2016 17:39
To: James Greenhalgh
Cc: gcc-patches@gcc.gnu.org; nd
Subject: Re: [PATCH][AArch64] print_operand should not fallthrough from 
register operand into generic operand

James Greenhalgh wrote:
> So the part of this patch removing the fallthrough to general operand
> is not OK for trunk.
>
> The other parts look reasonable to me, please resubmit just those.

Right, I removed the removal of the fallthrough. Here is the revised version:

ChangeLog:
2016-04-27  Wilco Dijkstra  

gcc/
* config/aarch64/aarch64.md
(add3_compareC_cconly_imm): Remove use of %w.
(add3_compareC_imm): Likewise.
(si3_uxtw): Split into register and immediate variants.
(andsi3_compare0_uxtw): Likewise.
(and3_compare0): Likewise.
(and3nr_compare0): Likewise.
(stack_protect_test_): Don't use %x for memory operands.

--

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
19981c205d3e2a6102510647bde9b29906a4fdc9..4e41b3b0f5b2369431ffec1a0029af53fc5aebd9
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1755,7 +1755,7 @@
   "aarch64_zero_extend_const_eq (mode, operands[2],
 mode, operands[1])"
   "@
-  cmn\\t%0, %1
+  cmn\\t%0, %1
   cmp\\t%0, #%n1"
   [(set_attr "type" "alus_imm")]
 )
@@ -1787,11 +1787,11 @@
   "aarch64_zero_extend_const_eq (mode, operands[3],
  mode, operands[2])"
   "@
-  adds\\t%0, %1, %2
+  adds\\t%0, %1, %2
   subs\\t%0, %1, #%n2"
   [(set_attr "type" "alus_imm")]
 )
-
+
 (define_insn "add3_compareC"
   [(set (reg:CC_C CC_REGNUM)
(ne:CC_C
@@ -3394,7 +3394,9 @@
  (LOGICAL:SI (match_operand:SI 1 "register_operand" "%r,r")
 (match_operand:SI 2 "aarch64_logical_operand" "r,K"]
   ""
-  "\\t%w0, %w1, %w2"
+  "@
+   \\t%w0, %w1, %w2
+   \\t%w0, %w1, %2"
   [(set_attr "type" "logic_reg,logic_imm")]
 )

@@ -3407,7 +3409,9 @@
(set (match_operand:GPI 0 "register_operand" "=r,r")
(and:GPI (match_dup 1) (match_dup 2)))]
   ""
-  "ands\\t%0, %1, %2"
+  "@
+   ands\\t%0, %1, %2
+   ands\\t%0, %1, %2"
   [(set_attr "type" "logics_reg,logics_imm")]
 )

@@ -3421,7 +3425,9 @@
(set (match_operand:DI 0 "register_operand" "=r,r")
(zero_extend:DI (and:SI (match_dup 1) (match_dup 2]
   ""
-  "ands\\t%w0, %w1, %w2"
+  "@
+   ands\\t%w0, %w1, %w2
+   ands\\t%w0, %w1, %2"
   [(set_attr "type" "logics_reg,logics_imm")]
 )

@@ -3775,7 +3781,9 @@
  (match_operand:GPI 1 "aarch64_logical_operand" "r,"))
 (const_int 0)))]
   ""
-  "tst\\t%0, %1"
+  "@
+   tst\\t%0, %1
+   tst\\t%0, %1"
   [(set_attr "type" "logics_reg,logics_imm")]
 )

@@ -5170,7 +5178,7 @@
 UNSPEC_SP_TEST))
(clobber (match_scratch:PTR 3 "=&r"))]
   ""
-  "ldr\t%3, %x1\;ldr\t%0, %x2\;eor\t%0, %3, %0"
+  "ldr\t%3, %1\;ldr\t%0, %2\;eor\t%0, %3, %0"
   [(set_attr "length" "12")
(set_attr "type" "multiple")])




Re: Enable inlining into thunks

2016-05-16 Thread Jan Hubicka
> On 05/16/2016 12:22 AM, Jan Hubicka wrote:
> > Hi,
> > this patch teaches the inliner to inline into thunks. This is easy to do - all we 
> > need
> > is to produce a gimple body when we decide to do so. This fixes some ages 
> > old xfails
> > and enables some 40k inlines in Firefox. Not all those inlines are win, 
> > because
> > the cost model of thunks is wrong.  We need to model that thunk calls are 
> > really just
> > simple jumps. I will do that incrementally.
> > 
> > Bootstrapped/regtested x86_64-linux, will commit it tomorrow.
> 
> Hi Honza.
> 
> I've spotted a new ICE after I've applied all your patches (4) related
> to thunk expansion.
> 
> tc.ii:13:80: error: Thunk is not supposed to have body
>  int C::m_fn2(B &p1, int p2, int p3, int &p4) { m_fn1()->m_fn2(p1, p2, p3, 
> p4); }

I forgot two hunks in the final patch, here is version I comitted (which also 
fixes the ICE you mention):

Index: cgraph.c
===
--- cgraph.c(revision 236269)
+++ cgraph.c(working copy)
@@ -3324,7 +3324,7 @@ cgraph_node::verify_node (void)
  error ("More than one edge out of thunk node");
   error_found = true;
}
-  if (gimple_has_body_p (decl))
+  if (gimple_has_body_p (decl) && !global.inlined_to)
 {
  error ("Thunk is not supposed to have body");
   error_found = true;
Index: cgraphclones.c
===
--- cgraphclones.c  (revision 236269)
+++ cgraphclones.c  (working copy)
@@ -337,8 +337,6 @@ duplicate_thunk_for_node (cgraph_node *t
 
   cgraph_edge *e = new_thunk->create_edge (node, NULL, 0,
  CGRAPH_FREQ_BASE);
-  e->call_stmt_cannot_inline_p = true;
-  e->inline_failed = CIF_THUNK;
   symtab->call_edge_duplication_hooks (thunk->callees, e);
   symtab->call_cgraph_duplication_hooks (thunk, new_thunk);
   return new_thunk;
Index: cif-code.def
===
--- cif-code.def(revision 236269)
+++ cif-code.def(working copy)
@@ -95,10 +95,6 @@ DEFCIFCODE(MISMATCHED_ARGUMENTS, CIF_FIN
 DEFCIFCODE(LTO_MISMATCHED_DECLARATIONS, CIF_FINAL_ERROR,
   N_("mismatched declarations during linktime optimization"))
 
-/* Caller is thunk.  */
-DEFCIFCODE(THUNK, CIF_FINAL_ERROR, 
-  N_("thunk call"))
-
 /* Call was originally indirect.  */
 DEFCIFCODE(ORIGINALLY_INDIRECT_CALL, CIF_FINAL_NORMAL,
   N_("originally indirect function call not considered for inlining"))
Index: ipa-inline-analysis.c
===
--- ipa-inline-analysis.c   (revision 236269)
+++ ipa-inline-analysis.c   (working copy)
@@ -2932,11 +2932,13 @@ compute_inline_parameters (struct cgraph
   struct inline_edge_summary *es = inline_edge_summary (node->callees);
   struct predicate t = true_predicate ();
 
-  node->callees->inline_failed = CIF_THUNK;
   node->local.can_change_signature = false;
-  es->call_stmt_size = INLINE_SIZE_SCALE;
-  es->call_stmt_time = INLINE_TIME_SCALE;
-  account_size_time (info, INLINE_SIZE_SCALE * 2, INLINE_TIME_SCALE * 2, 
&t);
+  es->call_stmt_size = eni_size_weights.call_cost;
+  es->call_stmt_time = eni_time_weights.call_cost;
+  account_size_time (info, INLINE_SIZE_SCALE * 2,
+INLINE_TIME_SCALE * 2, &t);
+  t = not_inlined_predicate ();
+  account_size_time (info, 2 * INLINE_SIZE_SCALE, 0, &t);
   inline_update_overall_summary (node);
   info->self_size = info->size;
   info->self_time = info->time;
Index: ipa-inline-transform.c
===
--- ipa-inline-transform.c  (revision 236269)
+++ ipa-inline-transform.c  (working copy)
@@ -314,12 +314,20 @@ inline_call (struct cgraph_edge *e, bool
   /* Don't even think of inlining inline clone.  */
   gcc_assert (!callee->global.inlined_to);
 
-  e->inline_failed = CIF_OK;
-  DECL_POSSIBLY_INLINED (callee->decl) = true;
-
   to = e->caller;
   if (to->global.inlined_to)
 to = to->global.inlined_to;
+  if (to->thunk.thunk_p)
+{
+  if (in_lto_p)
+   to->get_untransformed_body ();
+  to->expand_thunk (false, true);
+  e = to->callees;
+}
+
+
+  e->inline_failed = CIF_OK;
+  DECL_POSSIBLY_INLINED (callee->decl) = true;
 
   if (DECL_FUNCTION_PERSONALITY (callee->decl))
 DECL_FUNCTION_PERSONALITY (to->decl)
@@ -580,7 +588,7 @@ preserve_function_body_p (struct cgraph_
   gcc_assert (!node->alias && !node->thunk.thunk_p);
 
   /* Look if there is any clone around.  */
-  if (node->clones)
+  if (node->clones && !node->clones->thunk.thunk_p)
 return true;
   return false;
 }
Index: testsuite/g++.dg/ipa/ivinline-7.C
=

[Ada] Improve array packing for small record component

2016-05-16 Thread Eric Botcazou
This change ensures that the packing of array types subject to pragma Pack
and whose component type is a record with a size in the range 33 .. 63 bits
is optimal, in the sense that the 'Component_Size is equal to the 'Size of
the component type.

The following package P must yield the specified output with -gnatR1:

package P is

  type R is record
I : Integer;
B : Boolean;
  end record;
  pragma Pack (R);

  type A1 is array (1 .. 8) of R;
  pragma Pack (A1);

  type A2 is array (1 .. 8) of R;
  for A2'Component_Size use 33;

end P;

Representation information for unit P (spec)


for R'Object_Size use 40;
for R'Value_Size use 33;
for R'Alignment use 1;
for R use record
   I at 0 range  0 .. 31;
   B at 4 range  0 ..  0;
end record;

for A1'Size use 264;
for A1'Alignment use 1;
for A1'Component_Size use 33;

for A2'Size use 264;
for A2'Alignment use 1;
for A2'Component_Size use 33;


Tested on x86_64-suse-linux, applied on the mainline.


2016-05-16  Eric Botcazou  

* exp_util.adb (Remove_Side_Effects): Also make a constant if we need
to capture the value for a small not by-reference record type.
* freeze.ads (Check_Compile_Time_Size): Adjust comment.
* freeze.adb (Set_Small_Size): Likewise.  Accept a size in the range
of 33 .. 64 bits.
(Check_Compile_Time_Size): Merge scalar and access type cases. Change
variable name in array type case.  For the computation of the packed
size, deal with record components and remove redundant test.
(Freeze_Array_Type): Also adjust packing status when the size of the
component type is in the range 33 .. 64 bits.
* doc/gnat_rm/representation_clauses_and_pragmas.rst: Turn primitive
into elementary type throughout.  Minor tweaks.
(Alignment Clauses): Document actual alignment of packed array types.
(Pragma Pack for Arrays): List only the 3 main cases and adjust.  Add
"simple" to the record case.  Document effect on non packable types.
(Pragma Pack for Records): Likewise.  Add record case and adjust.


-- 
Eric BotcazouIndex: doc/gnat_rm/representation_clauses_and_pragmas.rst
===
--- doc/gnat_rm/representation_clauses_and_pragmas.rst	(revision 236264)
+++ doc/gnat_rm/representation_clauses_and_pragmas.rst	(working copy)
@@ -32,9 +32,9 @@ GNAT requires that all alignment clauses
 default alignments are always a power of 2.  The default alignment
 values are as follows:
 
-* *Primitive Types*.
+* *Elementary Types*.
 
-  For primitive types, the alignment is the minimum of the actual size of
+  For elementary types, the alignment is the minimum of the actual size of
   objects of the type divided by `Storage_Unit`,
   and the maximum alignment supported by the target.
   (This maximum alignment is given by the GNAT-specific attribute
@@ -53,10 +53,11 @@ values are as follows:
   For arrays, the alignment is equal to the alignment of the component type
   for the normal case where no packing or component size is given.  If the
   array is packed, and the packing is effective (see separate section on
-  packed arrays), then the alignment will be one for long packed arrays,
-  or arrays whose length is not known at compile time.  For short packed
+  packed arrays), then the alignment will be either 4, 2 or 1 for long packed
+  arrays or arrays whose length is not known at compile time, depending on
+  whether the component size is divisible by 4, 2 or is odd.  For short packed
   arrays, which are handled internally as modular types, the alignment
-  will be as described for primitive types, e.g., a packed array of length
+  will be as described for elementary types, e.g. a packed array of length
   31 bits will have an object size of four bytes, and an alignment of 4.
 
 * *Records*.
@@ -789,7 +790,7 @@ restrictions placed on component clauses
   little-endian machines, this must be explicitly programmed.  This capability
   is not provided by `Bit_Order`.
 
-* Components that are positioned across byte boundaries
+* Components that are positioned across byte boundaries.
 
   but do not occupy an integral number of bytes.  Given that bytes are not
   reordered, such fields would occupy a non-contiguous sequence of bits
@@ -1069,22 +1070,23 @@ Pragma Pack for Arrays
 
 .. index:: Pragma Pack (for arrays)
 
-Pragma `Pack` applied to an array has no effect unless the component type
-is packable.  For a component type to be packable, it must be one of the
-following cases:
+Pragma `Pack` applied to an array has an effect that depends upon whether the
+component type is *packable*.  For a component type to be *packable*, it must
+be one of the following cases:
 
-*
-  Any scalar type
-*
-  Any type whose size is specified with a size clause
-*
-  Any packed array type with a static size
-*
-  Any record type padded because of its default alignme

Re: [PATCH 1/3] Encapsulate comp_cost within a class with methods.

2016-05-16 Thread Bin.Cheng
On Mon, Apr 25, 2016 at 10:42 AM, marxin  wrote:
> gcc/ChangeLog:
>
> 2016-04-25  Martin Liska  
>
> * tree-ssa-loop-ivopts.c(comp_cost::operator=): New function.
> (comp_cost::infinite_cost_p): Likewise.
> (operator+): Likewise.
> (comp_cost::operator+=): Likewise.
> (comp_cost::operator-=): Likewise.
> (comp_cost::operator/=): Likewise.
> (comp_cost::operator*=): Likewise.
> (operator-): Likewise.
> (operator<): Likewise.
> (operator==): Likewise.
> (operator<=): Likewise.
> (comp_cost::get_cost): Likewise.
> (comp_cost::set_cost): Likewise.
> (comp_cost::get_complexity): Likewise.
> (comp_cost::set_complexity): Likewise.
> (comp_cost::get_scratch): Likewise.
> (comp_cost::set_scratch): Likewise.
> (comp_cost::get_infinite): Likewise.
> (comp_cost::get_no_cost): Likewise.
> (struct ivopts_data): Rename inv_expr_id to max_inv_expr_id;
> (tree_ssa_iv_optimize_init): Use the renamed property.
> (new_cost): Remove.
> (infinite_cost_p): Likewise.
> (add_costs): Likewise.
> (sub_costs): Likewise.
> (compare_costs): Likewise.
> (set_group_iv_cost): Use comp_cost::infinite_cost_p.
> (get_address_cost): Use new comp_cost::comp_cost.
> (get_shiftadd_cost): Likewise.
> (force_expr_to_var_cost): Use new comp_cost::get_no_cost.
> (split_address_cost): Likewise.
> (ptr_difference_cost): Likewise.
> (difference_cost): Likewise.
> (get_expr_id): Use max_inv_expr_id.
> (get_computation_cost_at): Use comp_cost::get_infinite.
> (determine_group_iv_cost_generic): Use comp_cost::get_no_cost.
> (determine_group_iv_cost_address): Likewise.
> (determine_group_iv_cost_cond): Use comp_const::infinite_cost_p.
> (autoinc_possible_for_pair): Likewise.
> (determine_group_iv_costs): Use new methods of comp_cost.
> (determine_iv_cost): Likewise.
> (cheaper_cost_pair): Use comp_cost operators.
> (iv_ca_recount_cost): Likewise.
> (iv_ca_set_no_cp): Likewise.
> (iv_ca_set_cp): Likewise.
> (iv_ca_cost): Use comp_cost::get_infinite.
> (iv_ca_new): Use comp_cost::get_no_cost.
> (iv_ca_dump): Use new methods of comp_cost.
> (iv_ca_narrow): Use operators of comp_cost.
> (iv_ca_prune): Likewise.
> (iv_ca_replace): Likewise.
> (try_add_cand_for): Likewise.
> (try_improve_iv_set): Likewise.
> (find_optimal_iv_set): Use new methods of comp_cost.
> (free_loop_data): Use renamed max_inv_expr_id.
> ---
Hi Martin,
Could you please rebase this patch and the profiling one against
latest trunk?  The third patch was applied before these two now.

Thanks,
bin


Fix recursive inlining into thunks

2016-05-16 Thread Jan Hubicka
Hi,
this patch fixes an ICE when a thunk is inlined and a call within the thunk is
inlined too. We need to recurse in this case because inlining a thunk does not
introduce new basic blocks like normal inlining does.

Bootstrapped/regtested x86_64-linux, committed.

Honza

Index: ChangeLog
===
--- ChangeLog   (revision 236272)
+++ ChangeLog   (working copy)
@@ -1,5 +1,9 @@
 2016-05-16  Jan Hubicka  
 
+   * tree-inline.c (expand_call_inline): recurse after inlining thunk.
+
+2016-05-16  Jan Hubicka  
+
* tree.c (free_lang_data_in_decl): Also set target/optimization flags
for thunks.
 
Index: tree-inline.c
===
--- tree-inline.c   (revision 236269)
+++ tree-inline.c   (working copy)
@@ -4485,6 +4485,7 @@ expand_call_inline (basic_block bb, gimp
   gimple_call_set_fndecl (stmt, edge->callee->decl);
   update_stmt (stmt);
   id->src_node->remove ();
+  expand_call_inline (bb, stmt, id);
   return true;
 }
   fn = cg_edge->callee->decl;


RE: [PATCH][MIPS] Enable LSA/DLSA for MSA

2016-05-16 Thread Matthew Fortune
Robert Suchanek 
> The below enables LSA/DLSA instructions for -mmsa.
> 
> Ok to commit?

OK.

There is a corresponding testsuite change needed for this
as some code quality tests change if LSA is available.  This
is the HAS_LSA 'ghost' option in mips.exp.  I'm happy to leave
this to be dealt with as part of the overall MSA testsuite
patch though.

Thanks,
Matthew


Re: Enable inlining into thunks

2016-05-16 Thread Martin Liška
On 05/16/2016 12:22 AM, Jan Hubicka wrote:
> Hi,
> this patch teach inliner to inline into thunks. This is easy to do - all we 
> need
> is to produce a gimple body when we decide to do so. This fixes some ages old 
> xfails
> and enables some 40k inlines in Firefox. Not all those inlines are win, 
> because
> the cost model of thunks is wrong.  We need to model that thunk calls are 
> really just
> simple jumps. I will do that incrementally.
> 
> Bootstrapped/regtested x86_64-linux, will commit it tomorrow.

Hi Honza.

I've spotted a new ICE after I've applied all your patches (4) related
to thunk expansion.

tc.ii:13:80: error: Thunk is not supposed to have body
 int C::m_fn2(B &p1, int p2, int p3, int &p4) { m_fn1()->m_fn2(p1, p2, p3, p4); 
}

^
_ZThn8_N1C5m_fn2ER1BiiRi/37 (virtual int C::_ZThn8_N1C5m_fn2ER1BiiRi(B&, int, 
int, int&)) @0x7fc693311450
  Type: function definition analyzed
  Visibility: public virtual artificial
  next sharing asm name: 35
  References: 
  Referring: 
  Function virtual int C::_ZThn8_N1C5m_fn2ER1BiiRi(B&, int, int, int&)/37 is 
inline copy in virtual int C::m_fn2(B&, int, int, int&)/0
  Clone of _ZThn8_N1C5m_fn2ER1BiiRi/19
  Availability: local
  First run: 0
  Function flags: body local
  Thunk fixed offset -8 virtual value 0 has virtual offset 0)
  Called by: _ZN1C5m_fn2ER1BiiRi/36 (speculative) (inlined) (0.33 per call) 
(can throw external) 
  Calls: *.LTHUNK0/1 (0.33 per call) (can throw external) 
tc.ii:13:80: internal compiler error: verify_cgraph_node failed
0xb3258f cgraph_node::verify_node()
../../gcc/cgraph.c:3444
0xb20acb symtab_node::verify()
../../gcc/symtab.c:1177
0xb20b54 symtab_node::verify_symtab_nodes()
../../gcc/symtab.c:1197
0xb40bc6 symtab_node::checking_verify_symtab_nodes()
../../gcc/cgraph.h:614
0xdc9bc2 symbol_table::remove_unreachable_nodes(_IO_FILE*)
../../gcc/ipa.c:690
0x178f732 ipa_inline
../../gcc/ipa-inline.c:2444
0x179038e execute
../../gcc/ipa-inline.c:2848

Thanks,
Martin
class A;
class B {
public:
  virtual A *m_fn1();
};
class A {
public:
  virtual int m_fn2(B &, int, int, int &) = 0;
};
class C : B, A {
  int m_fn2(B &, int, int, int &);
};
int C::m_fn2(B &p1, int p2, int p3, int &p4) { m_fn1()->m_fn2(p1, p2, p3, p4); }


RE: [PATCH][MIPS] Correct latency of loads in M5100

2016-05-16 Thread Matthew Fortune
Robert Suchanek  writes:
> A small patch to correct the latency for M5100.
> 
> Ok to commit?

>   * config/mips/m5100.md (m51_int_load): Update the latency to 2.

OK.

Matthew


[AArch64, 4/4] Reimplement vmvn* intrinscis, remove inline assembly

2016-05-16 Thread Jiong Wang

This patch removes inline assembly and reimplements all mvn/mvnq vector
integer intrinsics through the standard "one_cmpl2" pattern, which was
introduced after the initial implementation of those intrinsics; that is
why inline assembly was used historically.

OK for trunk?

No regression on the existing advsimd-intrinsics/vmvn.c.

2016-05-16  Jiong Wang

gcc/
  * config/aarch64/arm_neon.h (vmvn_s8): Reimplement using C operator.
  Remove inline assembly.
  (vmvn_s16): Likewise.
  (vmvn_s32): Likewise.
  (vmvn_u8): Likewise.
  (vmvn_u16): Likewise.
  (vmvn_u32): Likewise.
  (vmvnq_s8): Likewise.
  (vmvnq_s16): Likewise.
  (vmvnq_s32): Likewise.
  (vmvnq_u8): Likewise.
  (vmvnq_u16): Likewise.
  (vmvnq_u32): Likewise.
  (vmvn_p8): Likewise.
  (vmvnq_p16): Likewise.

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 432a1fad9a6df6fef844896df5e8ad29cc31f548..ae4c429a87822a8807f2d2ec054d3194b39ef6ac 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8093,161 +8093,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vmvn_p8 (poly8x8_t a)
-{
-  poly8x8_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmvn_s8 (int8x8_t a)
-{
-  int8x8_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmvn_s16 (int16x4_t a)
-{
-  int16x4_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmvn_s32 (int32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmvn_u8 (uint8x8_t a)
-{
-  uint8x8_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmvn_u16 (uint16x4_t a)
-{
-  uint16x4_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmvn_u32 (uint32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("mvn %0.8b,%1.8b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vmvnq_p8 (poly8x16_t a)
-{
-  poly8x16_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmvnq_s8 (int8x16_t a)
-{
-  int8x16_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmvnq_s16 (int16x8_t a)
-{
-  int16x8_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmvnq_s32 (int32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmvnq_u8 (uint8x16_t a)
-{
-  uint8x16_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmvnq_u16 (uint16x8_t a)
-{
-  uint16x8_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmvnq_u32 (uint32x4_t a)
-{
-  uint32x4_t result;
-  __asm__ ("mvn %0.16b,%1.16b"
-   : "=w"(result)
-   : "w"(a)
-   : /* No clobbers */);
-  return result;
-}
-
-
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vpadal_s8 (int16x4_t a, int8x8_t b)
 {
@@ -18622,6 +18467,92 @@ vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
   return __a * __b;
 }
 
+/* vmvn  */
+
+__

[AArch64, 3/4] Reimplement multiply by element to get rid of inline assembly

2016-05-16 Thread Jiong Wang

This patch reimplements vector multiply by element on top of the existing
vmul_lane* intrinsics instead of inline assembly.

There is no code generation change from this patch.

OK for trunk?

2016-05-16  Jiong Wang

gcc/
  * config/aarch64/aarch64-simd.md (vmul_n_f32): Remove inline assembly.
  Use builtin.
  (vmul_n_s16): Likewise.
  (vmul_n_s32): Likewise.
  (vmul_n_u16): Likewise.
  (vmul_n_u32): Likewise.
  (vmulq_n_f32): Likewise.
  (vmulq_n_f64): Likewise.
  (vmulq_n_s16): Likewise.
  (vmulq_n_s32): Likewise.
  (vmulq_n_u16): Likewise.
  (vmulq_n_u32): Likewise.

gcc/testsuite/
  * gcc.target/aarch64/simd/vmul_elem_1.c: Use intrinsics.

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ca7ace5aa656163826569d046fcbf02f9f7d4d6c..84931aeec2d885f8552197fe8a72500f127e2bbb 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7938,61 +7938,6 @@ vmovn_u64 (uint64x2_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t a, float32_t b)
-{
-  float32x2_t result;
-  __asm__ ("fmul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t a, int16_t b)
-{
-  int16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t a, int32_t b)
-{
-  int32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t a, uint16_t b)
-{
-  uint16x4_t result;
-  __asm__ ("mul %0.4h,%1.4h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t a, uint32_t b)
-{
-  uint32x2_t result;
-  __asm__ ("mul %0.2s,%1.2s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
 #define vmull_high_lane_s16(a, b, c)\
   __extension__ \
 ({  \
@@ -8443,72 +8388,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t a, float32_t b)
-{
-  float32x4_t result;
-  __asm__ ("fmul %0.4s,%1.4s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vmulq_n_f64 (float64x2_t a, float64_t b)
-{
-  float64x2_t result;
-  __asm__ ("fmul %0.2d,%1.2d,%2.d[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t a, int16_t b)
-{
-  int16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t a, int32_t b)
-{
-  int32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t a, uint16_t b)
-{
-  uint16x8_t result;
-  __asm__ ("mul %0.8h,%1.8h,%2.h[0]"
-   : "=w"(result)
-   : "w"(a), "x"(b)
-   : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t a, uint32_t b)
-{
-  uint32x4_t result;
-  __asm__ ("mul %0.4s,%1.4s,%2.s[0]"
-   : "=w"(result)
-   : "w"(a), "w"(b)
-   : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vmvn_p8 (poly8x8_t a)
 {
@@ -18924,6 +18803,74 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
   return __a * __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vmul_n.  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+  return __a * __b;
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_in

[AArch64, 2/4] Extend vector multiply by element to all supported modes

2016-05-16 Thread Jiong Wang

AArch64 supports vector multiply by element for V2DF, V2SF, V4SF, V2SI,
V4SI, V4HI, V8HI.

All above are well supported by "*aarch64_mul3_elt" pattern and
"*aarch64_mul3_elt_" if there is lane size
change.

Above patterns are trying to match "(mul (vec_dup (vec_select)))"
which is genuinely vector multiply by element.

Vector multiply by element can also come from "(mul (vec_dup
(scalar" where the scalar value is already sitting in a vector register
and is then duplicated to the other lanes, with no lane size change.

We have "*aarch64_mul3_elt_to_128df" to match this already, but it's
restricted for V2DF while this patch extends this support to more modes,
for example vector integer operations.

For the testcase included, the following codegen change will happen:


-   ldr w0, [x3, 160]
-   dup v1.2s, w0
-   mul v1.2s, v1.2s, v2.2s
+   ldr s1, [x3, 160]
+   mul v1.2s, v0.2s, v1.s[0]

OK for trunk?

2016-05-16  Jiong Wang

gcc/
  * config/aarch64/aarch64-simd.md (*aarch64_mul3_elt_to_128df): Extend to all
  supported modes.  Rename to "*aarch64_mul3_elt_from_dup".

gcc/testsuite/
  * gcc.target/aarch64/simd/vmul_elem_1.c: New.

  
--

Regards,
Jiong

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index eb18defef15c24bf2334045e92bf7c34b989136d..7f338ff78fabccee868a4befbffed54c3e842dc9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -371,15 +371,15 @@
   [(set_attr "type" "neon_mul__scalar")]
 )
 
-(define_insn "*aarch64_mul3_elt_to_128df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
- (mult:V2DF
-   (vec_duplicate:V2DF
-	 (match_operand:DF 2 "register_operand" "w"))
-  (match_operand:V2DF 1 "register_operand" "w")))]
+(define_insn "*aarch64_mul3_elt_from_dup"
+ [(set (match_operand:VMUL 0 "register_operand" "=w")
+(mult:VMUL
+  (vec_duplicate:VMUL
+	(match_operand: 1 "register_operand" ""))
+  (match_operand:VMUL 2 "register_operand" "w")))]
   "TARGET_SIMD"
-  "fmul\\t%0.2d, %1.2d, %2.d[0]"
-  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
+  "mul\t%0., %2., %1.[0]";
+  [(set_attr "type" "neon_mul__scalar")]
 )
 
 (define_insn "aarch64_rsqrte_2"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
new file mode 100644
index ..290a4e9adbc5d9ce1335ca28120e437293776f30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmul_elem_1.c
@@ -0,0 +1,519 @@
+/* Test the vmul_n_f64 AArch64 SIMD intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-options "-O2 --save-temps" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+#define A (132.4f)
+#define B (-0.0f)
+#define C (-34.8f)
+#define D (289.34f)
+float32_t expected2_1[2] = {A * A, B * A};
+float32_t expected2_2[2] = {A * B, B * B};
+float32_t expected4_1[4] = {A * A, B * A, C * A, D * A};
+float32_t expected4_2[4] = {A * B, B * B, C * B, D * B};
+float32_t expected4_3[4] = {A * C, B * C, C * C, D * C};
+float32_t expected4_4[4] = {A * D, B * D, C * D, D * D};
+float32_t _elemA = A;
+float32_t _elemB = B;
+float32_t _elemC = C;
+float32_t _elemD = D;
+
+#define AD (1234.5)
+#define BD (-0.0)
+#define CD (71.3)
+#define DD (-1024.4)
+float64_t expectedd2_1[2] = {AD * CD, BD * CD};
+float64_t expectedd2_2[2] = {AD * DD, BD * DD};
+float64_t _elemdC = CD;
+float64_t _elemdD = DD;
+
+
+#define AS (1024)
+#define BS (-31)
+#define CS (0)
+#define DS (655)
+int32_t expecteds2_1[2] = {AS * AS, BS * AS};
+int32_t expecteds2_2[2] = {AS * BS, BS * BS};
+int32_t expecteds4_1[4] = {AS * AS, BS * AS, CS * AS, DS * AS};
+int32_t expecteds4_2[4] = {AS * BS, BS * BS, CS * BS, DS * BS};
+int32_t expecteds4_3[4] = {AS * CS, BS * CS, CS * CS, DS * CS};
+int32_t expecteds4_4[4] = {AS * DS, BS * DS, CS * DS, DS * DS};
+int32_t _elemsA = AS;
+int32_t _elemsB = BS;
+int32_t _elemsC = CS;
+int32_t _elemsD = DS;
+
+#define AH ((int16_t) 0)
+#define BH ((int16_t) -32)
+#define CH ((int16_t) 102)
+#define DH ((int16_t) -51)
+#define EH ((int16_t) 71)
+#define FH ((int16_t) -91)
+#define GH ((int16_t) 48)
+#define HH ((int16_t) 255)
+int16_t expectedh4_1[4] = {AH * AH, BH * AH, CH * AH, DH * AH};
+int16_t expectedh4_2[4] = {AH * BH, BH * BH, CH * BH, DH * BH};
+int16_t expectedh4_3[4] = {AH * CH, BH * CH, CH * CH, DH * CH};
+int16_t expectedh4_4[4] = {AH * DH, BH * DH, CH * DH, DH * DH};
+int16_t expectedh8_1[8] = {AH * AH, BH * AH, CH * AH, DH * AH,
+			   EH * AH, FH * AH, GH * AH, HH * AH};
+int16_t expectedh8_2[8] = {AH * BH, BH * BH, CH * BH, DH * BH,
+			   EH * BH, FH * BH, GH * BH, HH * BH};
+int16_t expectedh8_3[8] = {AH * CH, BH * CH, CH * CH, DH * CH,
+			   EH * CH, FH * CH, GH * CH, HH * CH};
+int16_t expectedh8_4[8] = {AH * DH, BH * DH, CH * DH, DH * DH,
+			   EH * DH, FH * DH, GH * DH, HH * DH};
+int16_t expectedh8_5[8] = {AH * EH, BH * EH, CH * EH, DH * EH,
+			   EH * EH, FH * EH, GH * EH, HH * EH}

[AArch64, 1/4] Add the missing support of vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64

2016-05-16 Thread Jiong Wang

The support of vfma_n_f64, vfms_n_f32, vfmsq_n_f32, vfmsq_n_f64 are
missing in current gcc arm_neon.h.

Meanwhile, besides "(fma (vec_dup (vec_select)))", fma by element can
also come from "(fma (vec_dup(scalar" where the scalar value is already
sitting in a vector register and is then duplicated to the other lanes,
with no lane size change.

This patch implement this and can generate better code under some
context. For example:

cat test.c
===
typedef __Float32x2_t float32x2_t;
typedef float float32_t;

float32x2_t
vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
{
  return __builtin_aarch64_fmav2sf (__b,  (float32x2_t) {__c, __c}, 
__a);

}

before (-O2)
===
vfma_n_f32:
dup v2.2s, v2.s[0]
fmlav0.2s, v1.2s, v2.2s
ret
after
===
vfma_n_f32:
fmlav0.2s, v1.2s, v2.s[0]
ret

OK for trunk?

2016-05-16  Jiong Wang 

gcc/
  * config/aarch64/aarch64-simd.md (*aarch64_fma4_elt_to_128df): Rename
  to *aarch64_fma4_elt_from_dup.
  (*aarch64_fnma4_elt_to_128df): Rename to 
*aarch64_fnma4_elt_from_dup.

  * config/aarch64/arm_neon.h (vfma_n_f64): New.
  (vfms_n_f32): Likewise.
  (vfms_n_f64): Likewise.
  (vfmsq_n_f32): Likewise.
  (vfmsq_n_f64): Likewise.

gcc/testsuite/
  * gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c: Use standard 
syntax.

  * gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
  * gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h: New entry for 
float64x1.

  * gcc.target/aarch64/advsimd-intrinsics/vfms_vfma_n.c: New.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce64414e8bc01732d14311d742cf28f4586..90eaca176b4706e6cc42f16ce2c956f1c8ad17b1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1579,16 +1579,16 @@
   [(set_attr "type" "neon_fp_mla__scalar")]
 )
 
-(define_insn "*aarch64_fma4_elt_to_128df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-(fma:V2DF
-  (vec_duplicate:V2DF
-	  (match_operand:DF 1 "register_operand" "w"))
-  (match_operand:V2DF 2 "register_operand" "w")
-  (match_operand:V2DF 3 "register_operand" "0")))]
+(define_insn "*aarch64_fma4_elt_from_dup"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+(fma:VMUL
+  (vec_duplicate:VMUL
+	  (match_operand: 1 "register_operand" "w"))
+  (match_operand:VMUL 2 "register_operand" "w")
+  (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
-  "fmla\\t%0.2d, %2.2d, %1.2d[0]"
-  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
+  "fmla\t%0., %2., %1.[0]"
+  [(set_attr "type" "neon_mla__scalar")]
 )
 
 (define_insn "*aarch64_fma4_elt_to_64v2df"
@@ -1656,17 +1656,17 @@
   [(set_attr "type" "neon_fp_mla__scalar")]
 )
 
-(define_insn "*aarch64_fnma4_elt_to_128df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-(fma:V2DF
-  (neg:V2DF
-(match_operand:V2DF 2 "register_operand" "w"))
-  (vec_duplicate:V2DF
-	(match_operand:DF 1 "register_operand" "w"))
-  (match_operand:V2DF 3 "register_operand" "0")))]
-  "TARGET_SIMD"
-  "fmls\\t%0.2d, %2.2d, %1.2d[0]"
-  [(set_attr "type" "neon_fp_mla_d_scalar_q")]
+(define_insn "*aarch64_fnma4_elt_from_dup"
+  [(set (match_operand:VMUL 0 "register_operand" "=w")
+(fma:VMUL
+  (neg:VMUL
+(match_operand:VMUL 2 "register_operand" "w"))
+  (vec_duplicate:VMUL
+	(match_operand: 1 "register_operand" "w"))
+  (match_operand:VMUL 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\t%0., %2., %1.[0]"
+  [(set_attr "type" "neon_mla__scalar")]
 )
 
 (define_insn "*aarch64_fnma4_elt_to_64v2df"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2612a325718918cf7cd808f28c09c9c4c7b11c07..ca7ace5aa656163826569d046fcbf02f9f7d4d6c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -14456,6 +14456,12 @@ vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
   return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
 }
 
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vfma_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
+{
+  return (float64x1_t) {__b[0] * __c + __a[0]};
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
 {
@@ -14597,6 +14603,29 @@ vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
   return __builtin_aarch64_fmav2df (-__b, __c, __a);
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+  return __builtin_aarch64_fmav2sf (-__b, vdup_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vfms_n_f64 (float64x1_t __a, float64x1_t __b, float64_t __c)
+{
+  return (float64x1_t) {-__b[0] * __c + __a[0]};
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__al

[Ada] Use noncanonical host name for canadian cross builds

2016-05-16 Thread Eric Botcazou
This brings consistency to the way host tools are invoked when doing canadian 
cross builds with Ada: gnattools was using host_alias and gcc/ada host, while 
the most correct incantation is probably host_noncanonical.

Tested on x86_64-suse-linux, applied on the mainline.


2016-05-16  Eric Botcazou  

gnattools/
* configure.ac: Add ACX_NONCANONICAL_HOST.
* configure: Regenerate.
* Makefile.in: Replace host_alias with host_noncanonical.
(gnattools-cross): Do not rename the tools.
gcc/
* configure.ac: Add ACX_NONCANONICAL_HOST.
* configure: Regenerate.
* Makefile.in: Set host_noncanonical.
gcc/ada
* gcc-interface/Make-lang.in (GNATMAKE_FOR_HOST): In the canadian
cross case, use host_noncanonical instead of host as prefix.
(GNATBIND_FOR_HOST): Likewise.
(GNATLINK_FOR_HOST): Likewise.
(GNATLS_FOR_HOST): Likewise.

-- 
Eric BotcazouIndex: gcc/Makefile.in
===
--- gcc/Makefile.in	(revision 236264)
+++ gcc/Makefile.in	(working copy)
@@ -56,6 +56,7 @@ MAKEOVERRIDES =
 
 build=@build@
 host=@host@
+host_noncanonical=@host_noncanonical@
 target=@target@
 target_noncanonical:=@target_noncanonical@
 
Index: gcc/ada/gcc-interface/Make-lang.in
===
--- gcc/ada/gcc-interface/Make-lang.in	(revision 236264)
+++ gcc/ada/gcc-interface/Make-lang.in	(working copy)
@@ -175,10 +175,10 @@ else
   # or a cross-native compiler. We provide defaults for tools targeting the
   # host platform, but they can be overriden by just setting _FOR_HOST
   # variables.
-  GNATMAKE_FOR_HOST=$(host)-gnatmake
-  GNATBIND_FOR_HOST=$(host)-gnatbind
-  GNATLINK_FOR_HOST=$(host)-gnatlink
-  GNATLS_FOR_HOST=$(host)-gnatls
+  GNATMAKE_FOR_HOST=$(host_noncanonical)-gnatmake
+  GNATBIND_FOR_HOST=$(host_noncanonical)-gnatbind
+  GNATLINK_FOR_HOST=$(host_noncanonical)-gnatlink
+  GNATLS_FOR_HOST=$(host_noncanonical)-gnatls
 
   ifeq ($(host), $(target))
 # This is a cross native. All the sources are taken from the currently
Index: gcc/configure.ac
===
--- gcc/configure.ac	(revision 236264)
+++ gcc/configure.ac	(working copy)
@@ -35,6 +35,9 @@ AC_CANONICAL_BUILD
 AC_CANONICAL_HOST
 AC_CANONICAL_TARGET
 
+# Determine the noncanonical host name, for Ada.
+ACX_NONCANONICAL_HOST
+
 # Determine the noncanonical target name, for directory use.
 ACX_NONCANONICAL_TARGET
 
Index: gnattools/Makefile.in
===
--- gnattools/Makefile.in	(revision 236264)
+++ gnattools/Makefile.in	(working copy)
@@ -25,7 +25,6 @@ libdir = @libdir@
 build = @build@
 target = @target@
 host = @host@
-host_alias = @host_alias@
 prefix = @prefix@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
@@ -34,6 +33,7 @@ INSTALL_PROGRAM = @INSTALL_PROGRAM@
 # Nonstandard autoconf-set variables.
 LN_S=@LN_S@
 target_noncanonical=@target_noncanonical@
+host_noncanonical=@host_noncanonical@
 
 # Variables for the user (or the top level) to override.
 exeext = @EXEEXT@
@@ -118,10 +118,10 @@ ifeq ($(build), $(host))
   GNATBIND_FOR_HOST=gnatbind
   GNATLS_FOR_HOST=gnatls
 else
-  GNATMAKE_FOR_HOST=$(host_alias)-gnatmake
-  GNATLINK_FOR_HOST=$(host_alias)-gnatlink
-  GNATBIND_FOR_HOST=$(host_alias)-gnatbind
-  GNATLS_FOR_HOST=$(host_alias)-gnatls
+  GNATMAKE_FOR_HOST=$(host_noncanonical)-gnatmake
+  GNATLINK_FOR_HOST=$(host_noncanonical)-gnatlink
+  GNATBIND_FOR_HOST=$(host_noncanonical)-gnatbind
+  GNATLS_FOR_HOST=$(host_noncanonical)-gnatls
 endif
 
 # Put the host RTS dir first in the PATH to hide the default runtime
@@ -219,50 +219,6 @@ gnattools-cross: $(GCC_DIR)/stamp-tools
 	# gnattools2
 	$(MAKE) -C $(GCC_DIR)/ada/tools -f ../Makefile \
 	  $(TOOLS_FLAGS_TO_PASS_CROSS) common-tools
-	# Rename cross tools to where the GCC makefile wants them when
-	# installing.  FIXME: installation should be done elsewhere.
-	if [ -f $(GCC_DIR)/gnatbind$(exeext) ] ; then \
-	  mv $(GCC_DIR)/gnatbind$(exeext) $(GCC_DIR)/gnatbind-cross$(exeext); \
-	fi
-	if [ -f $(GCC_DIR)/gnatchop$(exeext) ] ; then \
-	  mv $(GCC_DIR)/gnatchop$(exeext) $(GCC_DIR)/gnatchop-cross$(exeext); \
-	fi
-	if [ -f $(GCC_DIR)/gnat$(exeext) ] ; then \
-	  mv $(GCC_DIR)/gnat$(exeext) $(GCC_DIR)/gnat-cross$(exeext); \
-	fi
-	if [ -f $(GCC_DIR)/gnatkr$(exeext) ] ; then \
-	  mv $(GCC_DIR)/gnatkr$(exeext) $(GCC_DIR)/gnatkr-cross$(exeext); \
-	fi
-	if [ -f $(GCC_DIR)/gnatlink$(exeext) ] ; then \
-	  mv $(GCC_DIR)/gnatlink$(exeext) $(GCC_DIR)/gnatlink-cross$(exeext); \
-	fi
-	if [ -f $(GCC_DIR)/gnatls$(exeext) ] ; then \
-	  mv $(GCC_DIR)/gnatls$(exeext) $(GCC_DIR)/gnatls-cross$(exeext); \
-	fi
-	if [ -f $(GCC_DIR)/gnatmake$(exeext) ] ; then \
-	  mv $(GCC_DIR)/gnatmake$(exeext) $(GCC_DIR)/gnatmake-cross$(exeext); \
-	fi
-	if [ -f $(GCC_DIR)/gnatmem$(exeext) ] ; then \
-	  mv $(GCC_DIR)/gnatmem$(exeex

[ARM] Fix bogus -fstack-usage warning on naked functions

2016-05-16 Thread Eric Botcazou
Hi,

-fstack-usage issues the "not supported by this target" warning on naked 
functions because the prologue routines do an early return for them.

Tested on arm-eabi, may I apply it on all active branches?


2016-05-16  Eric Botcazou  

* config/arm/arm.c (arm_expand_prologue): Set the stack usage to 0
for naked functions.
(thumb1_expand_prologue): Likewise.

-- 
Eric BotcazouIndex: config/arm/arm.c
===
--- config/arm/arm.c	(revision 236264)
+++ config/arm/arm.c	(working copy)
@@ -21467,7 +21467,11 @@ arm_expand_prologue (void)
 
   /* Naked functions don't have prologues.  */
   if (IS_NAKED (func_type))
-return;
+{
+  if (flag_stack_usage_info)
+	current_function_static_stack_size = 0;
+  return;
+}
 
   /* Make a copy of c_f_p_a_s as we may need to modify it locally.  */
   args_to_push = crtl->args.pretend_args_size;
@@ -24721,7 +24725,11 @@ thumb1_expand_prologue (void)
 
   /* Naked functions don't have prologues.  */
   if (IS_NAKED (func_type))
-return;
+{
+  if (flag_stack_usage_info)
+	current_function_static_stack_size = 0;
+  return;
+}
 
   if (IS_INTERRUPT (func_type))
 {


Re: [PATCH][RFC] Introduce BIT_FIELD_INSERT

2016-05-16 Thread Eric Botcazou
> The following patch adds BIT_FIELD_INSERT, an operation to
> facilitate doing bitfield inserts on registers (as opposed
> to currently where we'd have a BIT_FIELD_REF store).

Why not call it BIT_FIELD_INSERT_EXPR instead to make it clear that it's an 
expression and not a mere operation?

> Originally this was developed as part of bitfield lowering
> where bitfield stores were lowered into read-modify-write
> cycles and the modify part, instead of doing shifting and masking,
> be kept in a more high-level form to ease combining them.
> 
> A second use case (the above is still valid) is vector element
> inserts which we currently can only do via memory or
> by extracting all components and re-building the vector using
> a CONSTRUCTOR.  For this second use case I added code
> re-writing the BIT_FIELD_REF stores the C family FEs produce
> into BIT_FIELD_INSERT when update-address-taken can otherwise
> re-write a decl into SSA form (the testcase shows we miss
> a similar opportunity with the MEM_REF form of a vector insert,
> I plan to fix that for the final submission).

The description in tree.def looks off then, it only mentions words and 
integral types.

> One speciality of BIT_FIELD_INSERT as opposed to BIT_FIELD_REF
> is that the size of the insertion is given implicitely via the
> type size/precision of the value to insert.  That avoids
> introducing ways to have quaternary ops in folding and GIMPLE stmts.

Yes, it's a bit unfortunate, but sensible.  Maybe add a ??? note about that.

-- 
Eric Botcazou


Re: [PATCH PR69848/partial]Propagate comparison into VEC_COND_EXPR if target supports

2016-05-16 Thread Bin.Cheng
On Fri, May 13, 2016 at 5:53 PM, Richard Biener
 wrote:
> On May 13, 2016 6:02:27 PM GMT+02:00, Bin Cheng  wrote:
>>Hi,
>>As PR69848 reported, GCC vectorizer now generates comparison outside of
>>VEC_COND_EXPR for COND_REDUCTION case, as below:
>>
>>  _20 = vect__1.6_8 != { 0, 0, 0, 0 };
>>  vect_c_2.8_16 = VEC_COND_EXPR <_20, { 0, 0, 0, 0 }, vect_c_2.7_13>;
>>  _21 = VEC_COND_EXPR <_20, ivtmp_17, _19>;
>>
>>This results in inefficient expanding.  With IR like:
>>
vect_c_2.8_16 = VEC_COND_EXPR <vect__1.6_8 != { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, vect_c_2.7_13>;
  _21 = VEC_COND_EXPR <vect__1.6_8 != { 0, 0, 0, 0 }, ivtmp_17, _19>;
>>
>>We can do:
>>1) Expanding time optimization, for example, reverting comparison
>>operator by switching VEC_COND_EXPR operands.  This is useful when
>>backend only supports some comparison operators.
>>2) For backend not supporting vcond_mask patterns, saving one LT_EXPR
>>instruction which introduced by expand_vec_cond_expr.
>>
>>This patch fixes this by propagating comparison into VEC_COND_EXPR even
>>if it's used multiple times.  For now, GCC does single_use_only
>>propagation.  Ideally, we may duplicate the comparison before each use
>>statement just before expanding, so that TER can successfully backtrack
>>it from each VEC_COND_EXPR.  Unfortunately I didn't find a good pass to
>>do this.  Tree-vect-generic.c looks like a good candidate, but it's so
>>early that following CSE could undo the transform.  Another possible
>>fix is to generate comparison inside VEC_COND_EXPR directly in function
>>vectorizable_reduction.
>
> I prefer this for now.
Hi Richard, you mean this patch, or the possible fix before your comment?
Here is an updated patch addressing comment issue pointed out by
Bernhard Reutner-Fischer.  Thanks.

Thanks,
bin
>
> Richard.
>
>>As for possible comparison CSE opportunities, I checked that it's
>>simple enough to be handled by RTL CSE.
>>
>>Bootstrap and test on x86_64 and AArch64.  Any comments?
>>
>>Thanks,
>>bin
>>
>>2016-05-12  Bin Cheng  
>>
>>   PR tree-optimization/69848
>>   * optabs-tree.c (expand_vcond_mask_p, expand_vcond_p): New.
>>   (expand_vec_cmp_expr_p): Call above functions.
>>   * optabs-tree.h (expand_vcond_mask_p, expand_vcond_p): New.
>>   * tree-ssa-forwprop.c (optabs-tree.h): Include header file.
>>   (forward_propagate_into_cond): Propagate multiple uses for
>>   VEC_COND_EXPR.
>
>
diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
index faac087..13538e5 100644
--- a/gcc/optabs-tree.c
+++ b/gcc/optabs-tree.c
@@ -313,26 +313,51 @@ expand_vec_cmp_expr_p (tree value_type, tree mask_type)
   return (icode != CODE_FOR_nothing);
 }
 
-/* Return TRUE iff, appropriate vector insns are available
-   for vector cond expr with vector type VALUE_TYPE and a comparison
+/* Return TRUE iff appropriate vector insns are available
+   for VCOND_MASK pattern with vector type VALUE_TYPE and a comparison
with operand vector types in CMP_OP_TYPE.  */
 
 bool
-expand_vec_cond_expr_p (tree value_type, tree cmp_op_type)
+expand_vcond_mask_p (tree value_type, tree cmp_op_type)
 {
-  machine_mode value_mode = TYPE_MODE (value_type);
-  machine_mode cmp_op_mode = TYPE_MODE (cmp_op_type);
   if (VECTOR_BOOLEAN_TYPE_P (cmp_op_type)
   && get_vcond_mask_icode (TYPE_MODE (value_type),
   TYPE_MODE (cmp_op_type)) != CODE_FOR_nothing)
 return true;
 
-  if (GET_MODE_SIZE (value_mode) != GET_MODE_SIZE (cmp_op_mode)
-  || GET_MODE_NUNITS (value_mode) != GET_MODE_NUNITS (cmp_op_mode)
-  || get_vcond_icode (TYPE_MODE (value_type), TYPE_MODE (cmp_op_type),
- TYPE_UNSIGNED (cmp_op_type)) == CODE_FOR_nothing)
-return false;
-  return true;
+  return false;
+}
+
+/* Return TRUE iff appropriate vector insns are available
+   for VCOND pattern with vector type VALUE_TYPE and a comparison
+   with operand vector types in CMP_OP_TYPE.  */
+
+bool
+expand_vcond_p (tree value_type, tree cmp_op_type)
+{
+  machine_mode value_mode = TYPE_MODE (value_type);
+  machine_mode cmp_op_mode = TYPE_MODE (cmp_op_type);
+  if (GET_MODE_SIZE (value_mode) == GET_MODE_SIZE (cmp_op_mode)
+  && GET_MODE_NUNITS (value_mode) == GET_MODE_NUNITS (cmp_op_mode)
+  && get_vcond_icode (TYPE_MODE (value_type), TYPE_MODE (cmp_op_type),
+ TYPE_UNSIGNED (cmp_op_type)) != CODE_FOR_nothing)
+return true;
+
+  return false;
+}
+
+/* Return TRUE iff appropriate vector insns are available
+   for vector cond expr with vector type VALUE_TYPE and a comparison
+   with operand vector types in CMP_OP_TYPE.  */
+
+bool
+expand_vec_cond_expr_p (tree value_type, tree cmp_op_type)
+{
+  if (expand_vcond_mask_p (value_type, cmp_op_type)
+  || expand_vcond_p (value_type, cmp_op_type))
+return true;
+
+  return false;
 }
 
 /* Use the current target and options to initialize
diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
index c3b9280..feab40f 100644
--- a/gcc/optabs-tree.h
+++ b/gcc/optabs-tree.h
@@ -39,6 +39,8 @@ optab optab_for_tree_code (enum tree_code, con