Re: [nvptx] fix some c++ tests

2015-10-08 Thread Thomas Schwinge
Hi Nathan!

Thanks for looking into this.  When I reported this,
,
quite a lot of testcases had been failing -- with recent GCC trunk, the
number is smaller (because of other middle end/optimization changes, I
suppose).  Yet, the problem still can be observed; grep for "%retval" in
g++.log.

On Thu, 8 Oct 2015 13:33:55 -0400, Nathan Sidwell  wrote:
> I've committed this to trunk.  The C++ ABI now returns a pointer to the 
> passed-in artificial arg that points to the return area.  consequently 
> return-in-mem and type_mode(return_type) == VOIDmode are  not tautologies.

>   * config/nvptx/nvptx.c (nvptx_declare_function_name): Functions
>   may return pointer as well as in memory.
>   (nvptx_output_return): Likewise.

Hmm, but what I now see happening is that now there is incorrect
"%out_retval" usage (but it's not part of the function prototypes), for
example:

$ build-gcc/gcc/xg++ -Bbuild-gcc/gcc/ 
source-gcc/gcc/testsuite/g++.dg/ipa/ipa-icf-6.C -std=gnu++98 -O3 -o 
ipa-icf-6.exe -S

Diff before/after your change:

$ diff -U4 O/ipa-icf-6.exe ipa-icf-6.exe
--- O/ipa-icf-6.exe 2015-10-06 18:30:21.581187448 +0200
+++ ipa-icf-6.exe   2015-10-09 07:49:23.894893180 +0200
@@ -10,8 +10,9 @@
 .visible .func _Z5test21A(.param.u64 %in_ar1, .param.u64 %in_ar2)
 {
.reg.u64 %ar2;
.reg.u64 %ar1;
+   .reg.u64 %retval;
.reg.u64 %hr10;
.reg.u64 %r25;
.reg.u64 %r26;
.reg.u64 %r27;
@@ -28,8 +29,9 @@
}
 $L2:
st.u64  [%r25+8], %r25;
mov.u64 %retval, %r25;
+   st.param.u64[%out_retval], %retval;
ret;
}
 // BEGIN FUNCTION DECL: _ZL7test_mePF1AS_E
 .func _ZL7test_mePF1AS_E(.param.u64 %in_ar1);
@@ -78,8 +80,9 @@
 .visible .func _Z5test11A(.param.u64 %in_ar1, .param.u64 %in_ar2)
 {
.reg.u64 %ar2;
.reg.u64 %ar1;
+   .reg.u64 %retval;
.reg.u64 %hr10;
.reg.u64 %r25;
.reg.u64 %r26;
.reg.u64 %r27;
@@ -96,8 +99,9 @@
}
 $L6:
st.u64  [%r25+8], %r25;
mov.u64 %retval, %r25;
+   st.param.u64[%out_retval], %retval;
ret;
}
 // BEGIN GLOBAL FUNCTION DECL: main
 .visible .func (.param.u32 %out_retval)main(.param.u32 %argc, .param.u64 
%argv);

(I have not yet made an attempt at trying to understand the problem.)


For reference:

> Index: gcc/config/nvptx/nvptx.c
> ===
> --- gcc/config/nvptx/nvptx.c  (revision 228617)
> +++ gcc/config/nvptx/nvptx.c  (working copy)
> @@ -531,13 +531,8 @@ nvptx_declare_function_name (FILE *file,
>nvptx_write_function_decl (s, name, decl);
>fprintf (file, "%s", s.str().c_str());
>  
> -  bool return_in_mem = false;
> -  if (TYPE_MODE (result_type) != VOIDmode)
> -{
> -  machine_mode mode = TYPE_MODE (result_type);
> -  if (!RETURN_IN_REG_P (mode))
> - return_in_mem = true;
> -}
> +  bool return_in_mem = (TYPE_MODE (result_type) != VOIDmode
> + && !RETURN_IN_REG_P (TYPE_MODE (result_type)));
>  
>fprintf (file, "\n{\n");
>  
> @@ -547,9 +542,13 @@ nvptx_declare_function_name (FILE *file,
>  false, return_in_mem);
>if (return_in_mem)
>  fprintf (file, "\t.reg.u%d %%ar1;\n", GET_MODE_BITSIZE (Pmode));
> -  else if (TYPE_MODE (result_type) != VOIDmode)
> +
> +  /* C++11 ABI causes us to return a reference to the passed in
> + pointer for return_in_mem.  */
> +  if (cfun->machine->ret_reg_mode != VOIDmode)
>  {
> -  machine_mode mode = arg_promotion (TYPE_MODE (result_type));
> +  machine_mode mode = arg_promotion
> + ((machine_mode)cfun->machine->ret_reg_mode);
>fprintf (file, "\t.reg%s %%retval;\n",
>  nvptx_ptx_type_from_mode (mode, false));
>  }
> @@ -635,17 +634,13 @@ nvptx_declare_function_name (FILE *file,
>  const char *
>  nvptx_output_return (void)
>  {
> -  tree fntype = TREE_TYPE (current_function_decl);
> -  tree result_type = TREE_TYPE (fntype);
> -  if (TYPE_MODE (result_type) != VOIDmode)
> +  machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;
> +
> +  if (mode != VOIDmode)
>  {
> -  machine_mode mode = TYPE_MODE (result_type);
> -  if (RETURN_IN_REG_P (mode))
> - {
> -   mode = arg_promotion (mode);
> -   fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
> -nvptx_ptx_type_from_mode (mode, false));
> - }
> +  mode = arg_promotion (mode);
> +  fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
> +nvptx_ptx_type_from_mode (mode, false));
>  }
>  
>return "ret;";
> Index: gcc/config/nvptx/nvptx.h
> ===
> --- gcc/config/nvptx/nvptx.h  (revision 22861

[PATCH] [5/n] Fix minor SSA_NAME leaks

2015-10-08 Thread Jeff Law


This set of leaks is a bit more interesting.  value-prof just blindly 
clears the vdefs on certain statements.  Of course that results in a 
leaked SSA_NAME.


The first of the three instances was the one I hit, the others were 
found by inspection.  Like prior patches, I've kept a minimized testcase 
for this issue.


FWIW, with this patch I can now bootstrap x86-64 with my code to verify 
we haven't leaked any SSA_NAMEs.   My verifier knows about a purposeful 
leak in function splitting and doesn't warn about leaks stemming from 
that code.


There's still a ton of leaks in the vectorizer and probably elsewhere, 
but being able to bootstrap is a notable achievement.



Bootstrapped and regression tested on x86_64-linux-gnu.  Installed on 
the trunk.


Jeff
* value-prof.c (gimple_ic): Add missing calls to unlink_stmt_vdef
and release_ssa_name in two places.
(gimple_stringop_fixed_value): Similarly.

diff --git a/gcc/value-prof.c b/gcc/value-prof.c
index 90211ef..ddf1215 100644
--- a/gcc/value-prof.c
+++ b/gcc/value-prof.c
@@ -1384,6 +1384,11 @@ gimple_ic (gcall *icall_stmt, struct cgraph_node 
*direct_call,
   cond_stmt = gimple_build_cond (EQ_EXPR, tmp1, tmp0, NULL_TREE, NULL_TREE);
   gsi_insert_before (&gsi, cond_stmt, GSI_SAME_STMT);
 
+  if (TREE_CODE (gimple_vdef (icall_stmt)) == SSA_NAME)
+{
+  unlink_stmt_vdef (icall_stmt);
+  release_ssa_name (gimple_vdef (icall_stmt));
+}
   gimple_set_vdef (icall_stmt, NULL_TREE);
   gimple_set_vuse (icall_stmt, NULL_TREE);
   update_stmt (icall_stmt);
@@ -1472,6 +1477,11 @@ gimple_ic (gcall *icall_stmt, struct cgraph_node 
*direct_call,
{
  gimple *copy;
 
+ if (TREE_CODE (gimple_vdef (iretbnd_stmt)) == SSA_NAME)
+   {
+ unlink_stmt_vdef (iretbnd_stmt);
+ release_ssa_name (gimple_vdef (iretbnd_stmt));
+   }
  gimple_set_vdef (iretbnd_stmt, NULL_TREE);
  gimple_set_vuse (iretbnd_stmt, NULL_TREE);
  update_stmt (iretbnd_stmt);
@@ -1698,6 +1708,11 @@ gimple_stringop_fixed_value (gcall *vcall_stmt, tree 
icall_size, int prob,
   cond_stmt = gimple_build_cond (EQ_EXPR, tmp1, tmp0, NULL_TREE, NULL_TREE);
   gsi_insert_before (&gsi, cond_stmt, GSI_SAME_STMT);
 
+  if (TREE_CODE (gimple_vdef (vcall_stmt)) == SSA_NAME)
+{
+  unlink_stmt_vdef (vcall_stmt);
+  release_ssa_name (gimple_vdef (vcall_stmt));
+}
   gimple_set_vdef (vcall_stmt, NULL);
   gimple_set_vuse (vcall_stmt, NULL);
   update_stmt (vcall_stmt);


[PR67766] reorder return value copying from PARALLELs and CONCATs (was: Re: [PR64164] drop copyrename, integrate into expand)

2015-10-08 Thread Alexandre Oliva
This fixes fallout from the PR64164 expander revamp.  On alpha, PARALLEL
hard return values may be modeless, and this confuses the code that
wants to copy the pseudo/s in the returned value to the return hard
regs.

It used to work because PARALLELs and CONCATs used to lead to DECL_RTL
with the same mode, but now we try harder to create a pseudo or MEM with
a reasonable mode.

The solution was as simple as moving down the code that handled mode
differences, so that PARALLELs and CONCATs are handled as they should.
Since AFAICT they don't ever have to deal with mode promotion anyway, we
should be fine with this simple change, that Uroš kindly tested with an
alpha-linux-gnu regstrap.  I tested it myself on x86_64-linux-gnu and
i686-linux-gnu.

Ok to install?


[PR67766] reorder handling of parallels, concats and promoted values in return

From: Alexandre Oliva 

for  gcc/ChangeLog

PR middle-end/67766
* function.c (expand_function_end): Move return value
promotion past the handling of PARALLELs and CONCATs.
---
 gcc/function.c |   24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/gcc/function.c b/gcc/function.c
index e76ba2b..d16d6d8 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -5446,18 +5446,6 @@ expand_function_end (void)
  decl_rtl);
  shift_return_value (GET_MODE (decl_rtl), true, real_decl_rtl);
}
- /* If a named return value dumped decl_return to memory, then
-we may need to re-do the PROMOTE_MODE signed/unsigned
-extension.  */
- else if (GET_MODE (real_decl_rtl) != GET_MODE (decl_rtl))
-   {
- int unsignedp = TYPE_UNSIGNED (TREE_TYPE (decl_result));
- promote_function_mode (TREE_TYPE (decl_result),
-GET_MODE (decl_rtl), &unsignedp,
-TREE_TYPE (current_function_decl), 1);
-
- convert_move (real_decl_rtl, decl_rtl, unsignedp);
-   }
  else if (GET_CODE (real_decl_rtl) == PARALLEL)
{
  /* If expand_function_start has created a PARALLEL for decl_rtl,
@@ -5488,6 +5476,18 @@ expand_function_end (void)
  emit_move_insn (tmp, decl_rtl);
  emit_move_insn (real_decl_rtl, tmp);
}
+ /* If a named return value dumped decl_return to memory, then
+we may need to re-do the PROMOTE_MODE signed/unsigned
+extension.  */
+ else if (GET_MODE (real_decl_rtl) != GET_MODE (decl_rtl))
+   {
+ int unsignedp = TYPE_UNSIGNED (TREE_TYPE (decl_result));
+ promote_function_mode (TREE_TYPE (decl_result),
+GET_MODE (decl_rtl), &unsignedp,
+TREE_TYPE (current_function_decl), 1);
+
+ convert_move (real_decl_rtl, decl_rtl, unsignedp);
+   }
  else
emit_move_insn (real_decl_rtl, decl_rtl);
}


-- 
Alexandre Oliva, freedom fighterhttp://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist|Red Hat Brasil GNU Toolchain Engineer


[PR67828] don't unswitch loops on undefined SSA values (was: Re: [PR64164] drop copyrename, integrate into expand)

2015-10-08 Thread Alexandre Oliva
This patch fixes a latent bug in loop unswitching exposed by the PR64164
changes.

We would move a test out of a loop that might never have been executed,
and that accessed an uninitialized variable.  The uninitialized SSA
name, due to uncprop, now gets coalesced with other SSA names,
expanding the ill effects of the undefined behavior we introduce: in
spite of the zero initialization introduced in later rtl stages for the
uninitialized pseudo, by then we've already expanded a PHI node that
referenced the uninitialized variable in the path coming from a path in
which it would necessarily be zero, to a copy from the coalesced pseudo,
that gets modified between the zero-initialization and the copy, so the
copied zero is no longer zero.  Oops.

We might want to be stricter in coalesce conflict detection to avoid
this sort of problem, and perhaps to avoid undefined values in uncprop,
but this would all be attempting to limit the effects of undefined
behavior, which is probably a waste of effort.  As long as we avoid
introducing undefined behavior ourselves, we shouldn't have to do any of
that.  So, this patch fixes loop unswitching so as to not introduce
undefined behavior.

Regstrapped on x86_64-linux-gnu and i686-linux-gnu.  Ok to install?


[PR67828] don't unswitch on default defs of non-parms

From: Alexandre Oliva 

for  gcc/ChangeLog

PR rtl-optimization/67828
* tree-ssa-loop-unswitch.c: Include tree-ssa.h.
(tree_may_unswitch_on): Don't unswitch on expressions
involving undefined values.

for  gcc/testsuite/ChangeLog

PR rtl-optimization/67828
* gcc.dg/torture/pr67828.c: New.
---
 gcc/testsuite/gcc.dg/torture/pr67828.c |   43 
 gcc/tree-ssa-loop-unswitch.c   |5 
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/torture/pr67828.c

diff --git a/gcc/testsuite/gcc.dg/torture/pr67828.c 
b/gcc/testsuite/gcc.dg/torture/pr67828.c
new file mode 100644
index 000..c7b6965
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr67828.c
@@ -0,0 +1,43 @@
+/* Check that we don't misoptimize the final value of d.  We used to
+   apply loop unswitching on if(j), introducing undefined behavior
+   that the original code wouldn't exercise, and this undefined
+   behavior would get later passes to misoptimize the loop.  */
+
+/* { dg-do run } */
+
+#include 
+#include 
+
+int x;
+
+int __attribute__ ((noinline, noclone))
+xprintf (int d) {
+  if (d)
+{
+  if (x)
+   printf ("%d", d);
+  abort ();
+}
+}
+
+int a, b;
+short c;
+
+int
+main ()
+{
+  int j, d = 1;
+  for (; c >= 0; c++)
+{
+  a = d;
+  d = 0;
+  if (b)
+   {
+ xprintf (0);
+ if (j)
+   xprintf (0);
+   }
+}
+  xprintf (d);
+  exit (0);
+}
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index 4328d6a..d6faa37 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -32,6 +32,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "internal-fn.h"
 #include "gimplify.h"
 #include "tree-cfg.h"
+#include "tree-ssa.h"
 #include "tree-ssa-loop-niter.h"
 #include "tree-ssa-loop.h"
 #include "tree-into-ssa.h"
@@ -139,6 +140,10 @@ tree_may_unswitch_on (basic_block bb, struct loop *loop)
   /* Condition must be invariant.  */
   FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
 {
+  /* Unswitching on undefined values would introduce undefined
+behavior that the original program might never exercise.  */
+  if (ssa_undefined_value_p (use, true))
+   return NULL_TREE;
   def = SSA_NAME_DEF_STMT (use);
   def_bb = gimple_bb (def);
   if (def_bb


-- 
Alexandre Oliva, freedom fighterhttp://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist|Red Hat Brasil GNU Toolchain Engineer


Re: [PATCH] New attribute to create target clones

2015-10-08 Thread Jeff Law

On 10/08/2015 02:01 PM, Evgeny Stupachenko wrote:

On Thu, Oct 8, 2015 at 10:00 PM, Jeff Law  wrote:

On 09/24/2015 04:28 PM, Evgeny Stupachenko wrote:


I've fixed ICE and review issues.
x86 make check and bootstrap passed.

Thanks,
Evgeny

ChangeLog

2015-09-25  Evgeny Stupachenko

gcc/
  * Makefile.in (OBJS): Add multiple_target.o.
  * multiple_target.c (make_attribute): New.
  (create_dispatcher_calls): Ditto.
  (expand_target_clones): Ditto.
  (ipa_target_clone): Ditto.
  * passes.def (pass_target_clone): New ipa pass.
  * tree-pass.h (make_pass_target_clone): Ditto.

gcc/c-family
  * c-common.c (handle_target_clones_attribute): New.
  * (c_common_attribute_table): Add handle_target_clones_attribute.
  * (handle_always_inline_attribute): Add check on target_clones
  attribute.
  * (handle_target_attribute): Ditto.

gcc/testsuite
  * gcc.dg/mvc1.c: New test for multiple targets cloning.
  * gcc.dg/mvc2.c: Ditto.
  * gcc.dg/mvc3.c: Ditto.
  * gcc.dg/mvc4.c: Ditto.
  * gcc.dg/mvc5.c: Ditto.
  * gcc.dg/mvc6.c: Ditto.
  * gcc.dg/mvc7.c: Ditto.
  * g++.dg/ext/mvc1.C: Ditto.
  * g++.dg/ext/mvc2.C: Ditto.
  * g++.dg/ext/mvc3.C: Ditto.

gcc/doc
  * doc/extend.texi (target_clones): New attribute description.






target_clones.patch


Sorry this has taken so long to come back to...  As I mentioned a couple
months ago, I'd hoped Jan would chime in on the IPA/symtab requirements.
But that didn't happen.


SO I went back and reviewed the discussion between Jan, Ilya & myself WRT
some of the rules around aliases, clones, etc.  I think the key question for
this patch is whether or not the clones have the same assembler name or not.
 From looking at expand_target_clones, I'm confident the answer is the clones
have different assembler names.  In fact, the assembler names are munged
with the options used for that specific clone of the original function.




+/* Makes a function attribute of the form NAME(ARG_NAME) and chains
+   it to CHAIN.  */
+
+static tree
+make_attribute (const char *name, const char *arg_name, tree chain)
+{
+  tree attr_name;
+  tree attr_arg_name;
+  tree attr_args;
+  tree attr;
+
+  attr_name = get_identifier (name);
+  attr_arg_name = build_string (strlen (arg_name), arg_name);
+  attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
+  attr = tree_cons (attr_name, attr_args, chain);
+  return attr;
+}


This seems generic enough that I'd prefer it in attribs.c.  I was rather
surprised when I looked and didn't find an existing routine to do this.



+
+/* If the call in NODE has multiple target attribute with multiple
fields,
+   replace it with dispatcher call and create dispatcher (once).  */
+
+static void
+create_dispatcher_calls (struct cgraph_node *node)
+{
+  cgraph_edge *e;
+  cgraph_edge *e_next;
+  for (e = node->callers; e ;e = (e == NULL) ? e_next : e->next_caller)


That's a rather strange way to write the loop increment.  If I follow the
loop logic correctly, it seems that we always end up using e->next_caller,
it's just obscured.

For the test if we're calling a versioned function, we just "continue".  e
will be non-null and thus we use e->next_caller to set e for the next
iteration.

If the test for calling a versioned function falls, we set e_next to
e->next_caller, then later set e to NULL.  That results in using e_next to
set e for the next iteration.  But e_next was initialized to e->next_caller.

So why not just write the loop increment as e = e->next-caller?


Because of this:
   e->redirect_callee (inode);
It modifies next_caller field.
The other way is to remember all what we want to redirect and create
one more loop through the modifications to apply them.

Seems like a comment would be wise.




I'm slightly concerned with using the pretty printer to build up the new
name.  Is there precedent for this anywhere else in GCC?

I don't remember where it exactly came from. However it's not a big
deal to simplify this to std functions.

Thanks.  Not sure why, but I'd appreciate that change.




When creating the munged name, don't you also have to make sure that other
symbols that aren't supported for names don't sneak through?  I see that you
replace = and -, but you'd need to replace any symbol that could possibly be
used in an option, but which isn't allowed in a function name at the
assembler level.  I'd be worried about anything that might possibly be seen
as an operator by the assembler, '.', and possibly others.

This restriction comes from "targetm.target_option.valid_attribute_p"
and it is the same for current implementation of function
multiversioning.
It exits with error: "attribute(target("...")) is unknown".
It looks reasonable to put the check before symtab changes.
Right, but there's nothing inherently that says that an option couldn't 
have other operators such as

[PATCH] c/67882 - improve -Warray-bounds for invalid offsetof

2015-10-08 Thread Martin Sebor

Gcc attempts to diagnose invalid offsetof expressions whose member
designator is an array element with an out-of-bounds index. The
logic in the function that does this detection is incomplete, leading
to false negatives. Since the result of the expression in these cases
can be surprising, this patch tightens up the logic to diagnose more
such cases.

Tested by boostrapping and running c/c++ tests on x86_64 with no
regressions.

Martin
diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
index 4b922bf..fc7c991d 100644
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -10536,12 +10536,31 @@ c_common_to_target_charset (HOST_WIDE_INT c)
 
 /* Fold an offsetof-like expression.  EXPR is a nested sequence of component
references with an INDIRECT_REF of a constant at the bottom; much like the
-   traditional rendering of offsetof as a macro.  Return the folded result.  */
+   traditional rendering of offsetof as a macro.  Return the folded result.
+   PCTX, which is initially null, is set by the first recursive call of
+   the function to refer to a local object describing the potentially out-
+   of-bounds index of the array member whose offset is being computed, and
+   to indicate whether all indices to the same array object have the highest
+   valid value.  The function issues a warning for out-of-bounds array indices
+   that either refer to elements past the one just past end of the array object
+   or that exceed any of the major bounds.  */
+
+struct offsetof_ctx_t
+{
+  tree inx; /* The invalid array index or NULL_TREE.  */
+  int maxinx;   /* All indices to the array have the highest valid value. */
+};
 
 tree
-fold_offsetof_1 (tree expr)
+fold_offsetof_1 (tree expr, offsetof_ctx_t *pctx /* = 0 */)
 {
   tree base, off, t;
+  offsetof_ctx_t ctx = { NULL_TREE, -1 };
+
+  /* Set the context pointer to point to the local context object
+ to use by subsequent recursive calls.  */
+  if (!pctx)
+pctx = &ctx;
 
   switch (TREE_CODE (expr))
 {
@@ -10567,10 +10586,19 @@ fold_offsetof_1 (tree expr)
   return TREE_OPERAND (expr, 0);
 
 case COMPONENT_REF:
-  base = fold_offsetof_1 (TREE_OPERAND (expr, 0));
+  base = fold_offsetof_1 (TREE_OPERAND (expr, 0), pctx);
   if (base == error_mark_node)
 	return base;
 
+  if (ctx.inx != NULL_TREE) {
+	warning (OPT_Warray_bounds,
+		 "index %E denotes an offset "
+		 "greater than size of %qT",
+		 ctx.inx, TREE_TYPE (TREE_OPERAND (expr, 0)));
+	/* Reset to avoid diagnosing the same expression multiple times.  */
+	pctx->inx = NULL_TREE;
+  }
+
   t = TREE_OPERAND (expr, 1);
   if (DECL_C_BIT_FIELD (t))
 	{
@@ -10581,10 +10609,11 @@ fold_offsetof_1 (tree expr)
   off = size_binop_loc (input_location, PLUS_EXPR, DECL_FIELD_OFFSET (t),
 			size_int (tree_to_uhwi (DECL_FIELD_BIT_OFFSET (t))
   / BITS_PER_UNIT));
+  pctx->maxinx = -1;
   break;
 
 case ARRAY_REF:
-  base = fold_offsetof_1 (TREE_OPERAND (expr, 0));
+  base = fold_offsetof_1 (TREE_OPERAND (expr, 0), pctx);
   if (base == error_mark_node)
 	return base;
 
@@ -10601,8 +10630,10 @@ fold_offsetof_1 (tree expr)
 	{
 	  upbound = size_binop (PLUS_EXPR, upbound,
 build_int_cst (TREE_TYPE (upbound), 1));
-	  if (tree_int_cst_lt (upbound, t))
-		{
+
+	  if (tree_int_cst_lt (upbound, t)) {
+		pctx->inx = t;
+
 		tree v;
 
 		for (v = TREE_OPERAND (expr, 0);
@@ -10622,25 +10653,61 @@ fold_offsetof_1 (tree expr)
 		/* Don't warn if the array might be considered a poor
 		   man's flexible array member with a very permissive
 		   definition thereof.  */
-		  if (TREE_CODE (v) == ARRAY_REF
-		  || TREE_CODE (v) == COMPONENT_REF)
+		if (TREE_CODE (v) != ARRAY_REF
+		&& TREE_CODE (v) != COMPONENT_REF)
+		  pctx = NULL;
+	  }
+	  else if (pctx->inx == NULL_TREE && tree_int_cst_equal (upbound, t))
+		{
+		  /* Index is considered valid when it's either less than
+		 the upper bound or equal to it and refers to the lowest
+		 rank.  Since in the latter case it may not at this point
+		 in the recursive call to the function be known whether
+		 the element at the index is used to do more than to
+		 compute its offset (e.g., it can be used to designate
+		 a member of the type to which the just past-the-end
+		 element refers), point the INX variable at the index
+		 and leave it up to the caller to decide whether or not
+		 to diagnose it.  Special handling is required for minor
+		 index values referring to the element just past the end
+		 of the array object.  */
+		  pctx->inx = t;
+		  tree_code lastcode = TREE_CODE (TREE_OPERAND (expr, 0));
+		  if ((lastcode != ARRAY_REF && pctx != &ctx)
+		  || (pctx == &ctx && pctx->maxinx))
+		pctx = NULL;
+		}
+	  else
+		{
+		  HOST_WIDE_INT ubi = tree_to_shwi (upbound);
+		  HOST_WIDE_INT inx = tree_to_shwi (t);
+
+		  if (pctx->maxinx)
+		pctx->maxinx = inx + 1 == 

[PATCH] c/67882 - improve -Warray-bounds for invalid offsetof

2015-10-08 Thread Martin Sebor

Gcc attempts to diagnose invalid offsetof expressions whose member
designator is an array element with an out-of-bounds index. The
logic in the function that does this detection is incomplete, leading
to false negatives. Since the result of the expression in these cases
can be surprising, this patch tightens up the logic to diagnose more
such cases.



Re: [RFC, Patch]: Optimized changes in the register used inside loop for LICM and IVOPTS.

2015-10-08 Thread Bin.Cheng
On Thu, Oct 8, 2015 at 1:53 PM, Ajit Kumar Agarwal
 wrote:
>
>
> -Original Message-
> From: Bin.Cheng [mailto:amker.ch...@gmail.com]
> Sent: Thursday, October 08, 2015 10:29 AM
> To: Ajit Kumar Agarwal
> Cc: GCC Patches; Vinod Kathail; Shail Aditya Gupta; Vidhumouli Hunsigida; 
> Nagaraju Mekala
> Subject: Re: [RFC, Patch]: Optimized changes in the register used inside loop 
> for LICM and IVOPTS.
>
> On Thu, Oct 8, 2015 at 12:32 PM, Ajit Kumar Agarwal 
>  wrote:
>> Following Proposed:
>>
>> Changes are done in the Loop Invariant(LICM) at RTL level and also the 
>> Induction variable optimization based on SSA representation.
>> The current logic used in LICM for register used inside the loops is
>> changed. The Live Out of the loop latch node and the Live in of the
>> destination of the exit nodes is used to set the Loops Liveness at the exit 
>> of the Loop. The register used is the number of live variables at the exit 
>> of the Loop calculated above.
>>
>> For Induction variable optimization on tree SSA representation, the
>> register used logic is based on the number of phi nodes at the loop
>> header to represent the liveness at the loop. Current Logic used only the 
>> number of phi nodes at the loop header. I have made changes  to represent 
>> the phi operands also live at the loop. Thus number of phi operands also 
>> gets incremented in the number of registers used.
> Hi,
>>>For the GIMPLE IVO part, I don't think the change is reasonable enough.  
>>>IMHO, IVO fails to restrict iv number in some complex cases, your change 
>>>tries to >>rectify that by increasing register pressure irrespective to 
>>>out-of-ssa and coalescing.  I think the original code models reg-pressure 
>>>better, what needs to be >>changed is how we compute cost from register 
>>>pressure and use that to restrict iv number.
>
> Considering the liveness with respect to all the phi arguments will not 
> increase the register pressure. It improves the heuristics for restricting
> The IV that increases the register pressure. The cost model uses regs_used 
> and modelling the
I think register pressure is increased along with regs_needed, doesn't
matter if it will be canceled in estimate_reg_pressure_cost for both
ends of cost comparison.
Liveness with respect to the phi arguments measures
> Better register pressure.
I agree IV number should be controlled for some cases, but not by
increasing `n' using phi argument number unconditionally.  Considering
summary reduction as an example, most likely the ssa names will be
coalesced and held in single register.  Furthermore, there is no
reason to count phi node/arg number for floating point phi nodes.

>
> Number of phi nodes in the loop header is not only the criteria for 
> regs_used, but the number of liveness with respect to loop should be
> Criteria to measure appropriate register pressure.
IMHO, it's hard to accurately track liveness info on SSA(PHI), because
of coalescing etc.  So could you give some examples/proof for this?

Thanks,
bin
>
> Thanks & Regards
> Ajit


Re: [RFA 1/2]: Don't ignore target_header_dir when deciding inhibit_libc

2015-10-08 Thread Hans-Peter Nilsson
> From: Ulrich Weigand 
> Date: Thu, 8 Oct 2015 18:52:22 +0200

> Hans-Peter Nilsson wrote:
> 
> > Let me ask you right back: after an installation, should
> > installation of a newer gcc *not* automatically pick up the
> > header files installed (copied to sys-include) by the previous
> > installation when using the same prefix, *without* any
> > --with-headers specified in the new configury?
> 
> I'm not using sys-include, so I don't really have a strong
> opinion on this setup.  However, I found this in the docs:

Ow, I didn't look there this time.  I've forgotten if I've done
so in the past.  If so, it may have been at a time even before
with-sysroot!

>   @item --with-headers
>   @itemx --with-headers=@var{dir}
>   Deprecated in favor of @option{--with-sysroot}.
>   Specifies that target headers are available when building a cross compiler.
>   The @var{dir} argument specifies a directory which has the target include
>   files.  These include files will be copied into the @file{gcc} install
>   directory.  @emph{This option with the @var{dir} argument is required} when
>   building a cross compiler, if @file{@var{prefix}/@var{target}/sys-include}
>   doesn't pre-exist.  If @file{@var{prefix}/@var{target}/sys-include} does
>   pre-exist, the @var{dir} argument may be omitted.  @command{fixincludes}
>   will be run on these files to make them compatible with GCC@.
> 
>   @item --without-headers
>   Tells GCC not use any target headers from a libc when building a cross
>   compiler.  When crossing to GNU/Linux, you need the headers so GCC
>   can build the exception handling for libgcc.
> 
> This seems to imply to me that --with-headers without any  argument
> is supposed to use the pre-existing sys-include directory.

Looks like it.  But the implementation had another opinion...

> The docs are somewhat silent on what exactly the complete absence of
> both --with-headers and --without-headers means.

It should either be a clear verbose error at configure time or
should work.

> Another potential interpretation might be:
> 
>  --with-headers=   Copy headers from  to sys-include
>  --with-headers Use existing sys-include directory
>Same as --with-headers
>  --without-headers  Do not use any headers
>
> which simplifies the option space, and makes --with/out-headers
> match the behavior of other --with/out- options.  It would basically
> require use of sys-include for cross-compilers (which the docs could
> be read to imply anyway, already).

Needless to mention, that's the way I've read it, possibly
because it was the minimum editing distance to the
implementation at the time... :-}

Well, with the exception of "--with-headers" (no argument)
which I, just like the person(s) before me, did not consider.
Sorry.

> > So, ISTM we should change --with-headers (=yes) to either look
> > in sys-include or in include.  Setting it to sys-include
> > wouldn't help you or anyone else as it's already the default...
> 
> On the other hand, the current docs appear to imply that the
> intent was for --with-headers (=yes) to look into a pre-existing
> sys-include directory for headers.

Right.  So, if you'd prefer to align the implementation with
that, I don't mind.  But, these are odd cases as-is, so current
use and users matter when aligning the documentation and
implementation and I wouldn't be surprised if the entire
usage-space is between ours...

> > I wouldn't understand to instead change things around and make
> > "include" be inspected by default.  It's only the --with-headers
> > option that's broken.
> 
> So this would be:
> 
>  --with-headers=   Copy headers from  to sys-include
>  --with-headers Use headers from prefix include directory
>Use existing sys-include directory
>  --without-headers  Do not use any headers
> 
> I agree that this is the smallest change to current behavior;
> on the other hand, it seems quite odd overall (i.e. hardest to
> explain to someone unfamiliar with current behavior).

Arguably so, a bit unfortunate.

> At the very least, the docs would have to be adapted.

Agreed.

brgds, H-P


Re: Do not use TYPE_CANONICAL in useless_type_conversion

2015-10-08 Thread Jan Hubicka
> 
> Index: expr.c
> ===
> --- expr.c(revision 228604)
> +++ expr.c(working copy)
> @@ -6703,7 +6704,7 @@ store_field (rtx target, HOST_WIDE_INT b
> emit_group_store (temp_target, temp, TREE_TYPE (exp), size);
> temp = temp_target;
>   }
> -  else if (mode == BLKmode)
> +  else if (GET_MODE (temp) == BLKmode)
>   {
> /* Handle calls that return BLKmode values in registers.  */
> if (REG_P (temp) && TREE_CODE (exp) == CALL_EXPR)

This patch passed the testing, so if FIELD_DECL of VOIDmode referring BLKmode
type is a sane thing, I guess this is a right fix.  I would say that however
the type of FIELD_DECL should be compatible with the type of COMPONENT_REF and
that should be added to the Gimple operand testing and fixed at Ada side?

Honza


Fix use of availability in ipa-icf

2015-10-08 Thread Jan Hubicka
Hi,
this is a bug I noticed while reading the code.  We cannot assume interposable
objects to be equivalent unless they are actually the same symbols.

Bootstrapped/regtested x86_64-linux, committed.

Honza

* ipa-icf.c (sem_item::compare_symbol_references): Fix use
of availability.
Index: ipa-icf.c
===
--- ipa-icf.c   (revision 228625)
+++ ipa-icf.c   (working copy)
@@ -521,8 +521,8 @@ sem_item::compare_symbol_references (
   n1 = n1->ultimate_alias_target (&avail1);
   n2 = n2->ultimate_alias_target (&avail2);
 
-  if (avail1 >= AVAIL_INTERPOSABLE && ignored_nodes.get (n1)
-  && avail2 >= AVAIL_INTERPOSABLE && ignored_nodes.get (n2))
+  if (avail1 > AVAIL_INTERPOSABLE && ignored_nodes.get (n1)
+  && avail2 > AVAIL_INTERPOSABLE && ignored_nodes.get (n2))
 return true;
 
   return return_false_with_msg ("different references");


Re: [PATCH] [4/n] Fix minor SSA_NAME leaks

2015-10-08 Thread Jeff Law

On 10/08/2015 05:18 PM, Jeff Law wrote:

Another, this time in tree-ssa-loop-im.  In this case it's just a
missing release_defs for a statement we know won't have virtual operands.

As with the others, I've got a reduced testcase and will be trying to
figure out how to best utilize it.

Bootstrapped & regression tested on x86_64-linux-gnu.  Installed on the
trunk.

Whoops.  Forgot the actual change.

Jeff
commit e0e4b75785305e46a8419814757f0de30e258764
Author: law 
Date:   Thu Oct 8 23:19:35 2015 +

[PATCH] [4/n] Fix minor SSA_NAME leaks

* tree-ssa-loop-im.c (rewrite_bittest): Add missing call to
release_defs.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@228627 
138bc75d-0d04-0410-961f-82ee72b054a4

diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index 9b2436f..603e6d4 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -983,7 +983,9 @@ rewrite_bittest (gimple_stmt_iterator *bsi)
   rsi = *bsi;
   gsi_insert_before (bsi, stmt1, GSI_NEW_STMT);
   gsi_insert_before (&rsi, stmt2, GSI_SAME_STMT);
+  gimple *to_release = gsi_stmt (rsi);
   gsi_remove (&rsi, true);
+  release_defs (to_release);
 
   return stmt1;
 }


[PATCH] [4/n] Fix minor SSA_NAME leaks

2015-10-08 Thread Jeff Law
Another, this time in tree-ssa-loop-im.  In this case it's just a 
missing release_defs for a statement we know won't have virtual operands.


As with the others, I've got a reduced testcase and will be trying to 
figure out how to best utilize it.


Bootstrapped & regression tested on x86_64-linux-gnu.  Installed on the 
trunk.


Jeff
commit d2bd5c817c877bf2ad4d3a5fbef4c7985e2cad39
Author: Jeff Law 
Date:   Thu Oct 8 17:17:29 2015 -0600

[PATCH] [4/n] Fix minor SSA_NAME leaks

* tree-ssa-loop-im.c (rewrite_bittest): Add missing call to
release_defs.

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 31e2f30..2c760ee 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,8 @@
 2015-10-08  Jeff Law  
 
+   * tree-ssa-loop-im.c (rewrite_bittest): Add missing call to
+   release_defs.
+
* tree-stdarg.c (expand_ifn_va_arg_1): Add missing call to
unlink_stmt_vdef and release_ssa_name_fn.
 


Re: Do not use TYPE_CANONICAL in useless_type_conversion

2015-10-08 Thread Jan Hubicka
> Eric Botcazou  writes:
> 
> >> Thank you! I commited the patch.
> >
> > It breaks the Ada build on x86-64 though:
> 
> Also on ia64:

Sorry to hear that :(
> 
> /usr/local/gcc/test/Build/./prev-gcc/xgcc 
> -B/usr/local/gcc/test/Build/./prev-gcc/ -B/usr/ia64-suse-linux/bin/ 
> -B/usr/ia64-suse-linux/bin/ -B/usr/ia64-suse-linux/lib/ -isystem 
> /usr/ia64-suse-linux/include -isystem /usr/ia64-suse-linux/sys-include-c 
> -g -O2 -gtoggle  -gnatpg -gnata -W -Wall -nostdinc -I- -I. -Iada/generated 
> -Iada -I../../gcc/ada -I../../gcc/ada/gcc-interface 
> ../../gcc/ada/eval_fat.adb -o ada/eval_fat.o
> +===GNAT BUG DETECTED==+
> | 6.0.0 20151007 (experimental) (ia64-suse-linux) GCC error:   |
> | in convert_move, at expr.c:282   |
> | Error detected around ../../gcc/ada/eval_fat.adb:191:21  |

Lacking an IA-64 machine, I have trouble reproducing this.
Does the patch in https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00902.html help?
and if it doesn't can you possibly take a look at the backtrace and what trees 
are passed
to expand_assignment?

Thanks,
Honza
> 
> Andreas.
> 
> -- 
> Andreas Schwab, SUSE Labs, sch...@suse.de
> GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
> "And now for something completely different."


Re: [PATCH] New attribute to create target clones

2015-10-08 Thread Jan Hubicka
> >
> >Yes, here you have different names for different variants of the function
> >body. Basically this pass takes ctarget attribute and creates bunch of 
> >versions
> >of the functions and assigns them the proper target attributes, right?
> Right.  Given a single function in the source tree with the new
> attribute, we'll create clones and compile each clone with a
> different set of options.  It's got a lot of similarities to the
> multi-versioning code.  The key difference is with multi-versioning,
> you actually have a different source level implementation for each
> target while Evgeny's stuff has a single source implementation.
> 
> >
> >One thing I am confused about is why this does not happen early?
> >What happens to inlines of functions with specific target requirements? I.e.
> >if I compile with AVX enabled have function body with AVX code but then, at
> >late compilation, force a clone with -mno-avx?
> >
> >I would expect cloning to happen early, perhaps even before early 
> >optimizations...
> >Switching random target flags mid optimization queue seems dangerous.
> These shouldn't be inlined to the best of my knowledge.  We go back
> and direct all callers to a dispatcher.  Inlining them would be a
> mistake since the goal here is to specialize the clones around
> target capabilities.  Thus if something got inlined, then we lose
> that ability.

OK, I assume that every multiversioned function will end up in something
like this:

foo_ver1() target(...)
{
}
foo_ver2() target(...)
{
}
foo()
{
  dispatch either to foo_ver1() or foo_ver2()
}

I wonder why foo_ver1/foo_ver2 ever need to become public?  If the only way
to call them is via the dispatcher, then the code setting TREE_PUBLIC seems wrong.  If
there is a direct way to call them, then inlining is possible.

Of course it also depends what you inline into function. You can have

bar() target(-mavx) {fancy avx code}
foobar() { .. if (avx) bar();}
foo() ctarget(-mavx,-mno-avx) {foobar();}

Now if you compile with -mavx and because ctarget takes effect only after 
inlining,
at inlining time the target attributes will match and we can end up inlining 
bar->foobar->foo.
After that we multiversion foo and drop AVX flag we will likely get ICE at 
expansion
time.

> 
> 
> Since we're going through a dispatcher I don't think we're allowed
> to change the ABI across the clones.

If the dispatcher was something like

switch(value)
{
  case 1: foo_ver1(param); break;
  case 2: foo_ver2(param); break;
}
then it would be possible to change ABI if we can prove that param=0 in ipa-cp.
If the dispatcher uses indirect calls, then we probably want to consider
foo_ver* as having address taken and thus not local.

Honza
> 
> 
> >
> >Also when you are copying a function, you probably want to copy the 
> >associated
> >thunks and version them, too?
> Dunno on that.
> 
> Jeff


Re: [Patch] PowerPC IEEE 128-bit patch #7 (revised #2)

2015-10-08 Thread Joseph Myers
Question: what happens if you mix __float128 and __ibm128 in an arithmetic 
or conditional expression?

__float128 a;
__ibm128 b;
int x;
/* ... */
a + b;
x ? a : b;

(And likewise if one or both are the corresponding complex types.)  As I 
suggested in  I think 
this would best be rejected for both C and C++ (with testcases).  That 
accords with TS 18661-3 (just published) making floating-point conversions 
undefined where neither type's set of values is a subset of the other's.

The invalid_binary_op target hook should be usable to cover the case where 
a binary operation would cause implicit conversions, but I don't see an 
existing hook that would deal with ? : expressions.

-- 
Joseph S. Myers
jos...@codesourcery.com


[PATCH] [3/n] Fix minor SSA_NAME leaks

2015-10-08 Thread Jeff Law
And other minor leak.  This time in tree-stdarg.  Unlike other cases, 
we're dropping just the virtual definition, other definitions on the 
statement need to be preserved (they're going to be re-used). 
Additionally, this one is missing the call to unlink_stmt_vdef.


Like other cases, I've got a minimized test, but no good way to add it 
to the testsuite right now.


Bootstrapped and regression tested on x86_64-linux-gnu.  Installed on 
the trunk.


Jeff
commit 4d303443cc66bf32f3f045014dd22f0e475f0d50
Author: Jeff Law 
Date:   Thu Oct 8 14:46:03 2015 -0600

[PATCH] [3/n] Fix minor SSA_NAME leaks

* tree-stdarg.c (expand_ifn_va_arg_1): Add missing call to
unlink_stmt_vdef and release_ssa_name_fn.

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 64309c1..31e2f30 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,8 @@
 2015-10-08  Jeff Law  
 
+   * tree-stdarg.c (expand_ifn_va_arg_1): Add missing call to
+   unlink_stmt_vdef and release_ssa_name_fn.
+
* tree-ssa-dse.c (dse_optimize_stmt): Add missing call to
release_defs.
 
diff --git a/gcc/tree-stdarg.c b/gcc/tree-stdarg.c
index d69fa06..3e6d98c 100644
--- a/gcc/tree-stdarg.c
+++ b/gcc/tree-stdarg.c
@@ -1080,6 +1080,8 @@ expand_ifn_va_arg_1 (function *fun)
 
/* Remove the IFN_VA_ARG gimple_call.  It's the last stmt in the
   bb.  */
+   unlink_stmt_vdef (stmt);
+   release_ssa_name_fn (fun, gimple_vdef (stmt));
gsi_remove (&i, true);
gcc_assert (gsi_end_p (i));
 


Re: [PATCH] PR66870 PowerPC64 Enable gold linker with split stack

2015-10-08 Thread Matthias Klose

On 08.10.2015 20:56, Lynn A. Boger wrote:

I think my original fix with linux.h doing the #undef on
TARGET_CAN_SPLIT_STACK_64BIT is the right fix at least
for powerpc-linux-gnu 32 bit only.

It works for powerpc-linux-gnu without multilib and doesn't break
powerpc64-linux-gnu or powerpc64le-linux-gnu.

Can you tell me how you are configuring the multilib build that defaults to
powerpc-linux-gnu and how it
fails?  Maybe there is another problem for that combination.


Configured with: -v
 --with-pkgversion='Ubuntu 20151005-0ubuntu1'
 --with-bugurl='file:///usr/share/doc/gcc-snapshot/README.Bugs'
 --enable-languages=c,ada,c++,java,go,fortran,objc,obj-c++
 --prefix=/usr/lib/gcc-snapshot
 --enable-shared
 --enable-linker-build-id
 --disable-nls
 --with-sysroot=/
 --enable-clocale=gnu
 --enable-libstdcxx-debug
 --enable-libstdcxx-time=yes
 --with-default-libstdcxx-abi=new
 --enable-gnu-unique-object
 --disable-libitm
 --disable-libquadmath
 --enable-plugin
 --with-system-zlib
 --disable-browser-plugin
 --enable-java-awt=gtk
 --enable-gtk-cairo
 --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-6-snap-powerpc/jre
 --enable-java-home
 --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-6-snap-powerpc
 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-6-snap-powerpc
 --with-arch-directory=ppc
 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar
 --enable-objc-gc
 --enable-secureplt
 --disable-softfloat
 --with-cpu=default32
 --disable-softfloat
 --enable-targets=powerpc-linux,powerpc64-linux
 --enable-multiarch
 --disable-werror
 --with-long-double-128
 --enable-multilib
 --enable-checking=yes
 --build=powerpc-linux-gnu
 --host=powerpc-linux-gnu
 --target=powerpc-linux-gnu

fails in gotools with:
cc1: error: '-fsplit-stack' currently only supported on PowerPC64 GNU/Linux with 
glibc-2.18 or later

cc1: error: '-fsplit-stack' is not supported by this compiler configuration

this information is from the log below, but it's a parallel build, so a bit 
useless
https://launchpadlibrarian.net/220374353/buildlog_ubuntu-wily-powerpc.gcc-snapshot_20151005-0ubuntu1_BUILDING.txt.gz


As David noted, the use of TARGET_64BIT or TARGET_POWERPC64 won't work for this
#define.


I found that out too =)  Note that ada builds are currently broken on the trunk.

Matthias



Re: Do not use TYPE_CANONICAL in useless_type_conversion

2015-10-08 Thread Jan Hubicka
> On Thu, 8 Oct 2015, Jan Hubicka wrote:
> 
> > > 
> > > && TREE_CODE (outer_type) == OFFSET_TYPE
> > > 
> > > Ok with those changes.
> > 
> > Thank you! I commited the patch.
> > At a hike today it appeared to me that for ipa-icf and other calling 
> > conventions
> > checks we should not rely on useless_type_conversion_p because there may be
> > types that are compatible in gimple type system but have different calling
> > conventions.  I will hack calling convention comparer tomorrow - should not 
> > be
> > too hard, just doing the cumulative walk and comparing that the RTL 
> > containers
> > are the same.
> 
> I think the patch caused a bootstrap failure on x86_64-linux with Ada.
> We're having an aggregate copy SImode = BLKmode and end up with
> BLKmode from int_mode_for_mode (BLKmode) which later ICEs (obviously)
> in gen_lowpart.

Yep, there is pre-existing code to go from BLKmode to normal mode at
store field.
  else if (mode == BLKmode)
{ 
  /* Handle calls that return BLKmode values in registers.  */
  if (REG_P (temp) && TREE_CODE (exp) == CALL_EXPR)
{ 
  rtx temp_target = gen_reg_rtx (GET_MODE (temp));
  copy_blkmode_from_reg (temp_target, temp, TREE_TYPE (exp));
  temp = temp_target;
}
  else
{ 
  HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
  rtx temp_target;
  mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
  temp_target = gen_reg_rtx (mode);
  temp_target
= extract_bit_field (temp, size * BITS_PER_UNIT, 0, 1,
 temp_target, mode, mode);
  temp = temp_target;
}
}
  /* Store the value in the bitfield.  */

Problem here is that the EXP is schizophrenic:

(gdb) p debug_tree (exp)
 
unit size 
user align 32 symtab -169633040 alias set 12 canonical type 
0x76988c78
fields 
unsigned nonaddressable QI file s-regpat.adb line 1013 col 19
size 
unit size 
align 8 offset_align 128
offset 
bit offset  context 
 original field 
 chain >
Ada size 
chain >

arg 0  unit size 

align 32 symtab -169633120 alias set -1 canonical type 
0x76988bd0 fields  context  Ada size 
pointer_to_this  chain >
used SI file s-regpat.adb line 1013 col 19 size  unit size 
align 32 context  abstract_origin 
(reg/v:SI 1546 [ new_flags ])>
arg 1 
external packed bit-field nonaddressable decl_3 SI file s-regpat.adb 
line 1013 col 19 size 
unit size 
align 1 offset_align 128 offset  bit 
offset  bit_field_type  context >>

The type has BLKmode and size 32. DECL_SIZE of the FIELD_DECL is however 24 
(see it printed as Ada size).
The DECL_MODE of the FIELD_DECL is VOIDmode (not printed), while the TYPE_MODE 
of type contained is BLKmode.
Because get_inner_reference compute mode based on DECL_MODE:
  if (TREE_CODE (exp) == COMPONENT_REF)
{
  tree field = TREE_OPERAND (exp, 1);
  size_tree = DECL_SIZE (field);
  if (flag_strict_volatile_bitfields > 0
  && TREE_THIS_VOLATILE (exp)
  && DECL_BIT_FIELD_TYPE (field)
  && DECL_MODE (field) != BLKmode)
/* Volatile bitfields should be accessed in the mode of the
 field's type, not the mode computed based on the bit
 size.  */
mode = TYPE_MODE (DECL_BIT_FIELD_TYPE (field));
  else if (!DECL_BIT_FIELD (field))
mode = DECL_MODE (field);
  else if (DECL_MODE (field) == BLKmode)
blkmode_bitfield = true;

  *punsignedp = DECL_UNSIGNED (field);
}
We miss the check
  else if (mode == BLKmode)
and fail to convert the register and die in horrible death.

What is the reason behind this construct? Is that Ada bug?

Anyway, the following patch fixes the issue by caring about mode of the
temporary (which is controlled by the type) instead of the mode of the field. I
am testing it on x86_64-linux now.

Honza

Index: expr.c
===
--- expr.c  (revision 228604)
+++ expr.c  (working copy)
@@ -6703,7 +6704,7 @@ store_field (rtx target, HOST_WIDE_INT b
  emit_group_store (temp_target, temp, TREE_TYPE (exp), size);
  temp = temp_target;
}
-  else if (mode == BLKmode)
+  else if (GET_MODE (temp) == BLKmode)
{
  /* Handle calls that return BLKmode values in registers.  */
  if (REG_P (temp) && TREE_CODE (exp) == CALL_EXPR)


[PATCH] [2/n] Fix minor SSA_NAME leaks

2015-10-08 Thread Jeff Law
Another fairly obvious leak, this time in tree-ssa-dse.c where we again 
call gsi_remove without an associated release_defs.


The only thing of note here is this time we have virtual operations on 
the statement.  So before we call gsi_remove and release_defs, we have 
to call unlink_stmt_vdef to propagate away the uses of the VDEF.


tree-ssa-dse properly calls unlink_stmt_vdef, so the right things are 
happening as far as that's concerned.


Like the previous patch, I do have a minimized test, but it's unclear 
how we'd really want to put this into a testing harness at this point. 
If we get to an enforced no-leak policy in the manager, then these tests 
can just drop in as compile tests.  Or perhaps as a plug-in test where 
we can check the state of the name manager.


Anyway, bootstrapped and regression tested on x86_64-linux-gnu. 
Installed on the trunk.


Jeff

commit edf28250d6a25393684ee0e28f87fafe57183f76
Author: Jeff Law 
Date:   Thu Oct 8 14:25:42 2015 -0600

[PATCH] [2/n] Fix minor SSA_NAME leaks

* tree-ssa-dse.c (dse_optimize_stmt): Add missing call to
release_defs.

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 4286491..64309c1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2015-10-08  Jeff Law  
+
+   * tree-ssa-dse.c (dse_optimize_stmt): Add missing call to
+   release_defs.
+
 2015-10-08  H.J. Lu  
 
* config/i386/i386.c (ix86_compute_frame_layout): Round up the
diff --git a/gcc/tree-ssa-dse.c b/gcc/tree-ssa-dse.c
index ac9c05a..80ebdb6 100644
--- a/gcc/tree-ssa-dse.c
+++ b/gcc/tree-ssa-dse.c
@@ -271,6 +271,7 @@ dse_optimize_stmt (gimple_stmt_iterator *gsi)
  /* Remove the dead store.  */
  if (gsi_remove (gsi, true))
bitmap_set_bit (need_eh_cleanup, gimple_bb (stmt)->index);
+ release_defs (stmt);
}
  break;
}


Re: [PATCH] New attribute to create target clones

2015-10-08 Thread Evgeny Stupachenko
On Thu, Oct 8, 2015 at 10:00 PM, Jeff Law  wrote:
> On 09/24/2015 04:28 PM, Evgeny Stupachenko wrote:
>>
>> I've fixed ICE and review issues.
>> x86 make check and bootstrap passed.
>>
>> Thanks,
>> Evgeny
>>
>> ChangeLog
>>
>> 2015-09-25  Evgeny Stupachenko
>>
>> gcc/
>>  * Makefile.in (OBJS): Add multiple_target.o.
>>  * multiple_target.c (make_attribute): New.
>>  (create_dispatcher_calls): Ditto.
>>  (expand_target_clones): Ditto.
>>  (ipa_target_clone): Ditto.
>>  * passes.def (pass_target_clone): New ipa pass.
>>  * tree-pass.h (make_pass_target_clone): Ditto.
>>
>> gcc/c-family
>>  * c-common.c (handle_target_clones_attribute): New.
>>  * (c_common_attribute_table): Add handle_target_clones_attribute.
>>  * (handle_always_inline_attribute): Add check on target_clones
>>  attribute.
>>  * (handle_target_attribute): Ditto.
>>
>> gcc/testsuite
>>  * gcc.dg/mvc1.c: New test for multiple targets cloning.
>>  * gcc.dg/mvc2.c: Ditto.
>>  * gcc.dg/mvc3.c: Ditto.
>>  * gcc.dg/mvc4.c: Ditto.
>>  * gcc.dg/mvc5.c: Ditto.
>>  * gcc.dg/mvc6.c: Ditto.
>>  * gcc.dg/mvc7.c: Ditto.
>>  * g++.dg/ext/mvc1.C: Ditto.
>>  * g++.dg/ext/mvc2.C: Ditto.
>>  * g++.dg/ext/mvc3.C: Ditto.
>>
>> gcc/doc
>>  * doc/extend.texi (target_clones): New attribute description.
>>
>>
>
>>
>> target_clones.patch
>
> Sorry this has taken so long to come back to...  As I mentioned a couple
> months ago, I'd hoped Jan would chime in on the IPA/symtab requirements.
> But that didn't happen.
>
>
> SO I went back and reviewed the discussion between Jan, Ilya & myself WRT
> some of the rules around aliases, clones, etc.  I think the key question for
> this patch is whether or not the clones have the same assembler name or not.
> From looking at expand_target_clones, I'm confident the answer is the clones
> have different assembler names.  In fact, the assembler names are munged
> with the options used for that specific clone of the original function.
>
>
>>
>> +/* Makes a function attribute of the form NAME(ARG_NAME) and chains
>> +   it to CHAIN.  */
>> +
>> +static tree
>> +make_attribute (const char *name, const char *arg_name, tree chain)
>> +{
>> +  tree attr_name;
>> +  tree attr_arg_name;
>> +  tree attr_args;
>> +  tree attr;
>> +
>> +  attr_name = get_identifier (name);
>> +  attr_arg_name = build_string (strlen (arg_name), arg_name);
>> +  attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
>> +  attr = tree_cons (attr_name, attr_args, chain);
>> +  return attr;
>> +}
>
> This seems generic enough that I'd prefer it in attribs.c.  I was rather
> surprised when I looked and didn't find an existing routine to do this.
>
>
>> +
>> +/* If the call in NODE has multiple target attribute with multiple
>> fields,
>> +   replace it with dispatcher call and create dispatcher (once).  */
>> +
>> +static void
>> +create_dispatcher_calls (struct cgraph_node *node)
>> +{
>> +  cgraph_edge *e;
>> +  cgraph_edge *e_next;
>> +  for (e = node->callers; e ;e = (e == NULL) ? e_next : e->next_caller)
>
> That's a rather strange way to write the loop increment.  If I follow the
> loop logic correctly, it seems that we always end up using e->next_caller,
> it's just obscured.
>
> For the test if we're calling a versioned function, we just "continue".  e
> will be non-null and thus we use e->next_caller to set e for the next
> iteration.
>
> If the test for calling a versioned function falls, we set e_next to
> e->next_caller, then later set e to NULL.  That results in using e_next to
> set e for the next iteration.  But e_next was initialized to e->next_caller.
>
> So why not just write the loop increment as e = e->next-caller?

Because of this:
  e->redirect_callee (inode);
It modifies next_caller field.
The other way is to remember all what we want to redirect and create
one more loop through the modifications to apply them.

>
>
>
>> +{
>> +  tree resolver_decl;
>> +  tree idecl;
>> +  tree decl;
>> +  gimple *call = e->call_stmt;
>> +  struct cgraph_node *inode;
>> +
>> +  /* Checking if call of function is call of versioned function.
>> +Versioned function are not inlined, so there is no need to
>> +check for inline.  */
>
> This comment doesn't parse well.  Perhaps:
>
> /* Check if this is a call to a versioned function.  Versioned
>functions are not inlined, so there is no need to check for that.  */
>
>
>> +
>> +/* If the function in NODE has multiple target attribute with multiple
>> fields,
>> +   create the appropriate clone for each field.  */
>> +
>> +static bool
>> +expand_target_clones (struct cgraph_node *node)
>
> So this is probably getting a little large.  Can we look to refactor it a
> little?  It's not a huge deal and there's certainly code in GCC that is far
> worse, but it just feels like there's e

Re: [PATCH] New attribute to create target clones

2015-10-08 Thread Jeff Law

On 10/08/2015 01:23 PM, Jan Hubicka wrote:

Sorry this has taken so long to come back to...  As I mentioned a
couple months ago, I'd hoped Jan would chime in on the IPA/symtab
requirements.  But that didn't happen.


Sorry for that.  I had bit too many real life things this summer
and I am still trying to catch up.

It happens to us all :-)  No worries.



Ilya's code seems different from what this patch does. Ilya simply needs
multiple declarations for one physical assembler name (this is not an alias).
This is not currently supported by symtab (support for that was removed long
time ago as part of the one decl project) and I have some preliminary patches
to push out, but since they add basic sanity checking that the different
declarations of the same thing looks compatible I get too many positives I need
to walk through.  Those seems real bugs in glibc (which uses duplicated decls
for checking) and the pointer bounds code.
Right.  I was mostly concerned because I goof'd letting those bits from 
Ilya in and didn't want to repeat the mistake again.  It was pretty 
clear once I found the thread between the tree of us that Evgeny's 
patches were doing something rather different and didn't violate the 
assumptions of the symtab code.




Yes, here you have different names for different variants of the function
body. Basically this pass takes ctarget attribute and creates bunch of versions
of the functions and assigns them the proper target attributes, right?
Right.  Given a single function in the source tree with the new 
attribute, we'll create clones and compile each clone with a different 
set of options.  It's got a lot of similarities to the multi-versioning 
code.  The key difference is with multi-versioning, you actually have a 
different source level implementation for each target while Evgeny's 
stuff has a single source implementation.




One thing I am confused about is why this does not happen early?
What happens to inlines of functions with specific target requirements? I.e.
if I compile with AVX enabled have function body with AVX code but then, at
late compilation, force a clone with -mno-avx?

I would expect cloning to happen early, perhaps even before early 
optimizations...
Switching random target flags mid optimization queue seems dangerous.
These shouldn't be inlined to the best of my knowledge.  We go back and 
direct all callers to a dispatcher.  Inlining them would be a mistake 
since the goal here is to specialize the clones around target 
capabilities.  Thus if something got inlined, then we lose that ability.





As for the patch itself:
+  if (node->definition)
+   {
+ if (!node->has_gimple_body_p ())
+   return false;
+ node->get_body ();
+
+ /* Replacing initial function with clone only for 1st ctarget.  */
+ new_node = node->create_version_clone_with_body (vNULL, NULL,
+  NULL, false,
+  NULL, NULL,
+  "target_clone");
+ new_node->externally_visible = node->externally_visible;
+ new_node->address_taken = node->address_taken;
+ new_node->thunk = node->thunk;
+ new_node->alias = node->alias;
+ new_node->weakref = node->weakref;
+ new_node->cpp_implicit_alias = node->cpp_implicit_alias;
+ new_node->local.local = node->local.local;
+ TREE_PUBLIC (new_node->decl) = TREE_PUBLIC (node->decl);
+   }
Since you test it has gimple body, then you don't need to worry about
alias/thunk/weakrefs/implicit_alias properties. Those will be never set.
How does the dispatcher look like?  Can the function be considered local
in a sense that one can change calling conventions of one clone but not another?

Easiest to think of them as having the same abilities as multi-versioning.


Since we're going through a dispatcher I don't think we're allowed to 
change the ABI across the clones.





Also when you are copying a function, you probably want to copy the associated
thunks and version them, too?

Dunno on that.

Jeff


Re: FW: [PATCH] Target hook for disabling the delay slot filler.

2015-10-08 Thread Jeff Law

On 09/18/2015 05:10 AM, Simon Dardis wrote:

Are you trying to say that you have the option as to what kind of
branch to use?  ie, "ordinary", presumably without a delay slot or one
with a delay slot?



Is the "ordinary" actually just a nullified delay slot or some form of
likely/not likely static hint?


Specifically for MIPSR6: the ISA possesses traditional delay slot branches and
a normal branch (no delay slots, not annulling, no hints, subtle static hazard),
aka "compact branch" in MIPS terminology. They could be described as nullify
on taken delay slot branch but we saw little to no value in that.

Matthew Fortune provided a writeup with their handling in GCC:

https://gcc.gnu.org/ml/gcc-patches/2015-07/msg01892.html
Thanks. I never looked at that message, almost certainly because it was 
MIPS specific.  I'm trying hard to stay out of backends that have good 
active maintainers, and MIPS certainly qualifies on that point.






But what is the compact form at the micro-architectural level?  My
mips-fu has diminished greatly, but my recollection is the bubble is
always there.   Is that not the case?


The pipeline bubble will exist but the performance impact varies across
R6 cores. High-end OoO cores won't be impacted as much, but lower
end cores will. microMIPSR6 removes delay slot branches altogether which
pushes the simplest micro-architectures to optimize away the cost of a
pipeline bubble.

[ ... snip more micro-archticture stuff ... ]
Thanks.  That helps a lot.  I didn't realize the bubble was being 
squashed to varying degrees.  And FWIW, I wouldn't be surprised if you 
reach a point on the OoO cores where you'll just want to move away from 
delay slots totally and rely on your compact branches as much as 
possible.  It may give your hardware guys a degree of freedom that helps 
them in the common case (compact branches) at the expense of slowing 
down code with old fashioned delay slots.



Compact branches do a strange restriction in that they cannot be followed by a
CTI. This is to simplify branch predictors apparently but this may be lifted in
future ISA releases.
Come on! :-)  There's some really neat things you can do when you allow 
branches in delay slots.  The PA was particularly fun in that regard. 
My recollection is HP had some hand written assembly code in their 
libraries which exploited the out-of-line execution you could get in 
this case.  We never tried to exploit in GCC simply because the 
opportunities didn't see all that common or profitable.







If it is able to find insns from the commonly executed path that don't
have a long latency, then the fill is usually profitable (since the
pipeline bubble always exists).  However, pulling a long latency
instruction (say anything that might cache miss or an fdiv/fsqrt) off
the slow path and conditionally nullifying it can be *awful*.
Everything else is in-between.


I agree. The variability in profit/loss in a concern and I see two ways to deal
with it:

A) modify the delay slot filler so that it choses speculative instructions of
less than some $cost and avoid instruction duplication when the eager filler
picks an instruction from a block with multiple predecessors. Making such
changes would be invasive and require more target specific hooks.
The cost side here should be handled by existing mechanisms.  You just 
never allow anything other than simple arith, logicals & copies.


You'd need a hook to avoid this when copying was needed.

You'd probably also need some kind of target hook to indicate the level 
of prediction where this is profitable since the cost varies across your 
micro-architectures.


And you'd also have to worry about the special code which triggers when 
there's a well predicted branch, but a resource conflict.  In that case 
reorg can fill the slot from the predicted path and insert compensation 
code on the non-predicted path.






B) Use compact branches instead of speculative delay slot execution and forsake
variable performance for a consistent pipeline bubble by not using the
speculative delay filler altogether.

Between these two choices, B seems the better option due to its sheer simplicity.
Choosing neither gives speculative instruction execution when there could be a
small consistent penalty instead.

B is certainly easier.

The main objection I had was given my outdated knowledge of the MIPS 
processors it seemed like you were taking a step backwards.  You've 
cleared that up and if you're comfortable with the tradeoff, then I 
won't object to the target hook to disable eager filling.


Can you repost that patch?  Given I was the last one to do major work on 
reorg (~20 years ago mind you) it probably makes the most sense for me 
to own the review.


jeff




Re: [PATCH] New attribute to create target clones

2015-10-08 Thread Jan Hubicka
> Sorry this has taken so long to come back to...  As I mentioned a
> couple months ago, I'd hoped Jan would chime in on the IPA/symtab
> requirements.  But that didn't happen.

Sorry for that.  I had bit too many real life things this summer
and I am still trying to catch up.
> 
> 
> SO I went back and reviewed the discussion between Jan, Ilya &
> myself WRT some of the rules around aliases, clones, etc.  I think
> the key question for this patch is whether or not the clones have
> the same assembler name or not.  From looking at

Ilya's code seems different from what this patch does. Ilya simply needs
multiple declarations for one physical assembler name (this is not an alias).
This is not currently supported by symtab (support for that was removed long
time ago as part of the one decl project) and I have some preliminary patches
to push out, but since they add basic sanity checking that the different
declarations of the same thing look compatible, I get too many positives that I
need to walk through.  Those seem to be real bugs in glibc (which uses
duplicated decls for checking) and the pointer bounds code.

> expand_target_clones, I'm confident the answer is the clones have
> different assembler names.  In fact, the assembler names are munged

Yes, here you have different names for different variants of the function
body. Basically this pass takes the ctarget attribute and creates a bunch of
versions of the functions and assigns them the proper target attributes, right?

One thing I am confused about is why this does not happen early?
What happens to inlines of functions with specific target requirements? I.e.
if I compile with AVX enabled have function body with AVX code but then, at
late compilation, force a clone with -mno-avx?

I would expect cloning to happen early, perhaps even before early 
optimizations...
Switching random target flags mid optimization queue seems dangerous.

As for the patch itself:
+  if (node->definition)
+   {
+ if (!node->has_gimple_body_p ())
+   return false;
+ node->get_body ();
+
+ /* Replacing initial function with clone only for 1st ctarget.  */
+ new_node = node->create_version_clone_with_body (vNULL, NULL,
+  NULL, false,
+  NULL, NULL,
+  "target_clone");
+ new_node->externally_visible = node->externally_visible;
+ new_node->address_taken = node->address_taken;
+ new_node->thunk = node->thunk;
+ new_node->alias = node->alias;
+ new_node->weakref = node->weakref;
+ new_node->cpp_implicit_alias = node->cpp_implicit_alias;
+ new_node->local.local = node->local.local;
+ TREE_PUBLIC (new_node->decl) = TREE_PUBLIC (node->decl);
+   }
Since you test it has gimple body, then you don't need to worry about
alias/thunk/weakrefs/implicit_alias properties. Those will be never set.
What does the dispatcher look like?  Can the function be considered local
in a sense that one can change calling conventions of one clone but not another?

On the other hand, node->create_version_clone_with_body creates a function
that is local, if we want to create globally visible clones, I think we should
add a parameter there and avoid the localization followed by unlocalization.
(I know we already have too many parameters here, perhaps we could add a flags
parameter that will also handle the existing skip_return flag.)
For example, I think you want to have all functions with the same WEAK attributes
or in the same COMDAT group.

Also when you are copying a function, you probably want to copy the associated
thunks and version them, too?

+  id = get_identifier (new_asm_name);
+  symtab->change_decl_assembler_name (new_node->decl, id);

here I think you can just pass new_asm_name as clone_name. I.e. replace
the current use of "target_clone" string.

+  targetm.target_option.valid_attribute_p (new_node->decl, NULL,
+  TREE_VALUE (attributes), 0);

looks like return value should not be ignored. If attribute is not valid,
we need to error and do something sane.

Honza


Re: [PATCH][committed] Fix PR67652: wrong sizeof calculation in liboffloadmic

2015-10-08 Thread Ilya Verbin
On Mon, Sep 28, 2015 at 18:15:14 +0200, Jakub Jelinek wrote:
> > -char * env_var = (char*) 
> > malloc(sizeof("COI_DMA_CHANNEL_COUNT=2" + 1));
> > +char * env_var = (char*) 
> > malloc(sizeof("COI_DMA_CHANNEL_COUNT=2"));
> >  sprintf(env_var, "COI_DMA_CHANNEL_COUNT=2");
> >  putenv(env_var);  
> 
> Missing error handling if malloc returns NULL?

Fixed.

On Mon, Sep 28, 2015 at 09:19:30 -0700, Andrew Pinski wrote:
> Also why not just use strdup here? instead of malloc/sizeof/sprintf ?

Done.

Committed as obvious.


liboffloadmic/
* runtime/offload_engine.cpp (Engine::init_process): Use strdup instead
of sizeof+malloc+sprintf, check for return value.
* runtime/offload_env.cpp (MicEnvVar::get_env_var_kind): Check for
strdup return value.
* runtime/offload_host.cpp (__offload_init_library_once): Check for
strdup return value.  Fix size calculation of COI_HOST_THREAD_AFFINITY.
* runtime/emulator/coi_device.cpp (COIProcessWaitForShutdown): Check for
malloc return value.


diff --git a/liboffloadmic/runtime/offload_engine.cpp 
b/liboffloadmic/runtime/offload_engine.cpp
index 00b673a..4a88546 100644
--- a/liboffloadmic/runtime/offload_engine.cpp
+++ b/liboffloadmic/runtime/offload_engine.cpp
@@ -173,8 +173,9 @@ void Engine::init_process(void)
 // use putenv instead of setenv as Windows has no setenv.
 // Note: putenv requires its argument can't be freed or modified.
 // So no free after call to putenv or elsewhere.
-char * env_var = (char*) malloc(sizeof("COI_DMA_CHANNEL_COUNT=2"));
-sprintf(env_var, "COI_DMA_CHANNEL_COUNT=2");
+char * env_var = strdup("COI_DMA_CHANNEL_COUNT=2");
+   if (env_var == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
 putenv(env_var);  
 }
 }
diff --git a/liboffloadmic/runtime/offload_env.cpp 
b/liboffloadmic/runtime/offload_env.cpp
index 79f5f36..ac33b67 100644
--- a/liboffloadmic/runtime/offload_env.cpp
+++ b/liboffloadmic/runtime/offload_env.cpp
@@ -212,10 +212,14 @@ MicEnvVarKind MicEnvVar::get_env_var_kind(
 *env_var_name_length = 3;
 *env_var_name = *env_var_def = c;
 *env_var_def = strdup(*env_var_def);
+   if (*env_var_def == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
 return  c_mic_var;
 }
 *env_var_def = c + strlen("ENV=");
 *env_var_def = strdup(*env_var_def);
+   if (*env_var_def == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
 return c_mic_card_env;
 }
 if (isalpha(*c)) {
@@ -229,6 +233,8 @@ MicEnvVarKind MicEnvVar::get_env_var_kind(
 return c_no_mic;
 }
 *env_var_def = strdup(*env_var_def);
+if (*env_var_def == NULL)
+  LIBOFFLOAD_ERROR(c_malloc);
 return card_is_set? c_mic_card_var : c_mic_var;
 }
 
diff --git a/liboffloadmic/runtime/offload_host.cpp 
b/liboffloadmic/runtime/offload_host.cpp
index 08f626f..eec457d 100644
--- a/liboffloadmic/runtime/offload_host.cpp
+++ b/liboffloadmic/runtime/offload_host.cpp
@@ -5173,6 +5173,8 @@ static void __offload_init_library_once(void)
 if (strcasecmp(env_var, "none") != 0) {
 // value is composed of comma separated physical device indexes
 char *buf = strdup(env_var);
+   if (buf == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
 char *str, *ptr;
 for (str = strtok_r(buf, ",", &ptr); str != 0;
  str = strtok_r(0, ",", &ptr)) {
@@ -5245,7 +5247,9 @@ static void __offload_init_library_once(void)
 if (env_var != 0) {
 char * new_env_var =
(char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
-  sizeof(env_var) + 1);
+  strlen(env_var));
+   if (new_env_var == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
 sprintf(new_env_var, "COI_HOST_THREAD_AFFINITY=%s", env_var);
 putenv(new_env_var);
 }
@@ -5254,6 +5258,8 @@ static void __offload_init_library_once(void)
 env_var = getenv("MIC_LD_LIBRARY_PATH");
 if (env_var != 0) {
 mic_library_path = strdup(env_var);
+   if (mic_library_path == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
 }
 
 
@@ -5262,6 +5268,8 @@ static void __offload_init_library_once(void)
 const char *base_name = "offload_main";
 if (mic_library_path != 0) {
 char *buf = strdup(mic_library_path);
+   if (buf == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
 char *try_name = (char*) alloca(strlen(mic_library_path) +
 strlen(base_name) + 2);
 char *dir, *ptr;
@@ -5275,6 +5283,8 @@ static void __offload_init_library_once(void)
 struct stat st;
 if (stat(try_name, &st) == 0 && S_ISREG(st.st_mode)) {
 mic_device_main = strdup(try_name);
+   if (mic_device_main == NUL

[Patch] PowerPC IEEE 128-bit patch #7 (revised #2)

2015-10-08 Thread Michael Meissner
This is the revised patch #7 for the IEEE 128-bit software floating point
support in the PowerPC. I have tested this on big endian power7 (both 32/64
bit) and little endian power8 with no regressions. Is this ok to install in the
trunk?

This patch should allow __float128 support on VSX systems, but it won't be
usable until patch #8 goes in that adds support in libgcc. It is expected that
we will need to update the glibc sources (which are where the master definition
for the soft-fp directory comes from) to add kf mode support for IEEE 128-bit
floating point. I have intermediate patches that add the support by using sed
to transform the <...>tf<...> names into <...>kf<...>.

At a high level, the changes in this patch are:

1) As we discussed previously, the option name is changed from
-mfloat128-software and -mfloat128-none to be -mfloat128 and -mno-float128.

2) Add defines so that the user can determine if __float128 is available, and
what is the default format for long double.

3) I removed the code I previously had to disallow IFmode/KFmode if -mfloat128
is not enabled.

4) Like other vector types, don't allow pre-increment and pre-decrement memory
forms that reload eventually has to fix up.

5) Add IEEE 128-bit register allocation, and call/return as vector types. In a
few places, I had to modify tests against VECTOR_TYPE_P to use
ALTIVEC_OR_VSX_VECTOR_MODE, since the IEEE 128-bit floating point types aren't
considered a vector type by the machine independent portion of the compiler.

6) Add support for all of the emulator names. In this version of the patch, I
added support for converting between the various decimal types and __float128
(but so far those new emulated functions have not been written). In addition, I
modified the compare function to have 2 variants, __cmpukf2 which is intended
to mimic the behavior of the 'fcmpu' instruction, and __cmpokf2 which is
intended to mimic the behaviour of the 'fcmpo' instruction. At the present
time, I have not written the __cmpokf2 function, and the compiler does not
currently generated 'fcmpo'.

7) I added support for the 'q' and 'Q' suffix for explicit __float128 constants.

8) I reordered the tests in MODES_TIEABLE_P so that vector types are tested
before scalar floating point types, so that __float128 pairs with vector
double, but not with long double (if IBM extended double is the long double
format) and __ibm128.

9) I updated the documentation for the new switch and for the __float128 and
__ibm128 keywords.

10) I went through rs6000.md and cloned all of the support for the 128-bit
types, so that you can use the right operation on KFmode for IEEE types, IFmode
for IBM types, and TFmode will be treated as KFmode if -mabi=ieeelongdouble and
IFmode if -mabi=ibmlongdouble.

[gcc]
2015-10-06  Michael Meissner  

* config/rs6000/rs6000-cpus.def (POWERPC_MASKS): Add -mfloat128.

* config/rs6000/rs6000.opt (-mfloat128): Replace the switches
-mfloat128-{software,none} with -m{,no-}float128.

* config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Add defines
to tell users whether __float128 is available, and whether long
double is IBM extended double or IEEE 128-bit floating point.

* config/rs6000/rs6000.c (TARGET_C_MODE_FOR_SUFFIX): Define.
(rs6000_hard_regno_mode_ok): Remove code not allowing 128-bit
floating types from any register if the appropriate option is not
used.
(rs6000_debug_reg_global): Delete -mfloat128-{software,none}
debugging.
(rs6000_setup_reg_addr_masks): Don't allow pre-increment or
pre-decrement on IEEE 128-bit floating point.
(rs6000_init_hard_regno_mode_ok): Add support for IEEE 128-bit
floating point types.
(rs6000_option_override_internal): Replace the switches
-mfloat128-{software,none} with -m{,no-}float128, and move the
tests to be closer to other ISA 2.06/2.07 options.
(rs6000_gen_le_vsx_permute): Add support for IEEE 128-bit floating
point.
(init_cumulative_args): Assume that IEEE 128-bit floating point
emulation routines have prototypes, and don't need to pass the
arguments in the save area as well as the Altivec register set.
(rs6000_function_arg): Likewise.
(rs6000_arg_partial_bytes): Likewise.
(rs6000_init_builtins): Set up ieee128_float_type_node to always
use KFmode, and ibm128_float_type_node to always use IFmode.
(init_float128_ibm): Split rs6000_init_libfuncs into 2 functions,
one for IEEE 128-bit floating point, and one for IBM extended
double floating point. For IFmode, use the traditional IBM
extended double names. For TFmode, if -mabi=ieeelongdouble, use
the names for KFmode.
(init_float128_ieee): Likewise.
(rs6000_init_libfuncs): Likewise.
(rs6000_generate_compare): For IEEE 128-bit floating point, use
the compare function that

Re: [PATCH] New attribute to create target clones

2015-10-08 Thread Jeff Law

On 09/24/2015 04:28 PM, Evgeny Stupachenko wrote:

I've fixed ICE and review issues.
x86 make check and bootstrap passed.

Thanks,
Evgeny

ChangeLog

2015-09-25  Evgeny Stupachenko

gcc/
 * Makefile.in (OBJS): Add multiple_target.o.
 * multiple_target.c (make_attribute): New.
 (create_dispatcher_calls): Ditto.
 (expand_target_clones): Ditto.
 (ipa_target_clone): Ditto.
 * passes.def (pass_target_clone): New ipa pass.
 * tree-pass.h (make_pass_target_clone): Ditto.

gcc/c-family
 * c-common.c (handle_target_clones_attribute): New.
 * (c_common_attribute_table): Add handle_target_clones_attribute.
 * (handle_always_inline_attribute): Add check on target_clones
 attribute.
 * (handle_target_attribute): Ditto.

gcc/testsuite
 * gcc.dg/mvc1.c: New test for multiple targets cloning.
 * gcc.dg/mvc2.c: Ditto.
 * gcc.dg/mvc3.c: Ditto.
 * gcc.dg/mvc4.c: Ditto.
 * gcc.dg/mvc5.c: Ditto.
 * gcc.dg/mvc6.c: Ditto.
 * gcc.dg/mvc7.c: Ditto.
 * g++.dg/ext/mvc1.C: Ditto.
 * g++.dg/ext/mvc2.C: Ditto.
 * g++.dg/ext/mvc3.C: Ditto.

gcc/doc
 * doc/extend.texi (target_clones): New attribute description.






target_clones.patch
Sorry this has taken so long to come back to...  As I mentioned a couple 
months ago, I'd hoped Jan would chime in on the IPA/symtab requirements. 
 But that didn't happen.



So I went back and reviewed the discussion between Jan, Ilya & myself 
WRT some of the rules around aliases, clones, etc.  I think the key 
question for this patch is whether or not the clones have the same 
assembler name or not.  From looking at expand_target_clones, I'm 
confident the answer is the clones have different assembler names.  In 
fact, the assembler names are munged with the options used for that 
specific clone of the original function.





+/* Makes a function attribute of the form NAME(ARG_NAME) and chains
+   it to CHAIN.  */
+
+static tree
+make_attribute (const char *name, const char *arg_name, tree chain)
+{
+  tree attr_name;
+  tree attr_arg_name;
+  tree attr_args;
+  tree attr;
+
+  attr_name = get_identifier (name);
+  attr_arg_name = build_string (strlen (arg_name), arg_name);
+  attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
+  attr = tree_cons (attr_name, attr_args, chain);
+  return attr;
+}
This seems generic enough that I'd prefer it in attribs.c.  I was rather 
surprised when I looked and didn't find an existing routine to do this.




+
+/* If the call in NODE has multiple target attribute with multiple fields,
+   replace it with dispatcher call and create dispatcher (once).  */
+
+static void
+create_dispatcher_calls (struct cgraph_node *node)
+{
+  cgraph_edge *e;
+  cgraph_edge *e_next;
+  for (e = node->callers; e ;e = (e == NULL) ? e_next : e->next_caller)
That's a rather strange way to write the loop increment.  If I follow 
the loop logic correctly, it seems that we always end up using 
e->next_caller, it's just obscured.


For the test if we're calling a versioned function, we just "continue". 
 e will be non-null and thus we use e->next_caller to set e for the 
next iteration.


If the test for calling a versioned function fails, we set e_next to 
e->next_caller, then later set e to NULL.  That results in using e_next 
to set e for the next iteration.  But e_next was initialized to 
e->next_caller.


So why not just write the loop increment as e = e->next_caller?




+{
+  tree resolver_decl;
+  tree idecl;
+  tree decl;
+  gimple *call = e->call_stmt;
+  struct cgraph_node *inode;
+
+  /* Checking if call of function is call of versioned function.
+Versioned function are not inlined, so there is no need to
+check for inline.  */

This comment doesn't parse well.  Perhaps:

/* Check if this is a call to a versioned function.  Versioned
   functions are not inlined, so there is no need to check for that.  */



+
+/* If the function in NODE has multiple target attribute with multiple fields,
+   create the appropriate clone for each field.  */
+
+static bool
+expand_target_clones (struct cgraph_node *node)
So this is probably getting a little large.  Can we look to refactor it 
a little?  It's not a huge deal and there's certainly code in GCC that 
is far worse, but it just feels like there's enough going on in this 
code that there ought to be 2-3 helpers for the larger chunks of work 
going on inside this code.


The first is the attribute parsing.  The second is creation of the 
nodes, and the final would be munging the name and attributes on the clones.


I'm slightly concerned with using the pretty printer to build up the new 
name.  Is there precedent for this anywhere else in GCC?


When creating the munged name, don't you also have to make sure that 
other symbols that aren't supported for names don't sneak through?  I 
see that you replace =

Re: [PATCH] PR66870 PowerPC64 Enable gold linker with split stack

2015-10-08 Thread Lynn A. Boger
I think my original fix with linux.h doing the #undef on 
TARGET_CAN_SPLIT_STACK_64BIT is the right fix at least

for powerpc-linux-gnu 32 bit only.

It works for powerpc-linux-gnu without multilib and doesn't break 
powerpc64-linux-gnu or powerpc64le-linux-gnu.


Can you tell me how you are configuring the multilib build that defaults 
to powerpc-linux-gnu and how it

fails?  Maybe there is another problem for that combination.

As David noted, the use of TARGET_64BIT or TARGET_POWERPC64 won't work 
for this #define.


On 10/07/2015 12:31 PM, Matthias Klose wrote:

On 07.10.2015 17:36, Lynn A. Boger wrote:

Pretty sure this is the fix, but still doing some testing.


linux.h isn't included for multilib enabled builds defaulting to 
powerpc-linux-gnu, I am currently testing


--- gcc/config/rs6000/sysv4.h(revision 228571)
+++ gcc/config/rs6000/sysv4.h(working copy)
@@ -943,8 +943,9 @@
 /* On ppc64 and ppc64le, split stack is only support for
64 bit. */
 #undef TARGET_CAN_SPLIT_STACK_64BIT
-#if TARGET_GLIBC_MAJOR > 2 \
-  || (TARGET_GLIBC_MAJOR == 2 && TARGET_GLIBC_MINOR >= 18)
+#if TARGET_64BIT \
+  && (TARGET_GLIBC_MAJOR > 2 \
+  || (TARGET_GLIBC_MAJOR == 2 && TARGET_GLIBC_MINOR >= 18))
 #define TARGET_CAN_SPLIT_STACK_64BIT
 #endif








Re: [PATCH v6][aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math

2015-10-08 Thread Evandro Menezes

LGTM

--
Evandro Menezes

On 01/02/1970 09:27 PM, Benedikt Huber wrote:

This sixth revision of the patch:
  * Cleans up style issues.
  * Makes test conform to standards.

Ok for check in.

Benedikt Huber (1):
   2015-10-02  Benedikt Huber  
Philipp Tomsich  

  gcc/ChangeLog  |  19 
  gcc/config/aarch64/aarch64-builtins.c  | 112 
  gcc/config/aarch64/aarch64-protos.h|   3 +
  gcc/config/aarch64/aarch64-simd.md |  27 +
  gcc/config/aarch64/aarch64-tuning-flags.def|   1 +
  gcc/config/aarch64/aarch64.c   | 115 -
  gcc/config/aarch64/aarch64.md  |   3 +
  gcc/config/aarch64/aarch64.opt |   4 +
  gcc/doc/invoke.texi|  12 +++
  .../gcc.target/aarch64/rsqrt-asm-check_1.c |  65 
  gcc/testsuite/gcc.target/aarch64/rsqrt_1.c | 111 
  11 files changed, 467 insertions(+), 5 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check_1.c
  create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt_1.c





Re: PATCH: Round up the SSE register save area only if needed

2015-10-08 Thread Uros Bizjak
On Thu, Oct 8, 2015 at 8:29 PM, H.J. Lu  wrote:
> There is no point to round up the SSE register save area to 16 bytes if
> the incoming stack boundary is less than 16 bytes.
>
> OK for trunk?

OK, with the improved comment:

/* The only ABI that has saved SSE registers (Win64) also has a
   16-byte aligned default stack, and thus we don't need to be
   within the re-aligned local stack frame to save them.  In case
   incoming stack boundary is aligned to less than 16 bytes,
   unaligned move of SSE register will be emitted, so there is
   no point to round up the SSE register save area outside the
   re-aligned local stack frame to 16 bytes.  */

> H.J.
> ---
> * config/i386/i386.c (ix86_compute_frame_layout): Round up the
> SSE register save area to 16 bytes only if the incoming stack
> boundary is no less than 16 bytes.

Thanks,
Uros.


PATCH: Round up the SSE register save area only if needed

2015-10-08 Thread H.J. Lu
There is no point to round up the SSE register save area to 16 bytes if
the incoming stack boundary is less than 16 bytes.

OK for trunk?

H.J.
---
* config/i386/i386.c (ix86_compute_frame_layout): Round up the
SSE register save area to 16 bytes only if the incoming stack
boundary is no less than 16 bytes.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index a24bd26..7f0479c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -11382,10 +11382,11 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
   /* Align and set SSE register save area.  */
   if (frame->nsseregs)
 {
-  /* The only ABI that has saved SSE registers (Win64) also has a
- 16-byte aligned default stack, and thus we don't need to be
-within the re-aligned local stack frame to save them.  */
-  offset = ROUND_UP (offset, 16);
+  /* There is is no point to round up the SSE register save area
+to 16 bytes if the incoming stack boundary is less than 16
+bytes.  */
+  if (ix86_incoming_stack_boundary >= 128)
+   offset = ROUND_UP (offset, 16);
   offset += frame->nsseregs * 16;
 }
   frame->sse_reg_save_offset = offset;


Re: Move some bit and binary optimizations in simplify and match

2015-10-08 Thread Bernd Schmidt

On 10/08/2015 08:03 PM, Joseph Myers wrote:

On Thu, 8 Oct 2015, Bernd Schmidt wrote:


On 10/07/2015 11:54 AM, Hurugalawadi, Naveen wrote:

Move Fold X & (X ^ Y) as X & ~Y to match.pd.
Move Fold X & (Y ^ X) as ~Y & X to match.pd.


I wonder if we shouldn't try to autogenerate patterns such as these. I did
something like that for a different project a long time ago. Generate
expressions up to a certain level of complexity, identify which ones are
equivalent, and pick the one with the lowest cost for simplifications...


Any bitwise expression whose ultimate operands are X, Y, 0 and -1
(possibly with conversions among types of the same width) could be
canonicalized to one of: 0, -1, X, Y, ~X, ~Y, X^Y, X^~Y, A&B or A|B (where
A is X or ~X and B is Y or ~Y).  I don't guarantee those are the best
canonical forms, but if you're folding this sort of expression you ought
to be able to make GCC fold all such expressions down to some such form
(and fold away all equality comparisons among such expressions with
constant value).


I was actually thinking of also doing this for more complex expressions 
with more than two different operands.



Bernd


Re: Move some bit and binary optimizations in simplify and match

2015-10-08 Thread Joseph Myers
On Thu, 8 Oct 2015, Bernd Schmidt wrote:

> On 10/07/2015 11:54 AM, Hurugalawadi, Naveen wrote:
> > Move Fold X & (X ^ Y) as X & ~Y to match.pd.
> > Move Fold X & (Y ^ X) as ~Y & X to match.pd.
> 
> I wonder if we shouldn't try to autogenerate patterns such as these. I did
> something like that for a different project a long time ago. Generate
> expressions up to a certain level of complexity, identify which ones are
> equivalent, and pick the one with the lowest cost for simplifications...

Any bitwise expression whose ultimate operands are X, Y, 0 and -1 
(possibly with conversions among types of the same width) could be 
canonicalized to one of: 0, -1, X, Y, ~X, ~Y, X^Y, X^~Y, A&B or A|B (where 
A is X or ~X and B is Y or ~Y).  I don't guarantee those are the best 
canonical forms, but if you're folding this sort of expression you ought 
to be able to make GCC fold all such expressions down to some such form 
(and fold away all equality comparisons among such expressions with 
constant value).

Now, such canonicalization could be done with a finite number of 
autogenerated patterns (if you can fold ~(A BINOP B), (A BINOP B) BINOP C 
and (A BINOP B) BINOP (C BINOP D), for A, B, C, D from 0, -1, X, Y, ~X, 
~Y, folding for more complicated expressions falls out).  I don't know if 
that's the best way to do such folding or not.

Given such folding, autogenerating expressions of the form ((A BINOP B) 
BINOP (C BINOP D)) == CANONICAL_FORM seems a plausible way of getting 
testsuite coverage for the folding (and for that matter for seeing what 
such folding is missing at present).

-- 
Joseph S. Myers
jos...@codesourcery.com


[PATCH] [1/n] Fix minor SSA_NAME leaks

2015-10-08 Thread Jeff Law
This is the first of several patches to fix various SSA_NAME leaks 
throughout the compiler.  I'm hoping we'll get to the point where 
they're all plugged and we institute a no-leak policy in the SSA_NAME 
manager.


Until then, I'll be plugging leaks.  This one is pretty obvious.  We 
call gsi_remove to remove the statement, but never release the 
definitions back to the manager.  In this case we know the statements 
don't have any virtual operands, so gsi_remove;release_defs is the 
proper sequencing.


Bootstrapped and regression tested on x86_64-linux-gnu.  I've got a 
minimized test for the leak.  I pondered adding some debugging dump info 
for released names and could still do that and scan the debugging dumps 
to ensure things are released.  I figured that would bloat the debugging 
dumps horribly, so I didn't implement it.  I'm open to suggestions here.


Installed on the trunk.

Jeff

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b2e4f6a..9f84b6e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2015-10-08  Jeff Law  
+
+   * tree-ssa-phiopt.c (factor_out_conversion): Add missing calls to
+   release_ssa_name.  Fix typo in comment.
+
 2015-10-08  Nathan Sidwell  
 
* config/nvptx/nvptx.h (struct machine_function): Add comment.
diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c
index f33ca5c..cfa3868 100644
--- a/gcc/tree-ssa-phiopt.c
+++ b/gcc/tree-ssa-phiopt.c
@@ -511,10 +511,13 @@ factor_out_conditional_conversion (edge e0, edge e1, gphi 
*phi,
   /* Remove the old cast(s) that has single use.  */
   gsi_for_def = gsi_for_stmt (arg0_def_stmt);
   gsi_remove (&gsi_for_def, true);
+  release_defs (arg0_def_stmt);
+
   if (arg1_def_stmt)
 {
   gsi_for_def = gsi_for_stmt (arg1_def_stmt);
   gsi_remove (&gsi_for_def, true);
+  release_defs (arg1_def_stmt);
 }
 
   add_phi_arg (newphi, new_arg0, e0, locus);
@@ -527,7 +530,7 @@ factor_out_conditional_conversion (edge e0, edge e1, gphi 
*phi,
   gsi = gsi_after_labels (gimple_bb (phi));
   gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 
-  /* Remove he original PHI stmt.  */
+  /* Remove the original PHI stmt.  */
   gsi = gsi_for_stmt (phi);
   gsi_remove (&gsi, true);
   return true;


[nvptx] fix some c++ tests

2015-10-08 Thread Nathan Sidwell
I've committed this to trunk.  The C++ ABI now returns a pointer to the 
passed-in artificial arg that points to the return area.  consequently 
return-in-mem and type_mode(return_type) == VOIDmode are  not tautologies.


nathan
2015-10-08  Nathan Sidwell  

	* config/nvptx/nvptx.h (struct machine_function): Add comment.
	* config/nvptx/nvptx.c (nvptx_declare_function_name): Functions
	may return pointer as well as in memory.
	(nvptx_output_return): Likewise.

Index: gcc/config/nvptx/nvptx.c
===
--- gcc/config/nvptx/nvptx.c	(revision 228617)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -531,13 +531,8 @@ nvptx_declare_function_name (FILE *file,
   nvptx_write_function_decl (s, name, decl);
   fprintf (file, "%s", s.str().c_str());
 
-  bool return_in_mem = false;
-  if (TYPE_MODE (result_type) != VOIDmode)
-{
-  machine_mode mode = TYPE_MODE (result_type);
-  if (!RETURN_IN_REG_P (mode))
-	return_in_mem = true;
-}
+  bool return_in_mem = (TYPE_MODE (result_type) != VOIDmode
+			&& !RETURN_IN_REG_P (TYPE_MODE (result_type)));
 
   fprintf (file, "\n{\n");
 
@@ -547,9 +542,13 @@ nvptx_declare_function_name (FILE *file,
 		   false, return_in_mem);
   if (return_in_mem)
 fprintf (file, "\t.reg.u%d %%ar1;\n", GET_MODE_BITSIZE (Pmode));
-  else if (TYPE_MODE (result_type) != VOIDmode)
+
+  /* C++11 ABI causes us to return a reference to the passed in
+ pointer for return_in_mem.  */
+  if (cfun->machine->ret_reg_mode != VOIDmode)
 {
-  machine_mode mode = arg_promotion (TYPE_MODE (result_type));
+  machine_mode mode = arg_promotion
+	((machine_mode)cfun->machine->ret_reg_mode);
   fprintf (file, "\t.reg%s %%retval;\n",
 	   nvptx_ptx_type_from_mode (mode, false));
 }
@@ -635,17 +634,13 @@ nvptx_declare_function_name (FILE *file,
 const char *
 nvptx_output_return (void)
 {
-  tree fntype = TREE_TYPE (current_function_decl);
-  tree result_type = TREE_TYPE (fntype);
-  if (TYPE_MODE (result_type) != VOIDmode)
+  machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;
+
+  if (mode != VOIDmode)
 {
-  machine_mode mode = TYPE_MODE (result_type);
-  if (RETURN_IN_REG_P (mode))
-	{
-	  mode = arg_promotion (mode);
-	  fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
-		   nvptx_ptx_type_from_mode (mode, false));
-	}
+  mode = arg_promotion (mode);
+  fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
+	   nvptx_ptx_type_from_mode (mode, false));
 }
 
   return "ret;";
Index: gcc/config/nvptx/nvptx.h
===
--- gcc/config/nvptx/nvptx.h	(revision 228617)
+++ gcc/config/nvptx/nvptx.h	(working copy)
@@ -228,7 +228,7 @@ struct GTY(()) machine_function
   bool has_call_with_varargs;
   bool has_call_with_sc;
   HOST_WIDE_INT outgoing_stdarg_size;
-  int ret_reg_mode;
+  int ret_reg_mode; /* machine_mode not defined yet. */
   int punning_buffer_size;
 };
 #endif


[PATCH, i386, AVX512] PR target/67895: Fix position of embedded rounding/SAE mode in AVX512 vrangep* and vcvt?si2s* instructions.

2015-10-08 Thread Alexander Fomin
Hi All,

This patch addresses PR target/67895. For some AVX512 instructions
we've used  to emit embedded rounding/SAE specifier in a wrong place.
The patch fixes its position for vrange* and vcvt?si2s* instructions.
I've also updated regular expressions for corresponding assembly in
i386 testsuite, so they act like regression tests now.

Bootstrap is OK, waiting for regression testing now.
If the last one is fine, is this patch OK for trunk and 5 branch?

Regards,
Alexander
---
gcc/

PR target/67895
* config/i386/sse.md (define_insn "sse_cvtsi2ss"):
Adjust embedded rounding/SAE specifier position.
(define_insn "sse_cvtsi2ssq"): Likewise.
(define_insn "cvtusi232"):
Likewise.
(define_insn "cvtusi264"):
Likewise.
(define_insn "sse2_cvtsi2sdq"):
Likewise.
(define_insn "avx512dq_rangep"):
Likewise.
(define_insn "avx512dq_ranges"):
Likewise.

gcc/testsuite

PR target/67895
* gcc.target/i386/avx512dq-vrangepd-1.c: Adjust assembly regexp.
* gcc.target/i386/avx512dq-vrangeps-1.c: Likewise.
* gcc.target/i386/avx512dq-vrangesd-1.c: Likewise.
* gcc.target/i386/avx512dq-vrangess-1.c: Likewise.
* gcc.target/i386/avx512f-vcvtsi2sd64-1.c: Likewise.
* gcc.target/i386/avx512f-vcvtsi2ss-1.c: Likewise.
* gcc.target/i386/avx512f-vcvtsi2ss64-1.c: Likewise.
* gcc.target/i386/avx512f-vcvtusi2sd64-1.c: Likewise.
* gcc.target/i386/avx512f-vcvtusi2ss-1.c: Likewise.
* gcc.target/i386/avx512f-vcvtusi2ss64-1.c: Likewise.
---
 gcc/config/i386/sse.md | 14 +++---
 gcc/testsuite/gcc.target/i386/avx512dq-vrangepd-1.c|  6 +++---
 gcc/testsuite/gcc.target/i386/avx512dq-vrangeps-1.c|  6 +++---
 gcc/testsuite/gcc.target/i386/avx512dq-vrangesd-1.c|  2 +-
 gcc/testsuite/gcc.target/i386/avx512dq-vrangess-1.c|  2 +-
 gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2sd64-1.c  |  2 +-
 gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss-1.c|  2 +-
 gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss64-1.c  |  2 +-
 gcc/testsuite/gcc.target/i386/avx512f-vcvtusi2sd64-1.c |  2 +-
 gcc/testsuite/gcc.target/i386/avx512f-vcvtusi2ss-1.c   |  2 +-
 gcc/testsuite/gcc.target/i386/avx512f-vcvtusi2ss64-1.c |  2 +-
 11 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e5680f1..43dcc6a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4014,7 +4014,7 @@
   "@
cvtsi2ss\t{%2, %0|%0, %2}
cvtsi2ss\t{%2, %0|%0, %2}
-   vcvtsi2ss\t{%2, %1, %0|%0, %1, %2}"
+   vcvtsi2ss\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "sseicvt")
(set_attr "athlon_decode" "vector,double,*")
@@ -4036,7 +4036,7 @@
   "@
cvtsi2ssq\t{%2, %0|%0, %2}
cvtsi2ssq\t{%2, %0|%0, %2}
-   vcvtsi2ssq\t{%2, %1, %0|%0, %1, %2}"
+   vcvtsi2ssq\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "sseicvt")
(set_attr "athlon_decode" "vector,double,*")
@@ -4149,7 +4149,7 @@
  (match_operand:VF_128 1 "register_operand" "v")
  (const_int 1)))]
   "TARGET_AVX512F && "
-  "vcvtusi2\t{%2, %1, %0|%0, %1, 
%2}"
+  "vcvtusi2\t{%2, %1, %0|%0, %1, 
%2}"
   [(set_attr "type" "sseicvt")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
@@ -4163,7 +4163,7 @@
  (match_operand:VF_128 1 "register_operand" "v")
  (const_int 1)))]
   "TARGET_AVX512F && TARGET_64BIT"
-  "vcvtusi2\t{%2, %1, %0|%0, %1, 
%2}"
+  "vcvtusi2\t{%2, %1, %0|%0, %1, 
%2}"
   [(set_attr "type" "sseicvt")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
@@ -4429,7 +4429,7 @@
   "@
cvtsi2sdq\t{%2, %0|%0, %2}
cvtsi2sdq\t{%2, %0|%0, %2}
-   vcvtsi2sdq\t{%2, %1, %0|%0, %1, %2}"
+   vcvtsi2sdq\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "sseicvt")
(set_attr "athlon_decode" "double,direct,*")
@@ -18684,7 +18684,7 @@
   (match_operand:SI 3 "const_0_to_15_operand")]
  UNSPEC_RANGE))]
   "TARGET_AVX512DQ && "
-  "vrange\t{%3, %2, %1, 
%0|%0, %1, %2, %3}"
+  "vrange\t{%3, %2, %1, 
%0|%0, %1, %2, %3}"
   [(set_attr "type" "sse")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
@@ -18700,7 +18700,7 @@
  (match_dup 1)
  (const_int 1)))]
   "TARGET_AVX512DQ"
-  "vrange\t{%3, %2, %1, %0|%0, %1, %2, 
%3}"
+  "vrange\t{%3, %2, %1, %0|%0, %1, 
%2, %3}"
   [(set_attr "type" "sse")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vrangepd-1.c 
b/gcc/testsuite/gcc.target/i386/avx512dq-vrangepd-1.c
index 034c233..7e5a9cb 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-vrangepd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-vrangepd-1.c
@@ -1,15 +1,15 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512dq -mavx512vl -O2" } */
-/* { dg-final { scan-as

[hsa] Fix segfault in fixup_child_record_type

2015-10-08 Thread Martin Jambor
Hi,

I have committed the following patch to the hsa branch, to put the
bail-out test in fixup_child_record_type all the way to the top where
it should be, otherwise we get segfaults when gridifying kernels with
variable-sized variables.

Martin


2015-10-08  Martin Jambor  

* omp-low.c (fixup_child_record_type): Bail out early if
ctx->receiver_decl is NULL.

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 65cc5c3..a21d301 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -1613,6 +1613,8 @@ fixup_child_record_type (omp_context *ctx)
 {
   tree f, type = ctx->record_type;
 
+  if (!ctx->receiver_decl)
+return;
   /* ??? It isn't sufficient to just call remap_type here, because
  variably_modified_type_p doesn't work the way we expect for
  record types.  Testing each field for whether it needs remapping
@@ -1652,9 +1654,8 @@ fixup_child_record_type (omp_context *ctx)
   layout_type (type);
 }
 
-  if (ctx->receiver_decl)
-TREE_TYPE (ctx->receiver_decl)
-  = build_qualified_type (build_reference_type (type), TYPE_QUAL_RESTRICT);
+  TREE_TYPE (ctx->receiver_decl)
+= build_qualified_type (build_reference_type (type), TYPE_QUAL_RESTRICT);
 }
 
 /* Instantiate decls as necessary in CTX to satisfy the data sharing


Re: Move some bit and binary optimizations in simplify and match

2015-10-08 Thread Bernd Schmidt

On 10/07/2015 11:54 AM, Hurugalawadi, Naveen wrote:

Move Fold X & (X ^ Y) as X & ~Y to match.pd.
Move Fold X & (Y ^ X) as ~Y & X to match.pd.


I wonder if we shouldn't try to autogenerate patterns such as these. I 
did something like that for a different project a long time ago. 
Generate expressions up to a certain level of complexity, identify which 
ones are equivalent, and pick the one with the lowest cost for 
simplifications...



Bernd



[PATCH] bb-reorder: Improve the simple algorithm for -Os (PR67864)

2015-10-08 Thread Segher Boessenkool
As the PR points out, the "simple" reorder algorithm makes bigger code
than the STC algorithm did, for -Os, for x86.  I now tested it for many
different targets and it turns out to be worse everywhere.

This simple patch tunes "simple" a bit; this makes it better than STC
almost everywhere.  The only exceptions (for the targets where I have
results) are x86 and mn10300.  For those targets it may be best to switch
the default algorithm for -Os to STC.

The raw results.  This is text size for vmlinux for 31 targets; as you
can see many did not build, but at least all primary targets did.
"none" is no basic block reordering; "trunk" is current trunk; "stc" is
with the STC algorithm; "nodyn" is with simple, but considering all edges
equally likely; "swaps" is that, but prefering the more likely edge from
conditional branches; and "falls" prefers the edge from conditional
branches that is already the fallthrough edge.  This patch is "falls".

   nonetrunk stc nodynswapsfallsbest
 3728663  3721479  3700831  3706407  3717971  3690367  falls  arm
 2097684  2095560  2089484  2094024  2094212  2086720  falls  blackfin
 2096118  2107356  2081894  2092276  2103732  2077162  falls  cris
 3204044  3200972  3187196  3191932  3198156  3177980  falls  frv
 9254222  9340258  9208805  9265886  9331362  9247119   stc   i386
 3353034  3355482  3334726  3334466  3349710  3314970  falls  m32r
 4545720  4549824  4514256  4541832  4544540  4498416  falls  microblaze
 4276743  4266363  4246255  4259923  4261367  4227723  falls  mips
 5779199  5770567  5741663  5757663  5764475  5721803  falls  mips64
 2059078  2086000  2051005  2064365  2083923  2055705   stc   mn10300
 3311925  3320113  3293873  3305949  3317865  3284081  falls  nios2
 6738701  6747077  6710253  6742917  6740965  6696757  falls  parisc64
 8312408  8312744  8261016  8294480  8306488  8237188  falls  powerpc
17782722 17788382 17722326 17749526 17780642 17683810  falls  powerpc64
11016289 11029481 10970081 10980065 11024617 10942409  falls  s390
 1406832  1417224  1400344  1409392  1416172  1399428  falls  shnommu
 377  3776223  3751623  3768459  3771455  3732967  falls  sparc
 6113427  6113527  6074875  6091167  6106015  6049571  falls  sparc64
10449681 10529883 10398908 10458240 10522149 10440814   stc   x86_64
 1905733  1905733  1905733  1905733  1905733  1905733  -  xtensa


Is this okay for trunk?


Segher


2015-10-08  Segher Boessenkool  

PR rtl-optimization/67864
* bb-reorder.c (reorder_basic_blocks_simple): Prefer existing
fallthrough edges for conditional jumps.  Don't sort candidate
edges if not optimizing for speed.

---
 gcc/bb-reorder.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/gcc/bb-reorder.c b/gcc/bb-reorder.c
index cb001e8..3b7098e 100644
--- a/gcc/bb-reorder.c
+++ b/gcc/bb-reorder.c
@@ -2318,16 +2318,24 @@ reorder_basic_blocks_simple (void)
 
   if (any_condjump_p (end))
{
- edges[n++] = EDGE_SUCC (bb, 0);
- edges[n++] = EDGE_SUCC (bb, 1);
+ edge e0 = EDGE_SUCC (bb, 0);
+ edge e1 = EDGE_SUCC (bb, 1);
+ /* When optimizing for size it is best to keep the original
+fallthrough edges.  */
+ if (e1->flags & EDGE_FALLTHRU)
+   std::swap (e0, e1);
+ edges[n++] = e0;
+ edges[n++] = e1;
}
   else if (single_succ_p (bb))
edges[n++] = EDGE_SUCC (bb, 0);
 }
 
-  /* Sort the edges, the most desirable first.  */
+  /* Sort the edges, the most desirable first.  When optimizing for size
+ all edges are equally desirable.  */
 
-  std::stable_sort (edges, edges + n, edge_order);
+  if (optimize_function_for_speed_p (cfun))
+std::stable_sort (edges, edges + n, edge_order);
 
   /* Now decide which of those edges to make fallthrough edges.  We set
  BB_VISITED if a block already has a fallthrough successor assigned
-- 
1.8.1.4



Re: Add -foffload-abi support for PPC

2015-10-08 Thread Thomas Schwinge
Hi!

On Thu, 8 Oct 2015 11:19:05 -0500, James Norris  
wrote:
> On 10/07/2015 08:51 AM, David Edelsohn wrote:
> > On Wed, Oct 7, 2015 at 4:02 AM, Thomas Schwinge  
> > wrote:
> >
> >>  From a quick look at the *_TYPE_SIZE definitions in
> >> gcc/config/rs6000/rs6000.h as well as
> >> , "3-1
> >> Fundamental Types", and
> >> ,
> >> I gather we're dealing with regular ilp32/lp64 here.  Then, I assume the
> >> right thing is to use the 64BIT flag from gcc/config/rs6000/sysv4.opt
> >> (which, per gcc/config.gcc I suppose is used for the relevant
> >> powerpc64le-linux-gnu configuration).  (David?)
> >
> > TARGET_64BIT is the appropriate macro to test.
> >
> >>
> >> I'm not sure where to place the TARGET_OFFLOAD_OPTIONS #define and the
> >> function definition in rs6000.c.  (David?)
> >
> > As mentioned earlier, only PPC64LE is supported.
> >
> > I'm not sure if it really matters if this is defined in ELF-specific
> > portion of the file or a general place, although it never will be
> > called by other configurations.
> >
> > Thanks, David
> >
> 
> I've revised the patch from the review comments (thank you) and
> is attached.
> 
> Regtested on x86_64 and powerpc64le.
> 
> OK for trunk?

> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c

> +/* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
> +static char *
> +rs6000_offload_options (void)
> +{
> +  return xstrdup ("-foffload-abi=lp64");
> +}

Well, that's a stripped-down variant of what I had suggested:

static char *
rs6000_offload_options (void)
{
  if (TARGET_64BIT)
return xstrdup ("-foffload-abi=lp64");
  else
return xstrdup ("-foffload-abi=ilp32");
}

If you return -foffload-abi=lp64 unconditionally, strange things will
happen for -m32 compilation (ABI mismatch).


Grüße,
 Thomas


signature.asc
Description: PGP signature


Re: Move sqrt and cbrt simplifications to match.pd

2015-10-08 Thread Richard Sandiford
Marc Glisse  writes:
> On Mon, 5 Oct 2015, Richard Sandiford wrote:
>
>> +  /* cbrt(sqrt(x)) -> pow(x,1/6).  */
>> +  (simplify
>> +   (sqrts (cbrts @0))
>> +   (pows @0 { build_real_truncate (type, dconst<1, 6> ()); }))
>> +  /* sqrt(cbrt(x)) -> pow(x,1/6).  */
>> +  (simplify
>> +   (cbrts (sqrts @0))
>> +   (pows @0 { build_real_truncate (type, dconst<1, 6> ()); }))
>
> I think you swapped the comments (not that it matters).

Thanks, fixed in the committed version.

Richard



Re: [RFA 1/2]: Don't ignore target_header_dir when deciding inhibit_libc

2015-10-08 Thread Ulrich Weigand
Hans-Peter Nilsson wrote:

> Let me ask you right back: after an installation, should
> installation of a newer gcc *not* automatically pick up the
> header files installed (copied to sys-include) by the previous
> installation when using the same prefix, *without* any
> --with-headers specified in the new configury?

I'm not using sys-include, so I don't really have a strong
opinion on this setup.  However, I found this in the docs:

  @item --with-headers
  @itemx --with-headers=@var{dir}
  Deprecated in favor of @option{--with-sysroot}.
  Specifies that target headers are available when building a cross compiler.
  The @var{dir} argument specifies a directory which has the target include
  files.  These include files will be copied into the @file{gcc} install
  directory.  @emph{This option with the @var{dir} argument is required} when
  building a cross compiler, if @file{@var{prefix}/@var{target}/sys-include}
  doesn't pre-exist.  If @file{@var{prefix}/@var{target}/sys-include} does
  pre-exist, the @var{dir} argument may be omitted.  @command{fixincludes}
  will be run on these files to make them compatible with GCC@.

  @item --without-headers
  Tells GCC not use any target headers from a libc when building a cross
  compiler.  When crossing to GNU/Linux, you need the headers so GCC
  can build the exception handling for libgcc.

This seems to imply to me that --with-headers without any  argument
is supposed to use the pre-existing sys-include directory.

The docs are somewhat silent on what exactly the complete absence of
both --with-headers and --without-headers means.

One potential interpretation might be:

 --with-headers=   Copy headers from  to sys-include
 --with-headers Use existing sys-include directory
   Use headers from prefix include directory
 --without-headers  Do not use any headers

which would at least make it clear that if you want sys-include,
you need to specify --with-headers ...

Another potential interpretation might be:

 --with-headers=   Copy headers from  to sys-include
 --with-headers Use existing sys-include directory
   Same as --with-headers
 --without-headers  Do not use any headers

which simplifies the option space, and makes --with/out-headers
match the behavior of other --with/out- options.  It would basically
require use of sys-include for cross-compilers (which the docs could
be read to imply anyway, already).

> > So I wondering: what is your current setup?  What headers do you have
> > in sys-include, and how did they get there?
> 
> In the setup we're talking about, they're all in sys-include,
> copied "manually" before anything else (IIUC just like what
> would happen if I had the headers elsewhere and specified a
> --with-headers=).

OK, I see.

> >  I'm aware of the copying
> > done when using --with-headers=, but this case should still work,
> > since $target_header_dir is set directly to  in this case anyway.
> 
> Eh... I'm easily confused.  Let me recap my understanding: now,
> with --with-headers=, we copy from  to sys-include
> and still look in (where "look in" means target_header_dir is
> set to and gcc configury inspects)  at configury-time and
> the built gcc will look in include *and* sys-include.  Without
> --with-headers (at all), configury looks in sys-include.  With
> --with-headers (=yes) things are (currently as well as before,
> just not as exposed) broken; we try and look in "yes".
> 
> The recentish (it's only been a year :) exposure being that
> inhibit_libc is *also* activated, as configury sanity-check
> can't find a basic header file.  That sanity-check itself *is*
> sane; gcc header inspection should be able to find whatever
> headers are specified or the default; it's just that the value
> is broken.  I think it's wrong to be ok that the current header
> tests don't matter for your target.

Agreed with everything here so far.
 
> So, ISTM we should change --with-headers (=yes) to either look
> in sys-include or in include.  Setting it to sys-include
> wouldn't help you or anyone else as it's already the default...

On the other hand, the current docs appear to imply that the
intent was for --with-headers (=yes) to look into a pre-existing
sys-include directory for headers.

> > Is there some *other* case, where you do not use --with-headers=,
> > but still have a pre-existing sys-include directory in the prefix?
> 
> (Eh, other than pre-existing?)  Headers are pre-installed in
> sys-include.  No --with-headers options.  GCC looks in
> sys-include by default, both at configury-time and when built.
> Me happy.
> 
> To wit, I think having with_headers=yes (i.e. not a path) have
> the effect of setting target_header_dir to include instead of
> sys-include would be the least evil, least unexpected change,
> that would work for most, including you.
> 
> I wouldn't understand to instead change things around and make
> "include" be inspected by default.  It's only the --with-headers
> optio

Re: Add -foffload-abi support for PPC

2015-10-08 Thread David Edelsohn
On Thu, Oct 8, 2015 at 12:19 PM, James Norris  wrote:

> I've revised the patch from the review comments (thank you) and
> is attached.
>
> Regtested on x86_64 and powerpcle64.
>
> OK for trunk?

What is the goal? Do you want this to return the correct value or only
the value for the supported 64 bit PPC64LE system?

Thanks, David


[HSA] Introduce support for shared libraries and host fallback

2015-10-08 Thread Martin Liška
Hello.

Following patch set introduces HSA support for BRIG shared libraries. Apart 
from that,
it adds a new warning option (-Whsa) that pops up when HSA code generation cannot 
expand
a function. Moreover, the remaining patches are follow-ups of previous big 
changes.

Thanks,
Martin
>From 89d0a81c84cbbc18af05a6c144ec5f84fbd55a36 Mon Sep 17 00:00:00 2001
From: marxin 
Date: Fri, 2 Oct 2015 10:44:00 +0200
Subject: [PATCH 1/9] HSA: introduce warnings and libgomp host fallback
 support.

gcc/ChangeLog:

2015-10-02  Martin Liska  

	* common.opt: Add new Whsa option.
	* hsa-gen.c (hsa_function_representation::hsa_function_representation):
	Add new member seen_error.
	(hsa_type_for_scalar_tree_type): Use HSA_SORRY_AT{V} instead of
	sorry.
	(hsa_type_for_tree_type): Likewise.
	(hsa_op_immed::hsa_op_immed): Likewise.
	(process_mem_base): Likewise.
	(gen_hsa_addr): Likewise.
	(gen_hsa_insns_for_load): Likewise.
	(gen_hsa_insns_for_store): Likewise.
	(gen_hsa_ctor_assignment): Likewise.
	(gen_hsa_insns_for_single_assignment): Likewise.
	(gen_hsa_cmp_insn_from_gimple): Likewise.
	(gen_hsa_insns_for_operation_assignment): Likewise.
	(verify_function_arguments): Likewise.
	(gen_hsa_insns_for_direct_call): Likewise.
	(gen_hsa_insns_for_return): Likewise.
	(get_address_from_value): Likewise.
	(gen_hsa_insns_for_call): Likewise.
	(gen_hsa_insns_for_gimple_stmt): Likewise.
	(gen_hsa_phi_from_gimple_phi): Likewise.
	(gen_body_from_gimple): Use aforementioned hsa_cfun->seen_error.
	(generate_hsa): Likewise.
	* hsa.c (hsa_deinit_compilation_unit_data): Release
	hsa_failed_functions.
	(hsa_seen_error): New function.
	* hsa.h (hsa_failed_functions): New variable.

libgomp/ChangeLog:

2015-10-02  Martin Liska  

	* libgomp.h (struct gomp_device_descr): Add new function
	pointer (can_run_func).
	* plugin/plugin-hsa.c (init_enviroment_variables): Rename this
	function from init_debug.
	(hsa_warn): New function.
	(struct kernel_info): Add new member variable
	initialization_failed.
	(struct agent_info): Add new member variable
	prog_finalized_error.
	(get_kernel_in_module): Do not call GOMP_PLUGIN_fatal.
	(init_hsa_context): Use HSA_DEBUG macro.
	(GOMP_OFFLOAD_init_device): Likewise.
	(destroy_hsa_program): Likewise.
	(GOMP_OFFLOAD_load_image): Produce warnings instead
	of failures.
	(GOMP_OFFLOAD_unload_image): Do not check if the agent
	is initialized, the check is in the called function.
	(GOMP_OFFLOAD_fini_device): Likewise.
	(create_and_finalize_hsa_program): Likewise.
	(release_kernel_dispatch): Use HSA_DEBUG macro.
	(init_single_kernel): Produce warnings instead of failures.
	(init_kernel): Use HSA_DEBUG macro.
	(parse_launch_attributes): Likewise.
	(GOMP_OFFLOAD_can_run): New function.
	(GOMP_OFFLOAD_run): Remove part of initialization that
	is moved to GOMP_OFFLOAD_can_run.
	(GOMP_OFFLOAD_fini_device): Fix coding style.
	* target.c (run_on_host): New function.
	(GOMP_target): Use the function.
	(gomp_load_plugin_for_device): Dynamically load the new hook.
---
 gcc/common.opt  |   4 +
 gcc/hsa-gen.c   | 238 
 gcc/hsa.c   |  14 ++-
 gcc/hsa.h   |   4 +
 libgomp/libgomp.h   |   1 +
 libgomp/plugin/plugin-hsa.c | 230 --
 libgomp/target.c|  42 +---
 7 files changed, 359 insertions(+), 174 deletions(-)

diff --git a/gcc/common.opt b/gcc/common.opt
index 7b0ec96..5ef6f46 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -581,6 +581,10 @@ Wfree-nonheap-object
 Common Var(warn_free_nonheap_object) Init(1) Warning
 Warn when attempting to free a non-heap object
 
+Whsa
+Common Var(warn_hsa) Init(1) Warning
+Warn when a function cannot be expanded to HSAIL
+
 Winline
 Common Var(warn_inline) Warning
 Warn when an inlined function cannot be inlined
diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c
index 291d650..fb17a25 100644
--- a/gcc/hsa-gen.c
+++ b/gcc/hsa-gen.c
@@ -77,6 +77,32 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "cfganal.h"
 
+/* Print a warning message and set that we have seen an error.  */
+
+#define HSA_SORRY_MSG "could not emit HSAIL for the function"
+
+#define HSA_SORRY_ATV(location, message, ...) \
+  do \
+  { \
+hsa_cfun->seen_error = true; \
+if (warning_at (EXPR_LOCATION (hsa_cfun->decl), OPT_Whsa, \
+		HSA_SORRY_MSG)) \
+  inform (location, message, ##__VA_ARGS__); \
+  } \
+  while (false);
+
+/* Same as previous, but highlight a location.  */
+
+#define HSA_SORRY_AT(location, message) \
+  do \
+  { \
+hsa_cfun->seen_error = true; \
+if (warning_at (EXPR_LOCATION (hsa_cfun->decl), OPT_Whsa, \
+		HSA_SORRY_MSG)) \
+  inform (location, message); \
+  } \
+  while (false);
+
 /* Following structures are defined in the final version
of HSA specification.  */
 
@@ -196,6 +222,7 @@ hsa_function_representation::hsa_function_representation ()
   shadow_reg = NULL;
   kernel_dispat

Re: [PR c/64765, c/64880] Support OpenACC Combined Directives in C, C++

2015-10-08 Thread Joseph Myers
The C front-end changes are OK, but please follow up with fixes for any 
issues Jakub identifies.

-- 
Joseph S. Myers
jos...@codesourcery.com


[PR c/64765, c/64880] Support OpenACC Combined Directives in C, C++

2015-10-08 Thread Thomas Schwinge
Hi!

Some bits extracted out of gomp-4_0-branch, and some other bits
rewritten; here is a patch to support OpenACC Combined Directives in C,
C++.  (The Fortran front end already does support these.)

As far as I know, Jakub is not available at this time, so maybe the C
(Joseph) and C++ (Jason, Nathan) front end maintainers could please
review this, instead of him?  (The front end changes as well as the few
other cleanup changes should all be straight forward.)  OK for trunk once
bootstrap tested?

commit 9626356d641129381306f2ad5d884d5b7f7a5fc7
Author: Thomas Schwinge 
Date:   Thu Oct 8 15:59:54 2015 +0200

[PR c/64765, c/64880] Support OpenACC Combined Directives in C, C++

gcc/c-family/
PR c/64765
PR c/64880
* c-common.h (c_oacc_split_loop_clauses): Declare function.
* c-omp.c (c_oacc_split_loop_clauses): New function.
gcc/c/
PR c/64765
PR c/64880
* c-parser.c (c_parser_oacc_loop): Add mask, cclauses formal
parameters, and handle these.  Adjust all users.
(c_parser_oacc_kernels, c_parser_oacc_parallel): Merge functions
into...
(c_parser_oacc_kernels_parallel): ... this new function.  Adjust
all users.
* c-tree.h (c_finish_oacc_parallel, c_finish_oacc_kernels): Don't
declare functions.
(c_finish_omp_construct): Declare function.
* c-typeck.c (c_finish_oacc_parallel, c_finish_oacc_kernels):
Merge functions into...
(c_finish_omp_construct): ... this new function.
* gcc/cp/
PR c/64765
PR c/64880
* cp-tree.h (finish_oacc_kernels, finish_oacc_parallel): Don't
declare functions.
(finish_omp_construct): Declare function.
* parser.c (cp_parser_oacc_loop): Add p_name, mask, cclauses
formal parameters, and handle these.  Adjust all users.
(cp_parser_oacc_kernels, cp_parser_oacc_parallel): Merge functions
into...
(cp_parser_oacc_kernels_parallel): ... this new function.  Adjust
all users.
* semantics.c (finish_oacc_kernels, finish_oacc_parallel): Merge 
functions into...
(finish_omp_construct): ... this new function.
gcc/
* tree.h (OACC_PARALLEL_BODY, OACC_PARALLEL_CLAUSES)
(OACC_KERNELS_BODY, OACC_KERNELS_CLAUSES, OACC_KERNELS_COMBINED)
(OACC_PARALLEL_COMBINED): Don't define macros.  Adjust all users.
gcc/testsuite/
PR c/64765
PR c/64880
* c-c++-common/goacc/loop-1.c: Don't skip for C++.  Don't prune
sorry message.
(PR64765): New function.
* gfortran.dg/goacc/coarray_2.f90: XFAIL.
* gfortran.dg/goacc/combined_loop.f90: Extend.  Don't prune
sorry message.
* gfortran.dg/goacc/cray.f95: Refine prune directive.
* gfortran.dg/goacc/parameter.f95: Likewise.
libgomp/
* testsuite/libgomp.oacc-c-c++-common/combdir-1.c: New file.
* testsuite/libgomp.oacc-fortran/combdir-1.f90: Likewise.
---
 gcc/c-family/c-common.h|   1 +
 gcc/c-family/c-omp.c   |  39 +-
 gcc/c/c-parser.c   | 148 +---
 gcc/c/c-tree.h |   3 +-
 gcc/c/c-typeck.c   |  36 ++---
 gcc/cp/cp-tree.h   |   3 +-
 gcc/cp/parser.c| 149 -
 gcc/cp/semantics.c |  54 +++-
 gcc/fortran/trans-openmp.c |   4 -
 gcc/gimplify.c |  16 +--
 gcc/testsuite/c-c++-common/goacc/loop-1.c  |  10 +-
 gcc/testsuite/gfortran.dg/goacc/coarray_2.f90  |   1 +
 gcc/testsuite/gfortran.dg/goacc/combined_loop.f90  |   9 +-
 gcc/testsuite/gfortran.dg/goacc/cray.f95   |   2 +-
 gcc/testsuite/gfortran.dg/goacc/parameter.f95  |   2 +-
 gcc/tree-pretty-print.c|  11 +-
 gcc/tree.def   |   8 +-
 gcc/tree.h |  20 ---
 .../libgomp.oacc-c-c++-common/combdir-1.c  |  52 +++
 .../testsuite/libgomp.oacc-fortran/combdir-1.f90   |  37 +
 20 files changed, 336 insertions(+), 269 deletions(-)

diff --git gcc/c-family/c-common.h gcc/c-family/c-common.h
index d5fb499..94c68b9 100644
--- gcc/c-family/c-common.h
+++ gcc/c-family/c-common.h
@@ -1268,6 +1268,7 @@ extern void c_finish_omp_taskyield (location_t);
 extern tree c_finish_omp_for (location_t, enum tree_code, tree, tree, tree,
  tree, tree, tree);
 extern tree c_finish_oacc_wait (location_t, tree, tree);
+extern tree c_oacc_split_loop_clauses (tree, tree *);
 extern void c_omp_split_clauses (location_t, enum tree_code, omp_clause_mask,
 tree, tree *);
 extern tree c_omp_declare_simd_clauses_t

Re: [PATCH] New attribute to create target clones

2015-10-08 Thread Jeff Law

On 09/22/2015 03:09 PM, Bernd Schmidt wrote:

On 09/22/2015 09:41 PM, Jeff Law wrote:

Essentially it allows us to more easily support
per-microarchitecture-optimized versions of functions.   You just
have to list the microarchitectures and the compiler handles the rest.
Very simple, very easy.  I'd think it'd be particularly helpful for
vectorization.

You could emulate this with compiling the same source multiple times
with different flags/defines and wire up on ifunc by hand.  But Evgeny's
approach is vastly simpler.


As far as I can tell the ifunc is generated automatically (and the
functionality is documented as such), so the new target_clone doesn't
buy much. But one thing I didn't know was that the existing support is only
available in C++, while Evgeny's patch works for C. That is probably an
argument that could be made for its inclusion.
In multi-versioning, you have a distinct source implementation for each 
function.  ie


__attribute__ ((target ("default")))
int foo ()
{
  // The default version of foo.
  return 0;
}

__attribute__ ((target ("sse4.2")))
int foo ()
{
  // foo version for SSE4.2
  return 1;
}

And so-on for each processor version you want to support.

In Evgeny's patch we'd have a single source implementation of foo which 
gets compiled multiple times, once for each target specified by the 
attribute.


I wasn't aware that multi-versioning was only implemented for C++, that 
seems fairly lame.  I hope I didn't approve that :-)


Jeff



[DOC PATCH]: Mention that x86-64 now supports stack realignment from word-aligned stack pointer

2015-10-08 Thread Uros Bizjak
Hello!

Attached patch documents new functionality in gcc-6 release notes.

OK for GCC docs?

Uros.
Index: htdocs/gcc-6/changes.html
===
RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-6/changes.html,v
retrieving revision 1.34
diff -r1.34 changes.html
200a201,207
>  
>x86-64 targets now allow stack realignment from word-aligned stack
>pointer using the command-line option -fstackrealign or
>__attribute__ ((force_align_arg_pointer)).  This allows
>functions compiled with a vector-aligned stack to be invoked from
>objects that keep only word-alignment.
>  


Re: [PATCH] Cleanup of IPA-CP alignment lattices

2015-10-08 Thread Martin Jambor
Hi,

On Thu, Oct 08, 2015 at 12:08:15AM +0200, Jan Hubicka wrote:
> > > This patch broke Solaris bootstrap in stage 1 with g++ 4.9:
> > > 
> > > /vol/gcc/src/hg/trunk/solaris/gcc/ipa-cp.c: In member function 'bool 
> > > ipcp_alignment_lattice::meet_with_1(unsigned int, unsigned int)':
> > > /vol/gcc/src/hg/trunk/solaris/gcc/ipa-cp.c:855:56: error: call of 
> > > overloaded 'abs(unsigned int)' is ambiguous
> > >int diff = abs (misalign - (new_misalign % align));
> > 
> > Calling abs on an unsigned type does not sound right anyway (that
> > almost sounds like a good candidate for a warning).  I suppose the
> 
> Yep, I guess that may be a good idea ;)
> 
> > correct fix is to cast both subtraction operands to signed int, would
> > such a change be pre-approved?
> 
>   if (misalign != (new_misalign % align))
> {
>   int diff = abs (misalign - (new_misalign % align));
>   align = MIN (align, (unsigned) diff & -diff);
>   if (align)
> misalign = misalign % align;
>   else
> set_to_bottom ();
>   changed = true;
> }
> 
> So the logic here is that you compute differnce of missaligns and want to set 
> align to be at least that.  Why
>   align = MIN (align, (unsigned) diff & -diff);
> I suppose ALIGN should be always greater than that, because diff is at most 
> ALIGN.
> So I suppose you want 
>   align = (unsigned) diff & -diff
> 
> The changes are pre-approved.
> 


Thanks, I have committed the following after bootstrapping and testing
on x86_64-linux.

Martin


2015-10-08  Martin Jambor  

* ipa-cp.c (meet_with_1): Make the argument of abs signed.  Remove
unnecessary MIN.

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 0d9fdee..d9d81f1 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -852,8 +852,8 @@ ipcp_alignment_lattice::meet_with_1 (unsigned new_align, 
unsigned new_misalign)
 }
   if (misalign != (new_misalign % align))
 {
-  int diff = abs (misalign - (new_misalign % align));
-  align = MIN (align, (unsigned) diff & -diff);
+  int diff = abs ((int) misalign - (int) (new_misalign % align));
+  align = (unsigned) diff & -diff;
   if (align)
misalign = misalign % align;
   else



Re: [patch 4/3] Header file reduction - Tools for contrib

2015-10-08 Thread David Malcolm
On Tue, 2015-10-06 at 14:02 +0200, Bernd Schmidt wrote:
[...]
> > No commenting on the quality of python code... :-) I was
> > learning python on the fly. I'm sure some things are QUITE awful.

[...]

> > + def ii_base (iinfo):
> > +   return iinfo[0]
> > +
> > + def ii_path (iinfo):
> > +   return iinfo[1]
> > +
> > + def ii_include_list (iinfo):
> > +   return iinfo[2]
> > +
> > + def ii_include_list_cond (iinfo):
> > +   return iinfo[3]
> > +
> > + def ii_include_list_non_cond (iinfo):
> > +   l = ii_include_list (iinfo)
> > +   for n in ii_include_list_cond (iinfo):
> > + l.remove (n)
> > +   return l
> > +
> > + def ii_macro_consume (iinfo):
> > +   return iinfo[4]
> > +
> > + def ii_macro_define (iinfo):
> > +   return iinfo[5]
> > +
> > + def ii_src (iinfo):
> > +   return iinfo[6]
> > +
> > + def ii_src_line (iinfo):
> > +   return iinfo[7]
> 
> That's a lot of little functions with pretty much no clue for the reader 
> what's going on. It looks like maybe there's an array where a struct 
> should have been used?

FWIW, this kind of thing is often made a lot neater and easier to debug
by using "namedtuple" from within the "collections" module in the
standard library:

https://docs.python.org/2/library/collections.html#collections.namedtuple

which lets you refer e.g. to field 5 of the tuple as a "define"
attribute.

  iinfo.define

and avoid all these accessor functions (and you can add methods and
properties, giving e.g. a "list_non_cond").

Not that I'm asking you to rewrite it; merely that namedtuple is one of
many gems in the python stdlib that are worth knowing about.

[...]

Hope this is constructive
Dave




Re: Do not use TYPE_CANONICAL in useless_type_conversion

2015-10-08 Thread Jan Hubicka
> 
> Heh, we need a calling convention checker in 
> gimple_builtin_call_types_compatible_p and friends!

Yep, i will try to play with this.  So far I did not managed to create
a testcase that would expose a wrong code in ICF. 
> 
> Btw, with the TYPE_CANONICAL changes I wonder whether we can make
> gimple_get_alias_set (the LTO get_alias_set langhook) just glob
> to TYPE_CANONICAL?

At the moment we only glob signed and unsigned for char and size_t,
not for other types.
For me the TYPE_CANONICAL globing is for the cases that propagate up
(i.e. structures containing the different types are compatible) and
gimple_get_alias_set for cases that doesn't (like the signed/unsigned
C thing)

Honza
> 
> Richard.
> 
> > Honza
> > > 
> > > Thanks,
> > > Richard.
> > > 
> > > > +return useless_type_conversion_p (TREE_TYPE (outer_type),
> > > > + TREE_TYPE (inner_type))
> > > > +  && useless_type_conversion_p
> > > > +   (TYPE_OFFSET_BASETYPE (outer_type),
> > > > +TYPE_OFFSET_BASETYPE (inner_type));
> > > >  
> > > >return false;
> > > >  }
> > 
> > 
> 
> -- 
> Richard Biener 
> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 
> 21284 (AG Nuernberg)


Re: Add -foffload-abi support for PPC

2015-10-08 Thread James Norris

Hi!

On 10/07/2015 08:51 AM, David Edelsohn wrote:

On Wed, Oct 7, 2015 at 4:02 AM, Thomas Schwinge  wrote:


 From a quick look at the *_TYPE_SIZE definitions in
gcc/config/rs6000/rs6000.h as well as
, "3-1
Fundamental Types", and
,
I gather we're dealing with regular ilp32/lp64 here.  Then, I assume the
right thing is to use the 64BIT flag from gcc/config/rs6000/sysv4.opt
(which, per gcc/config.gcc I suppose is used for the relevant
powerpc64le-linux-gnu configuration).  (David?)


TARGET_64BIT is the appropriate macro to test.



I'm not sure where to place the TARGET_OFFLOAD_OPTIONS #define and the
function definition in rs6000.c.  (David?)


As mentioned earlier, only PPC64LE is supported.

I'm not sure if it really matters if this is defined in ELF-specific
portion of the file or a general place, although it never will be
called by other configurations.

Thanks, David



I've revised the patch from the review comments (thank you) and
is attached.

Regtested on x86_64 and powerpc64le.

OK for trunk?

Thanks!
Jim

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index e095f03..e775e9a 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1690,6 +1690,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #define TARGET_LIBGCC_SHIFT_COUNT_MODE rs6000_abi_word_mode
 #undef TARGET_UNWIND_WORD_MODE
 #define TARGET_UNWIND_WORD_MODE rs6000_abi_word_mode
+
+#undef TARGET_OFFLOAD_OPTIONS
+#define TARGET_OFFLOAD_OPTIONS rs6000_offload_options
 
 
 /* Processor table.  */
@@ -9530,6 +9533,13 @@ rs6000_abi_word_mode (void)
   return TARGET_32BIT ? SImode : DImode;
 }
 
+/* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
+static char *
+rs6000_offload_options (void)
+{
+  return xstrdup ("-foffload-abi=lp64");
+}
+
 /* On rs6000, function arguments are promoted, as are function return
values.  */
 


Re: Fix more of C/fortran canonical type issues

2015-10-08 Thread Jan Hubicka
> On Thu, 8 Oct 2015, Jan Hubicka wrote:
> 
> > Hello,
> > here is updated version of the patch, this time without need to modify
> > useless_type_conversion.  Just to recall the issue, Fortran C 
> > interoperability
> > requires size_t to interoperate with signed version produced by Fortran FE.
> > Unlike the existing logic in aliasing that makes signed and unsigned share
> > alias sets this propagate up to the structures.  I.e. structure containing 
> > size_t
> > is interoperable with structure containing ptrdiff_t.
> 
> Hmm, note that size_t and ptrdiff_t do not have to have the same
> precision.  C11 just says size_t is the result of sizeof () and
> ptrdiff_t is the result of subtracting two pointers.  So instead
> of using BITS_PER_UNIT and POINTER_SIZE please look at the
> global C ABI types (size_type_node for size_t interoperability
> and char_type_node for char interoperability).

OK, I will compare with TYPE_PRECISION of these.

> 
> Please skim over the testcases to see whether you falsely test
> interoperability of ptrdiff_t (I see at least intptr_t which
> would need an extra handling of TYPE_PRECISION equal to that
> of ptrdiff_type_node).

No, I only test size_t. I used ptrdiff_t only in the explanation letter ;)
The other types are just listing all integer types explicitly claimed
by the standard to be inter-operable.
> 
> As you duplicate the code in two places it would be nice to
> split this out into a function maybe?  
> integer_precisions_with_interoperable_signedness () (uh... ;))

Uhm, yep, that is a good idea. I will prepare updated patch after
fixing that Ada issue.

Honza
> 
> Ok with that change(s).
> 
> Thanks,
> Richard.
> 
> > Bootstrapped/regtested ppc64le-linux, OK?
> > 
> > Honza
> > 
> > 
> > * tree.c (gimple_canonical_types_compatible_p): Do not compare
> > TYPE_UNSIGNED for size_t and char compatible types.
> > 
> > * lto.c (hash_canonical_type): Do not hash TYPE_UNSIGNED for size_t
> > and char compatible types.
> > 
> > * gfortran.dg/lto/bind_c-2_0.f90: New testcase.
> > * gfortran.dg/lto/bind_c-2_1.c: New testcase.
> > * gfortran.dg/lto/bind_c-3_0.f90: New testcase.
> > * gfortran.dg/lto/bind_c-3_1.c: New testcase.
> > * gfortran.dg/lto/bind_c-4_0.f90: New testcase.
> > * gfortran.dg/lto/bind_c-4_1.c: New testcase.
> > * gfortran.dg/lto/bind_c-5_0.f90: New testcase.
> > * gfortran.dg/lto/bind_c-5_1.c: New testcase.
> > Index: lto/lto.c
> > ===
> > --- lto/lto.c   (revision 228586)
> > +++ lto/lto.c   (working copy)
> > @@ -288,6 +288,7 @@
> >  hash_canonical_type (tree type)
> >  {
> >inchash::hash hstate;
> > +  enum tree_code code;
> >  
> >/* We compute alias sets only for types that needs them.
> >   Be sure we do not recurse to something else as we can not hash 
> > incomplete
> > @@ -299,7 +300,8 @@
> >   smaller sets; when searching for existing matching types to merge,
> >   only existing types having the same features as the new type will be
> >   checked.  */
> > -  hstate.add_int (tree_code_for_canonical_type_merging (TREE_CODE (type)));
> > +  code = tree_code_for_canonical_type_merging (TREE_CODE (type));
> > +  hstate.add_int (code);
> >hstate.add_int (TYPE_MODE (type));
> >  
> >/* Incorporate common features of numerical types.  */
> > @@ -309,8 +311,15 @@
> >|| TREE_CODE (type) == OFFSET_TYPE
> >|| POINTER_TYPE_P (type))
> >  {
> > -  hstate.add_int (TYPE_UNSIGNED (type));
> >hstate.add_int (TYPE_PRECISION (type));
> > +  /* Ignore sign for char and size_t.  This is needed for fortran
> > +C_SIGNED_CHAR to be interoperable with both signed char and
> > +unsigned char (as stadnard requires).  Similarly fortran FE builds
> > +C_SIZE_T is signed type, while C defines it unsigned.  */
> > +  if (code != INTEGER_TYPE
> > + || (TYPE_PRECISION (type) != BITS_PER_UNIT
> > + && TYPE_PRECISION (type) != POINTER_SIZE))
> > +hstate.add_int (TYPE_UNSIGNED (type));
> >  }
> >  
> >if (VECTOR_TYPE_P (type))
> > Index: testsuite/gfortran.dg/lto/bind_c-2_0.f90
> > ===
> > --- testsuite/gfortran.dg/lto/bind_c-2_0.f90(revision 0)
> > +++ testsuite/gfortran.dg/lto/bind_c-2_0.f90(working copy)
> > @@ -0,0 +1,21 @@
> > +! { dg-lto-do run }
> > +! { dg-lto-options {{ -O3 -flto }} }
> > +! This testcase will abort if C_PTR is not interoperable with both int *
> > +! and float *
> > +module lto_type_merge_test
> > +  use, intrinsic :: iso_c_binding
> > +  implicit none
> > +
> > +  type, bind(c) :: MYFTYPE_1
> > + integer(c_signed_char) :: chr
> > + integer(c_signed_char) :: chrb
> > +  end type MYFTYPE_1
> > +
> > +  type(myftype_1), bind(c, name="myVar") :: myVar
> > +
> > +contains
> > +  subroutine types_test() bind(c)
> > +myVar%chr = myVar%chrb
> > +

Small C++ PATCH to add SIMPLE_TARGET_EXPR_P macro

2015-10-08 Thread Jason Merrill
While working on 67557 I noticed that this pattern was checked in a few 
places and decided to make it a macro, even though I didn't end up using 
it in the fix for that bug.


Tested x86_64-pc-linux-gnu, applying to trunk.
commit 853122e015d3f20f80161bd820f616a34d3d5c16
Author: Jason Merrill 
Date:   Wed Oct 7 16:37:37 2015 -0400

	* cp-tree.h (SIMPLE_TARGET_EXPR_P): New.
	* init.c (get_nsdmi): Use it.
	* typeck2.c (massage_init_elt): Use it.

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 5acb065..f650c76 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -4490,6 +4490,12 @@ more_aggr_init_expr_args_p (const aggr_init_expr_arg_iterator *iter)
 #define TARGET_EXPR_DIRECT_INIT_P(NODE) \
   TREE_LANG_FLAG_2 (TARGET_EXPR_CHECK (NODE))
 
+/* True if NODE is a TARGET_EXPR that just expresses a copy of its INITIAL; if
+   the initializer has void type, it's doing something more complicated.  */
+#define SIMPLE_TARGET_EXPR_P(NODE)\
+  (TREE_CODE (NODE) == TARGET_EXPR\
+   && !VOID_TYPE_P (TREE_TYPE (TARGET_EXPR_INITIAL (NODE
+
 /* True if EXPR expresses direct-initialization of a TYPE.  */
 #define DIRECT_INIT_EXPR_P(TYPE,EXPR)	\
   (TREE_CODE (EXPR) == TARGET_EXPR && TREE_LANG_FLAG_2 (EXPR)		\
diff --git a/gcc/cp/init.c b/gcc/cp/init.c
index 1ed8f6c..57a6406 100644
--- a/gcc/cp/init.c
+++ b/gcc/cp/init.c
@@ -588,8 +588,7 @@ get_nsdmi (tree member, bool in_ctor)
 	}
   /* Strip redundant TARGET_EXPR so we don't need to remap it, and
 	 so the aggregate init code below will see a CONSTRUCTOR.  */
-  if (init && TREE_CODE (init) == TARGET_EXPR
-	  && !VOID_TYPE_P (TREE_TYPE (TARGET_EXPR_INITIAL (init
+  if (init && SIMPLE_TARGET_EXPR_P (init))
 	init = TARGET_EXPR_INITIAL (init);
   init = break_out_target_exprs (init);
 }
diff --git a/gcc/cp/typeck2.c b/gcc/cp/typeck2.c
index 1d106c7..2c9143e 100644
--- a/gcc/cp/typeck2.c
+++ b/gcc/cp/typeck2.c
@@ -1208,8 +1208,7 @@ massage_init_elt (tree type, tree init, tsubst_flags_t complain)
 {
   init = digest_init_r (type, init, true, LOOKUP_IMPLICIT, complain);
   /* Strip a simple TARGET_EXPR when we know this is an initializer.  */
-  if (TREE_CODE (init) == TARGET_EXPR
-  && !VOID_TYPE_P (TREE_TYPE (TARGET_EXPR_INITIAL (init
+  if (SIMPLE_TARGET_EXPR_P (init))
 init = TARGET_EXPR_INITIAL (init);
   /* When we defer constant folding within a statement, we may want to
  defer this folding as well.  */


Re: Use OEP_ADDRESS_OF in emit-rtl.c

2015-10-08 Thread Jan Hubicka
> On Wed, 7 Oct 2015, Jan Hubicka wrote:
> 
> > > 
> > > Did you audit all callers of mem_attrs_eq_p to see if they really
> > > only care about that?  After all MEM_EXPR, via access paths, encode
> > > type-based alias info and thus replacing one with the other (cse.c use
> > > or ifcvt.c use) is only valid if that doesn't break dependences.
> > 
> > Hmm, expr is used by ao_ref_from_mem and nonoverlapping_memrefs_p.
> > The alias set of the access is not taken from expr, but from alias set info
> > stored in the memory attribute itself (and it is checked by those to match)
> 
> But the alias-set is not everything and yes, via ao_ref_from_mem MEM_EXPR
> "leaks" to the tree oracle which happily calls 
> nonoverlapping_component_refs_of_decl_p or nonoverlapping_component_refs_p
> on it.
> 
> > I still think it is an address of the expression that matters, not the 
> > value.
> > I think operand_equal_p may, for example, consider two different VAR_DECL 
> > equivalent
> > if their constructors are, because the value is (it doesn't do that), but 
> > their
> > addresses differ.
> 
> It's structural equality of the MEM_EXPR that matters.  That neither
> operand_equal_p (..., 0) nor operand_equal_p (..., OEP_ADDRESS_OF) is
> an exact implementation for this (well, I think with '0' flags it was
> designed to be this, at least for memory references) is of course
> suspicious.  But that doesn't make using OEP_ADDRESS_OF the correct thing
> to do.

Hmm, I see.  I wonder how complex the expressions are and if we can't simply
compare AO properties of MEM_REF at toplevel and then dispatch to
operand_equal_p (..., OEP_ADDRESS_OF)
which would make more sense to me.

I would basically expect decls and mem_refs here.  Reason why I started to look
into that is that I added sanity check that operand_equal_p (...,0) is not 
called
on things that do not have value (function types and incomplete types) and this
is one of places that fires.

> 
> > I will look more into nonoverlapping_memrefs_p and ao_ref_from_mem. The 
> > first
> > one may need some update to tree-alias infrastructure
> 
> I'd rather remove it completely (at least that was my plan eventually).
> rtx_refs_may_alias_p is supposed to handle everything it handles.

Yep, that was my feeling from looking into that yesterday

Honza


Re: Do not use TYPE_CANONICAL in useless_type_conversion

2015-10-08 Thread Andreas Schwab
Eric Botcazou  writes:

>> Thank you! I commited the patch.
>
> It breaks the Ada build on x86-64 though:

Also on ia64:

/usr/local/gcc/test/Build/./prev-gcc/xgcc 
-B/usr/local/gcc/test/Build/./prev-gcc/ -B/usr/ia64-suse-linux/bin/ 
-B/usr/ia64-suse-linux/bin/ -B/usr/ia64-suse-linux/lib/ -isystem 
/usr/ia64-suse-linux/include -isystem /usr/ia64-suse-linux/sys-include-c -g 
-O2 -gtoggle  -gnatpg -gnata -W -Wall -nostdinc -I- -I. -Iada/generated -Iada 
-I../../gcc/ada -I../../gcc/ada/gcc-interface ../../gcc/ada/eval_fat.adb -o 
ada/eval_fat.o
+===GNAT BUG DETECTED==+
| 6.0.0 20151007 (experimental) (ia64-suse-linux) GCC error:   |
| in convert_move, at expr.c:282   |
| Error detected around ../../gcc/ada/eval_fat.adb:191:21  |

Andreas.

-- 
Andreas Schwab, SUSE Labs, sch...@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."


Re: Do not use TYPE_CANONICAL in useless_type_conversion

2015-10-08 Thread Jan Hubicka
> > Thank you! I commited the patch.
> 
> It breaks the Ada build on x86-64 though:

Hmm, I tested Ada on ppc64 only. I will look into that today.

Honza
> 
> eric@polaris:~/build/gcc/native/gcc/ada/rts> 
> /home/eric/build/gcc/native/./gcc/xgcc -B/home/eric/build/gcc/native/./gcc/ -
> B/home/eric/install/gcc/x86_64-suse-linux/bin/ -
> B/home/eric/install/gcc/x86_64-suse-linux/lib/ -isystem 
> /home/eric/install/gcc/x86_64-suse-linux/include -isystem 
> /home/eric/install/gcc/x86_64-suse-linux/sys-include-c -g -O2  -fpic  -W -
> Wall -gnatpg -nostdinc   s-regpat.adb -o s-regpat.o
> +===GNAT BUG DETECTED==+
> | 6.0.0 20151008 (experimental) [trunk revision 228597] (x86_64-suse-linux) 
> GCC error:|
> | in gen_lowpart_common, at emit-rtl.c:1399|
> | Error detected around s-regpat.adb:1029:22   |
> | Please submit a bug report; see http://gcc.gnu.org/bugs.html.|
> | Use a subject line meaningful to you and us to track the bug.|
> | Include the entire contents of this bug box in the report.   |
> | Include the exact command that you entered.  |
> | Also include sources listed below.   |
> +==+
> 
> -- 
> Eric Botcazou


[mask-vec_cond, patch 2/2, i386] Add patterns for vcond_mask_optab

2015-10-08 Thread Ilya Enkovich
Hi,

This patch add patterns for vcond_mask_optab.  No new expand code is required, 
existing ix86_expand_sse_movcc is used.

Thanks,
Ilya
--
gcc/ChangeLog:

2015-10-08  Ilya Enkovich  

* config/i386/i386-protos.h (ix86_expand_sse_movcc): New.
* config/i386/i386.c (ix86_expand_sse_movcc): Make public.
Cast mask to FP mode if required.
* config/i386/sse.md (vcond_mask_): New.
(vcond_mask_): New.
(vcond_mask_): New.
(vcond_mask_): New.
(vcond_mask_v2div2di): New.
(vcond_mask_): New.
(vcond_mask_): New.


diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e22aa57..6a0e437 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -132,6 +132,7 @@ extern bool ix86_expand_vec_perm_const (rtx[]);
 extern bool ix86_expand_mask_vec_cmp (rtx[]);
 extern bool ix86_expand_int_vec_cmp (rtx[]);
 extern bool ix86_expand_fp_vec_cmp (rtx[]);
+extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx);
 extern void ix86_expand_sse_unpack (rtx, rtx, bool, bool);
 extern bool ix86_expand_int_addcc (rtx[]);
 extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index a8e3538..0619b9a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21497,7 +21497,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx 
cmp_op0, rtx cmp_op1,
 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
operations.  This is used for both scalar and vector conditional moves.  */
 
-static void
+void
 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 {
   machine_mode mode = GET_MODE (dest);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 48424fc..1e5a455 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3015,6 +3015,87 @@
   DONE;
 })
 
+(define_expand "vcond_mask_"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand")
+   (vec_merge:V48_AVX512VL
+ (match_operand:V48_AVX512VL 1 "nonimmediate_operand")
+ (match_operand:V48_AVX512VL 2 "vector_move_operand")
+ (match_operand: 3 "register_operand")))]
+  "TARGET_AVX512F")
+
+(define_expand "vcond_mask_"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
+   (vec_merge:VI12_AVX512VL
+ (match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
+ (match_operand:VI12_AVX512VL 2 "vector_move_operand")
+ (match_operand: 3 "register_operand")))]
+  "TARGET_AVX512BW")
+
+(define_expand "vcond_mask_"
+  [(set (match_operand:VI_256 0 "register_operand")
+   (vec_merge:VI_256
+ (match_operand:VI_256 1 "nonimmediate_operand")
+ (match_operand:VI_256 2 "vector_move_operand")
+ (match_operand: 3 "register_operand")))]
+  "TARGET_AVX2"
+{
+  ix86_expand_sse_movcc (operands[0], operands[3],
+operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "vcond_mask_"
+  [(set (match_operand:VI124_128 0 "register_operand")
+   (vec_merge:VI124_128
+ (match_operand:VI124_128 1 "nonimmediate_operand")
+ (match_operand:VI124_128 2 "vector_move_operand")
+ (match_operand: 3 "register_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_sse_movcc (operands[0], operands[3],
+operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "vcond_mask_v2div2di"
+  [(set (match_operand:V2DI 0 "register_operand")
+   (vec_merge:V2DI
+ (match_operand:V2DI 1 "nonimmediate_operand")
+ (match_operand:V2DI 2 "vector_move_operand")
+ (match_operand:V2DI 3 "register_operand")))]
+  "TARGET_SSE4_2"
+{
+  ix86_expand_sse_movcc (operands[0], operands[3],
+operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "vcond_mask_"
+  [(set (match_operand:VF_256 0 "register_operand")
+   (vec_merge:VF_256
+ (match_operand:VF_256 1 "nonimmediate_operand")
+ (match_operand:VF_256 2 "vector_move_operand")
+ (match_operand: 3 "register_operand")))]
+  "TARGET_AVX"
+{
+  ix86_expand_sse_movcc (operands[0], operands[3],
+operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "vcond_mask_"
+  [(set (match_operand:VF_128 0 "register_operand")
+   (vec_merge:VF_128
+ (match_operand:VF_128 1 "nonimmediate_operand")
+ (match_operand:VF_128 2 "vector_move_operand")
+ (match_operand: 3 "register_operand")))]
+  "TARGET_SSE"
+{
+  ix86_expand_sse_movcc (operands[0], operands[3],
+operands[1], operands[2]);
+  DONE;
+})
+
 ;
 ;;
 ;; Parallel floating point logical operations


[mask-vec_cond, patch 1/2] Support vectorization of VEC_COND_EXPR with no embedded comparison

2015-10-08 Thread Ilya Enkovich
Hi,

This patch allows COND_EXPR with no embedded comparison to be vectorized.  It's 
applied on top of vectorized comparison support series.  New optab 
vcond_mask_optab is introduced for such statements.  Bool patterns now avoid 
comparison in COND_EXPR in case vector comparison is supported by target.

Thanks,
Ilya
--
gcc/

2015-10-08  Ilya Enkovich  

* optabs-query.h (get_vcond_mask_icode): New.
* optabs-tree.c (expand_vec_cond_expr_p): Use
get_vcond_mask_icode for VEC_COND_EXPR with mask.
* optabs.c (expand_vec_cond_mask_expr): New.
(expand_vec_cond_expr): Use get_vcond_mask_icode
when possible.
* optabs.def (vcond_mask_optab): New.
* tree-vect-patterns.c (vect_recog_bool_pattern): Don't
generate redundant comparison for COND_EXPR.
* tree-vect-stmts.c (vect_is_simple_cond): Allow SSA_NAME
as a condition.
(vectorizable_condition): Likewise.


diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 162d2e9..48bcf7c 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -98,6 +98,15 @@ get_vcond_icode (machine_mode vmode, machine_mode cmode, 
bool uns)
   return icode;
 }
 
+/* Return insn code for a conditional operator with a mask mode
+   MMODE resulting in a value of mode VMODE.  */
+
+static inline enum insn_code
+get_vcond_mask_icode (machine_mode vmode, machine_mode mmode)
+{
+  return convert_optab_handler (vcond_mask_optab, vmode, mmode);
+}
+
 /* Enumerates the possible extraction_insn operations.  */
 enum extraction_pattern { EP_insv, EP_extv, EP_extzv };
 
diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
index aa863cf..d887619 100644
--- a/gcc/optabs-tree.c
+++ b/gcc/optabs-tree.c
@@ -342,6 +342,9 @@ expand_vec_cond_expr_p (tree value_type, tree cmp_op_type)
 {
   machine_mode value_mode = TYPE_MODE (value_type);
   machine_mode cmp_op_mode = TYPE_MODE (cmp_op_type);
+  if (VECTOR_BOOLEAN_TYPE_P (cmp_op_type))
+return get_vcond_mask_icode (TYPE_MODE (value_type),
+TYPE_MODE (cmp_op_type)) != CODE_FOR_nothing;
   if (GET_MODE_SIZE (value_mode) != GET_MODE_SIZE (cmp_op_mode)
   || GET_MODE_NUNITS (value_mode) != GET_MODE_NUNITS (cmp_op_mode)
   || get_vcond_icode (TYPE_MODE (value_type), TYPE_MODE (cmp_op_type),
diff --git a/gcc/optabs.c b/gcc/optabs.c
index ca1a6e7..d26b8f8 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -5346,6 +5346,38 @@ expand_vec_perm (machine_mode mode, rtx v0, rtx v1, rtx 
sel, rtx target)
   return tmp;
 }
 
+/* Generate insns for a VEC_COND_EXPR with mask, given its TYPE and its
+   three operands.  */
+
+rtx
+expand_vec_cond_mask_expr (tree vec_cond_type, tree op0, tree op1, tree op2,
+  rtx target)
+{
+  struct expand_operand ops[4];
+  machine_mode mode = TYPE_MODE (vec_cond_type);
+  machine_mode mask_mode = TYPE_MODE (TREE_TYPE (op0));
+  enum insn_code icode = get_vcond_mask_icode (mode, mask_mode);
+  rtx mask, rtx_op1, rtx_op2;
+
+  if (icode == CODE_FOR_nothing)
+return 0;
+
+  mask = expand_normal (op0);
+  rtx_op1 = expand_normal (op1);
+  rtx_op2 = expand_normal (op2);
+
+  mask = force_reg (GET_MODE (mask), mask);
+  rtx_op1 = force_reg (GET_MODE (rtx_op1), rtx_op1);
+
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], rtx_op1, mode);
+  create_input_operand (&ops[2], rtx_op2, mode);
+  create_input_operand (&ops[3], mask, mask_mode);
+  expand_insn (icode, 4, ops);
+
+  return ops[0].value;
+}
+
 /* Generate insns for a VEC_COND_EXPR, given its TYPE and its
three operands.  */
 
@@ -5371,12 +5403,21 @@ expand_vec_cond_expr (tree vec_cond_type, tree op0, 
tree op1, tree op2,
 }
   else
 {
-  /* Fake op0 < 0.  */
   gcc_assert (VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (op0)));
-  op0a = op0;
-  op0b = build_zero_cst (TREE_TYPE (op0));
-  tcode = LT_EXPR;
-  unsignedp = false;
+  if (get_vcond_mask_icode (mode, TYPE_MODE (TREE_TYPE (op0)))
+ != CODE_FOR_nothing)
+   return expand_vec_cond_mask_expr (vec_cond_type, op0, op1,
+ op2, target);
+  /* Fake op0 < 0.  */
+  else
+   {
+ gcc_assert (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (op0)))
+ == MODE_VECTOR_INT);
+ op0a = op0;
+ op0b = build_zero_cst (TREE_TYPE (op0));
+ tcode = LT_EXPR;
+ unsignedp = false;
+   }
 }
   cmp_op_mode = TYPE_MODE (TREE_TYPE (op0a));
 
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 9804378..70530a6 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -61,6 +61,7 @@ OPTAB_CD(vec_load_lanes_optab, "vec_load_lanes$a$b")
 OPTAB_CD(vec_store_lanes_optab, "vec_store_lanes$a$b")
 OPTAB_CD(vcond_optab, "vcond$a$b")
 OPTAB_CD(vcondu_optab, "vcondu$a$b")
+OPTAB_CD(vcond_mask_optab, "vcond_mask_$a$b")
 OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b")
 OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b")
 OPTAB_CD(maskload_optab, "maskload$a$b"

Re: [patch 0/3] Header file reduction.

2015-10-08 Thread Michael Matz
Hi,

On Wed, 7 Oct 2015, Richard Biener wrote:

> > I'm probably the last person in the world that still generally prefers 
> > -cp :-)  I'm getting to the point where I can tolerate -u.
> 
> No, I prefer -cp too - diff just too easily makes a mess out of diffs 
> with -u, esp. if you have re-indenting going on as well.

Actually -c was the recommended form of sending patches for many years 
even in our own guidelines.  It only got changed to -up or -cp when moving 
instructions from the texinfo files to the website in 2001.  From gcc 3.0 
(https://gcc.gnu.org/onlinedocs/gcc-3.0/gcc_10.html):

  Use `diff -c' to make your diffs. Diffs without context are hard for us 
  to install reliably. More than that, they make it hard for us to study 
  the diffs to decide whether we want to install them. Unidiff format is 
  better than contextless diffs, but not as easy to read as `-c' format.
  If you have GNU diff, use `diff -cp', which shows the name of the 
  function that each change occurs in.


;-)  (IMHO it depends on what the patch does if -c or -u is better, if 
the _change_ is important -u might be better, if the new state is the 
more interesting thing, -c is)


Ciao,
Michael.


[mask-load, patch 2/2, i386] Add/modify mask load/store patterns

2015-10-08 Thread Ilya Enkovich
Hi,

This patch reflects changes in maskload and maskstore optabs and adds patterns 
for AVX-512.

Thanks,
Ilya
--
2015-10-08  Ilya Enkovich  

* config/i386/sse.md (maskload): Rename to ...
(maskload): ... this.
(maskstore): Rename to ...
(maskstore): ... this.
(maskload): New.
(maskstore): New.


diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3a9d2d3..48424fc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -18153,7 +18153,7 @@
(set_attr "btver2_decode" "vector") 
(set_attr "mode" "")])
 
-(define_expand "maskload"
+(define_expand "maskload"
   [(set (match_operand:V48_AVX2 0 "register_operand")
(unspec:V48_AVX2
  [(match_operand: 2 "register_operand")
@@ -18161,7 +18161,23 @@
  UNSPEC_MASKMOV))]
   "TARGET_AVX")
 
-(define_expand "maskstore"
+(define_expand "maskload"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand")
+   (vec_merge:V48_AVX512VL
+ (match_operand:V48_AVX512VL 1 "memory_operand")
+ (match_dup 0)
+ (match_operand: 2 "register_operand")))]
+  "TARGET_AVX512F")
+
+(define_expand "maskload"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
+   (vec_merge:VI12_AVX512VL
+ (match_operand:VI12_AVX512VL 1 "memory_operand")
+ (match_dup 0)
+ (match_operand: 2 "register_operand")))]
+  "TARGET_AVX512BW")
+
+(define_expand "maskstore"
   [(set (match_operand:V48_AVX2 0 "memory_operand")
(unspec:V48_AVX2
  [(match_operand: 2 "register_operand")
@@ -18170,6 +18186,22 @@
  UNSPEC_MASKMOV))]
   "TARGET_AVX")
 
+(define_expand "maskstore"
+  [(set (match_operand:V48_AVX512VL 0 "memory_operand")
+   (vec_merge:V48_AVX512VL
+ (match_operand:V48_AVX512VL 1 "register_operand")
+ (match_dup 0)
+ (match_operand: 2 "register_operand")))]
+  "TARGET_AVX512F")
+
+(define_expand "maskstore"
+  [(set (match_operand:VI12_AVX512VL 0 "memory_operand")
+   (vec_merge:VI12_AVX512VL
+ (match_operand:VI12_AVX512VL 1 "register_operand")
+ (match_dup 0)
+ (match_operand: 2 "register_operand")))]
+  "TARGET_AVX512BW")
+
 (define_insn_and_split "avx__"
   [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
(unspec:AVX256MODE2P


[mask-load, patch 1/2] Use boolean predicate for masked loads and store

2015-10-08 Thread Ilya Enkovich
Hi,

This patch replaces integer mask argument for MASK_LOAD ans MASK_STORE calls 
with a boolean one.  To allow various boolean vector modes assigned by a target 
maskload and maskstore optabs were transformed into convert_optab to get mask 
mode as a second operand.  Patch applies on top of boolean vector patch series.

Thanks,
Ilya
--
gcc/

2015-10-08  Ilya Enkovich  

* internal-fn.c (expand_MASK_LOAD): Adjust to maskload optab changes.
(expand_MASK_STORE): Adjust to maskstore optab changes.
* optabs-query.c (can_vec_mask_load_store_p): Add MASK_MODE arg.
 Adjust to maskload, maskstore optab changes.
* optabs-query.h (can_vec_mask_load_store_p): Add MASK_MODE arg.
* optabs.def (maskload_optab): Transform into convert optab.
(maskstore_optab): Likewise.
* tree-if-conv.c (ifcvt_can_use_mask_load_store): Adjust to
can_vec_mask_load_store_p signature change.
(predicate_mem_writes): Use boolean mask.
* tree-vect-stmts.c (vectorizable_mask_load_store): Adjust to
can_vec_mask_load_store_p signature change.  Allow invariant masks.


diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 71f811c..5ea3c0d 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -1885,7 +1885,9 @@ expand_MASK_LOAD (gcall *stmt)
   create_output_operand (&ops[0], target, TYPE_MODE (type));
   create_fixed_operand (&ops[1], mem);
   create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (optab_handler (maskload_optab, TYPE_MODE (type)), 3, ops);
+  expand_insn (convert_optab_handler (maskload_optab, TYPE_MODE (type),
+ TYPE_MODE (TREE_TYPE (maskt))),
+  3, ops);
 }
 
 static void
@@ -1908,7 +1910,9 @@ expand_MASK_STORE (gcall *stmt)
   create_fixed_operand (&ops[0], mem);
   create_input_operand (&ops[1], reg, TYPE_MODE (type));
   create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (optab_handler (maskstore_optab, TYPE_MODE (type)), 3, ops);
+  expand_insn (convert_optab_handler (maskstore_optab, TYPE_MODE (type),
+ TYPE_MODE (TREE_TYPE (maskt))),
+  3, ops);
 }
 
 static void
diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
index 254089f..c20597c 100644
--- a/gcc/optabs-query.c
+++ b/gcc/optabs-query.c
@@ -466,7 +466,9 @@ can_mult_highpart_p (machine_mode mode, bool uns_p)
 /* Return true if target supports vector masked load/store for mode.  */
 
 bool
-can_vec_mask_load_store_p (machine_mode mode, bool is_load)
+can_vec_mask_load_store_p (machine_mode mode,
+  machine_mode mask_mode,
+  bool is_load)
 {
   optab op = is_load ? maskload_optab : maskstore_optab;
   machine_mode vmode;
@@ -474,7 +476,7 @@ can_vec_mask_load_store_p (machine_mode mode, bool is_load)
 
   /* If mode is vector mode, check it directly.  */
   if (VECTOR_MODE_P (mode))
-return optab_handler (op, mode) != CODE_FOR_nothing;
+return convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing;
 
   /* Otherwise, return true if there is some vector mode with
  the mask load/store supported.  */
@@ -485,7 +487,12 @@ can_vec_mask_load_store_p (machine_mode mode, bool is_load)
   if (!VECTOR_MODE_P (vmode))
 return false;
 
-  if (optab_handler (op, vmode) != CODE_FOR_nothing)
+  mask_mode = targetm.vectorize.get_mask_mode (GET_MODE_NUNITS (vmode),
+  GET_MODE_SIZE (vmode));
+  if (mask_mode == VOIDmode)
+return false;
+
+  if (convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
 return true;
 
   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
@@ -496,8 +503,10 @@ can_vec_mask_load_store_p (machine_mode mode, bool is_load)
   if (cur <= GET_MODE_SIZE (mode))
continue;
   vmode = mode_for_vector (mode, cur / GET_MODE_SIZE (mode));
+  mask_mode = targetm.vectorize.get_mask_mode (GET_MODE_NUNITS (vmode),
+  cur);
   if (VECTOR_MODE_P (vmode)
- && optab_handler (op, vmode) != CODE_FOR_nothing)
+ && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
return true;
 }
   return false;
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 81ac362..162d2e9 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -140,7 +140,7 @@ enum insn_code find_widening_optab_handler_and_mode (optab, 
machine_mode,
 machine_mode, int,
 machine_mode *);
 int can_mult_highpart_p (machine_mode, bool);
-bool can_vec_mask_load_store_p (machine_mode, bool);
+bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
 bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool lshift_cheap_p (bool);
diff --git a

[PR 67794] Also remap SSA_NAMEs defined in ASMs in IPA-SRA

2015-10-08 Thread Martin Jambor
Hi,

the following fixes PR 67794 by properly remapping SSA_NAMEs which are
based on PARM_DECLs which are about to be removed as unnecessary.  And
by "properly" I mean also when they are defined by a GIMPLE_ASM
statement.  In fact, it switches to using an iterator over definitions
to make sure it always handles everything...  well,except for PHIs
which are still handled specially because, from a quick glance over
their source, it seemed to me that the iterator does not support them.

Bootstrapped and tested on x86_64-linux.  OK for trunk?
The issue is most probably latent on a number of old branches, do we
want to backport the patch to any of them?

Thanks,

Martin


2015-10-08  Martin Jambor  

tree-optimization/67794
* tree-sra.c (replace_removed_params_ssa_names): Do not distinguish
between types of statements but accept original definitions as a
parameter.
(ipa_sra_modify_function_body): Use FOR_EACH_SSA_DEF_OPERAND to
iterate over definitions.

testsuite/
* gcc.dg/ipa/ipa-sra-10.c: New test.
* gcc.dg/torture/pr67794.c: Likewise.

diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-sra-10.c 
b/gcc/testsuite/gcc.dg/ipa/ipa-sra-10.c
new file mode 100644
index 000..24b64d1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-sra-10.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fipa-sra -fdump-tree-eipa_sra-details"  } */
+
+extern void consume (int);
+extern int glob, glob1, glob2;
+extern int get (void);
+
+
+static void __attribute__ ((noinline))
+foo (int a)
+{
+  a = glob;
+  consume (a);
+  a = get ();
+  consume (a);
+  __asm__ volatile("" : : ""(a));
+  consume (a);
+
+  if (glob1)
+a = glob1;
+  else
+a = glob2;
+  consume (a);
+}
+
+int
+bar (int a)
+{
+  foo (a);
+  glob = a;
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "replacing an SSA name of a removed 
param" 4 "eipa_sra" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/pr67794.c 
b/gcc/testsuite/gcc.dg/torture/pr67794.c
new file mode 100644
index 000..5489e56
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr67794.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+
+int *b;
+static void fn1(int *best, int *dmin) {
+  int a[64];
+  dmin = a;
+  __asm__ volatile("" : "+&r"(dmin) : ""(best));
+}
+
+__attribute__((always_inline)) static inline void fn2(int *best) { fn1(best, 
b); }
+
+void fn3(void) {
+  int c[1];
+  fn2(c);
+}
diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index 4327990..f2a4e72 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -4612,61 +4612,45 @@ get_adjustment_for_base (ipa_parm_adjustment_vec 
adjustments, tree base)
   return NULL;
 }
 
-/* If the statement STMT defines an SSA_NAME of a parameter which is to be
-   removed because its value is not used, replace the SSA_NAME with a one
-   relating to a created VAR_DECL together all of its uses and return true.
-   ADJUSTMENTS is a pointer to an adjustments vector.  */
+/* If OLD_NAME, which is being defined by statement STMT, is an SSA_NAME of a
+   parameter which is to be removed because its value is not used, create a new
+   SSA_NAME relating to a replacement VAR_DECL, replace all uses of the
+   original with it and return it.  If there is no need to re-map, return true.
+   ADJUSTMENTS is a pointer to a vector of IPA-SRA adjustments.  */
 
-static bool
-replace_removed_params_ssa_names (gimple *stmt,
+static tree
+replace_removed_params_ssa_names (tree old_name, gimple *stmt,
  ipa_parm_adjustment_vec adjustments)
 {
   struct ipa_parm_adjustment *adj;
-  tree lhs, decl, repl, name;
-
-  if (gimple_code (stmt) == GIMPLE_PHI)
-lhs = gimple_phi_result (stmt);
-  else if (is_gimple_assign (stmt))
-lhs = gimple_assign_lhs (stmt);
-  else if (is_gimple_call (stmt))
-lhs = gimple_call_lhs (stmt);
-  else
-gcc_unreachable ();
+  tree decl, repl, new_name;
 
-  if (TREE_CODE (lhs) != SSA_NAME)
-return false;
+  if (TREE_CODE (old_name) != SSA_NAME)
+return NULL;
 
-  decl = SSA_NAME_VAR (lhs);
+  decl = SSA_NAME_VAR (old_name);
   if (decl == NULL_TREE
   || TREE_CODE (decl) != PARM_DECL)
-return false;
+return NULL;
 
   adj = get_adjustment_for_base (adjustments, decl);
   if (!adj)
-return false;
+return NULL;
 
   repl = get_replaced_param_substitute (adj);
-  name = make_ssa_name (repl, stmt);
+  new_name = make_ssa_name (repl, stmt);
 
   if (dump_file)
 {
   fprintf (dump_file, "replacing an SSA name of a removed param ");
-  print_generic_expr (dump_file, lhs, 0);
+  print_generic_expr (dump_file, old_name, 0);
   fprintf (dump_file, " with ");
-  print_generic_expr (dump_file, name, 0);
+  print_generic_expr (dump_file, new_name, 0);
   fprintf (dump_file, "\n");
 }
 
-  if (is_gimple_assign (stmt))
-gimple_assign_set_lhs (stmt, name);
-  else if (is_gimple_call (stmt))
-gimple_call_set_lhs (stmt, name);
-  else
-gimple_phi_set_result (as_a  (stmt), name

[vec-cmp, patch 6/6, i386] Add i386 support for vector comparison

2015-10-08 Thread Ilya Enkovich
Hi,

This patch adds patterns for vec_cmp optabs.  Vector comparison expand code was 
moved from the VEC_COND_EXPR expanders into separate functions.  AVX-512 patterns 
use simpler masked versions.

Thanks,
Ilya
--
gcc/

2015-10-08  Ilya Enkovich  

* config/i386/i386-protos.h (ix86_expand_mask_vec_cmp): New.
(ix86_expand_int_vec_cmp): New.
(ix86_expand_fp_vec_cmp): New.
* config/i386/i386.c (ix86_expand_sse_cmp): Allow NULL for
op_true and op_false.
(ix86_int_cmp_code_to_pcmp_immediate): New.
(ix86_fp_cmp_code_to_pcmp_immediate): New.
(ix86_cmp_code_to_pcmp_immediate): New.
(ix86_expand_mask_vec_cmp): New.
(ix86_expand_fp_vec_cmp): New.
(ix86_expand_int_sse_cmp): New.
(ix86_expand_int_vcond): Use ix86_expand_int_sse_cmp.
(ix86_expand_fp_vcond): Use ix86_expand_sse_cmp.
(ix86_expand_int_vec_cmp): New.
(ix86_get_mask_mode): New.
(TARGET_VECTORIZE_GET_MASK_MODE): New.
* config/i386/sse.md (avx512fmaskmodelower): New.
(vec_cmp): New.
(vec_cmp): New.
(vec_cmpv2div2di): New.
(vec_cmpu): New.
(vec_cmpu): New.
(vec_cmpuv2div2di): New.


diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 6a17ef4..e22aa57 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -129,6 +129,9 @@ extern bool ix86_expand_fp_vcond (rtx[]);
 extern bool ix86_expand_int_vcond (rtx[]);
 extern void ix86_expand_vec_perm (rtx[]);
 extern bool ix86_expand_vec_perm_const (rtx[]);
+extern bool ix86_expand_mask_vec_cmp (rtx[]);
+extern bool ix86_expand_int_vec_cmp (rtx[]);
+extern bool ix86_expand_fp_vec_cmp (rtx[]);
 extern void ix86_expand_sse_unpack (rtx, rtx, bool, bool);
 extern bool ix86_expand_int_addcc (rtx[]);
 extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 8a26f68..a8e3538 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21446,8 +21446,8 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx 
cmp_op0, rtx cmp_op1,
 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
 
   if (optimize
-  || reg_overlap_mentioned_p (dest, op_true)
-  || reg_overlap_mentioned_p (dest, op_false))
+  || (op_true && reg_overlap_mentioned_p (dest, op_true))
+  || (op_false && reg_overlap_mentioned_p (dest, op_false)))
 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
 
   /* Compare patterns for int modes are unspec in AVX512F only.  */
@@ -21508,6 +21508,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 
   rtx t2, t3, x;
 
+  /* If we have an integer mask and FP value then we need
+ to cast mask to FP mode.  */
+  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
+{
+  cmp = force_reg (cmpmode, cmp);
+  cmp = gen_rtx_SUBREG (mode, cmp, 0);
+}
+
   if (vector_all_ones_operand (op_true, mode)
   && rtx_equal_p (op_false, CONST0_RTX (mode))
   && !maskcmp)
@@ -21719,34 +21727,127 @@ ix86_expand_fp_movcc (rtx operands[])
   return true;
 }
 
-/* Expand a floating-point vector conditional move; a vcond operation
-   rather than a movcc operation.  */
+/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
+
+static int
+ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
+{
+  switch (code)
+{
+case EQ:
+  return 0;
+case LT:
+case LTU:
+  return 1;
+case LE:
+case LEU:
+  return 2;
+case NE:
+  return 4;
+case GE:
+case GEU:
+  return 5;
+case GT:
+case GTU:
+  return 6;
+default:
+  gcc_unreachable ();
+}
+}
+
+/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
+
+static int
+ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
+{
+  switch (code)
+{
+case EQ:
+  return 0x08;
+case NE:
+  return 0x04;
+case GT:
+  return 0x16;
+case LE:
+  return 0x1a;
+case GE:
+  return 0x15;
+case LT:
+  return 0x19;
+default:
+  gcc_unreachable ();
+}
+}
+
+/* Return immediate value to be used in UNSPEC_PCMP
+   for comparison CODE in MODE.  */
+
+static int
+ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
+{
+  if (FLOAT_MODE_P (mode))
+return ix86_fp_cmp_code_to_pcmp_immediate (code);
+  return ix86_int_cmp_code_to_pcmp_immediate (code);
+}
+
+/* Expand AVX-512 vector comparison.  */
 
 bool
-ix86_expand_fp_vcond (rtx operands[])
+ix86_expand_mask_vec_cmp (rtx operands[])
 {
-  enum rtx_code code = GET_CODE (operands[3]);
+  machine_mode mask_mode = GET_MODE (operands[0]);
+  machine_mode cmp_mode = GET_MODE (operands[2]);
+  enum rtx_code code = GET_CODE (operands[1]);
+  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
+  int unspec_code;
+  rtx unspec;
+
+  switch (code)
+{
+case LEU:
+case GTU:
+case GEU:
+case LTU:
+   

Re: [patch] header file re-ordering.

2015-10-08 Thread Jeff Law

On 10/08/2015 07:37 AM, Andrew MacLeod wrote:

On 10/07/2015 06:02 PM, Jeff Law wrote:

On 10/01/2015 08:33 PM, Andrew MacLeod wrote:

these are all in the main gcc directory. 297 files total.

Everything bootstraps on x86_64-pc-linux-gnu and
powerpc64le-unknown-linux-gnu.  All targets in config-list.mk still
build. Regressions tests also came up clean.

OK for trunk?

So as I look at this and make various spot checks, what really stands
out is how often something like alias.h gets included, often in places
that have absolutely no business/need to be looking at that file.
Cut-n-paste at its worst.  It happens to many others, but alias.h
seems to have gotten its grubby self into just about everywhere for
reasons unkonwn.

I find myself also wondering if a two step approach would make this
easier.  Step #1 being ordering the headers, step #2 being removal of
the duplicates.  As you note, the downside is two checkins that would
affect most files in the tree.  I guess I'll keep slogging through the
patch as is...

jeff

Heres the patch for reordered headers.  Building as we speak.  Hard to
fully verify since Ada doesn't seem to bootstrap on trunk at the moment:
Saw in IRC it was Jan's patch that broke Ada bootstrap.  So you might 
consider reverting that bit locally to restore bootstrapping for Ada.





However, the tool has been run, and I've made the minor adjustments
required to the source files to make it work.  (I.e., a few multi-line
comments and the fact that mul-tables.c is generated on the tile* targets.)

So this is what it should look like.  I used -cp.Other languages are
bootstrapping, and I have yet to build all the targets... that'll just
take a day.   Be nice if ada worked tho.
OK.  I'll take a look at this version and I think running the reducer 
over the weekend sounds good.


Jeff



[PING*2][PATCH, rs6000] Add memory barriers to tbegin, tend, etc.

2015-10-08 Thread Peter Bergner
Ping*2.

Torvald, David approved the code portion of the patch.
How does the documentation part you asked for look to you?

  https://gcc.gnu.org/ml/gcc-patches/2015-09/msg00315.html

Peter




New Chinese (traditional) PO file for 'cpplib' (version 5.2.0)

2015-10-08 Thread Translation Project Robot
Hello, gentle maintainer.

This is a message from the Translation Project robot.

A revised PO file for textual domain 'cpplib' has been submitted
by the Chinese (traditional) team of translators.  The file is available at:

http://translationproject.org/latest/cpplib/zh_TW.po

(This file, 'cpplib-5.2.0.zh_TW.po', has just now been sent to you in
a separate email.)

All other PO files for your package are available in:

http://translationproject.org/latest/cpplib/

Please consider including all of these in your next release, whether
official or a pretest.

Whenever you have a new distribution with a new version number ready,
containing a newer POT file, please send the URL of that distribution
tarball to the address below.  The tarball may be just a pretest or a
snapshot, it does not even have to compile.  It is just used by the
translators when they need some extra translation context.

The following HTML page has been updated:

http://translationproject.org/domain/cpplib.html

If any question arises, please contact the translation coordinator.

Thank you for all your work,

The Translation Project robot, in the
name of your translation coordinator.




Contents of PO file 'cpplib-5.2.0.zh_TW.po'

2015-10-08 Thread Translation Project Robot


cpplib-5.2.0.zh_TW.po.gz
Description: Binary data
The Translation Project robot, in the
name of your translation coordinator.



[vec-cmp, patch 5/6] Disable bool patterns when possible

2015-10-08 Thread Ilya Enkovich
Hi,

This patch disables transformation of boolean computations into integer ones in 
case target supports vector comparison.  Pattern still applies to transform 
resulting boolean value into integer or avoid COND_EXPR with SSA_NAME as 
condition.

Thanks,
Ilya
--
2015-10-08  Ilya Enkovich  

* tree-vect-patterns.c (check_bool_pattern): Check fails
if we can vectorize comparison directly.
(search_type_for_mask): New.
(vect_recog_bool_pattern): Support cases when bool pattern
check fails.


diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 830801a..e3be3d1 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -2951,7 +2951,7 @@ check_bool_pattern (tree var, loop_vec_info loop_vinfo, 
bb_vec_info bb_vinfo)
 default:
   if (TREE_CODE_CLASS (rhs_code) == tcc_comparison)
{
- tree vecitype, comp_vectype;
+ tree vecitype, comp_vectype, mask_type;
 
  /* If the comparison can throw, then is_gimple_condexpr will be
 false and we can't make a COND_EXPR/VEC_COND_EXPR out of it.  */
@@ -2962,6 +2962,11 @@ check_bool_pattern (tree var, loop_vec_info loop_vinfo, 
bb_vec_info bb_vinfo)
  if (comp_vectype == NULL_TREE)
return false;
 
+ mask_type = get_mask_type_for_scalar_type (TREE_TYPE (rhs1));
+ if (mask_type
+ && expand_vec_cmp_expr_p (comp_vectype, mask_type))
+   return false;
+
  if (TREE_CODE (TREE_TYPE (rhs1)) != INTEGER_TYPE)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (rhs1));
@@ -3186,6 +3191,75 @@ adjust_bool_pattern (tree var, tree out_type, tree 
trueval,
 }
 
 
+/* Try to determine a proper type for converting bool VAR
+   into an integer value.  The type is chosen so that
+   conversion has the same number of elements as a mask
+   producer.  */
+
+static tree
+search_type_for_mask (tree var, loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
+{
+  gimple *def_stmt;
+  enum vect_def_type dt;
+  tree def, rhs1;
+  enum tree_code rhs_code;
+  tree res = NULL;
+
+  if (TREE_CODE (var) != SSA_NAME)
+return NULL;
+
+  if ((TYPE_PRECISION (TREE_TYPE (var)) != 1
+   || !TYPE_UNSIGNED (TREE_TYPE (var)))
+  && TREE_CODE (TREE_TYPE (var)) != BOOLEAN_TYPE)
+return NULL;
+
+  if (!vect_is_simple_use (var, NULL, loop_vinfo, bb_vinfo, &def_stmt, &def,
+  &dt))
+return NULL;
+
+  if (dt != vect_internal_def)
+return NULL;
+
+  if (!is_gimple_assign (def_stmt))
+return NULL;
+
+  rhs_code = gimple_assign_rhs_code (def_stmt);
+  rhs1 = gimple_assign_rhs1 (def_stmt);
+
+  switch (rhs_code)
+{
+case SSA_NAME:
+case BIT_NOT_EXPR:
+CASE_CONVERT:
+  res = search_type_for_mask (rhs1, loop_vinfo, bb_vinfo);
+  break;
+
+case BIT_AND_EXPR:
+case BIT_IOR_EXPR:
+case BIT_XOR_EXPR:
+  if (!(res = search_type_for_mask (rhs1, loop_vinfo, bb_vinfo)))
+   res = search_type_for_mask (gimple_assign_rhs2 (def_stmt),
+   loop_vinfo, bb_vinfo);
+  break;
+
+default:
+  if (TREE_CODE_CLASS (rhs_code) == tcc_comparison)
+   {
+ if (TREE_CODE (TREE_TYPE (rhs1)) != INTEGER_TYPE
+ || !TYPE_UNSIGNED (TREE_TYPE (rhs1)))
+   {
+ machine_mode mode = TYPE_MODE (TREE_TYPE (rhs1));
+ res = build_nonstandard_integer_type (GET_MODE_BITSIZE (mode), 1);
+   }
+ else
+   res = TREE_TYPE (rhs1);
+   }
+}
+
+  return res;
+}
+
+
 /* Function vect_recog_bool_pattern
 
Try to find pattern like following:
@@ -3243,6 +3317,7 @@ vect_recog_bool_pattern (vec *stmts, tree 
*type_in,
   enum tree_code rhs_code;
   tree var, lhs, rhs, vectype;
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+  stmt_vec_info new_stmt_info;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
   gimple *pattern_stmt;
@@ -3268,16 +3343,53 @@ vect_recog_bool_pattern (vec *stmts, tree 
*type_in,
   if (vectype == NULL_TREE)
return NULL;
 
-  if (!check_bool_pattern (var, loop_vinfo, bb_vinfo))
-   return NULL;
-
-  rhs = adjust_bool_pattern (var, TREE_TYPE (lhs), NULL_TREE, stmts);
-  lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL);
-  if (useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs)))
-   pattern_stmt = gimple_build_assign (lhs, SSA_NAME, rhs);
+  if (check_bool_pattern (var, loop_vinfo, bb_vinfo))
+   {
+ rhs = adjust_bool_pattern (var, TREE_TYPE (lhs), NULL_TREE, stmts);
+ lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL);
+ if (useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs)))
+   pattern_stmt = gimple_build_assign (lhs, SSA_NAME, rhs);
+ else
+   pattern_stmt
+ = gimple_build_assign (lhs, NOP_EXPR, rhs);
+   }
  

[vec-cmp, patch 4/6] Support vector mask invariants

2015-10-08 Thread Ilya Enkovich
Hi,

This patch adds special handling of boolean vector invariants.  We need 
additional code to determine the type of a generated invariant.  For the 
VEC_COND_EXPR case we even provide this type directly because the statement 
vectype doesn't allow us to compute it.  Separate code is used to generate and 
expand such vectors.

Thanks,
Ilya
--
gcc/

2015-10-08  Ilya Enkovich  

* expr.c (const_vector_mask_from_tree): New.
(const_vector_from_tree): Use const_vector_mask_from_tree
for boolean vectors.
* tree-vect-stmts.c (vect_init_vector): Support boolean vector
invariants.
(vect_get_vec_def_for_operand): Add VECTYPE arg.
(vectorizable_condition): Directly provide vectype for invariants
used in comparison.
* tree-vectorizer.h (vect_get_vec_def_for_operand): Add VECTYPE
arg.


diff --git a/gcc/expr.c b/gcc/expr.c
index 88da8cb..a624a34 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -11320,6 +11320,40 @@ try_tablejump (tree index_type, tree index_expr, tree 
minval, tree range,
   return 1;
 }
 
+/* Return a CONST_VECTOR rtx representing vector mask for
+   a VECTOR_CST of booleans.  */
+static rtx
+const_vector_mask_from_tree (tree exp)
+{
+  rtvec v;
+  unsigned i;
+  int units;
+  tree elt;
+  machine_mode inner, mode;
+
+  mode = TYPE_MODE (TREE_TYPE (exp));
+  units = GET_MODE_NUNITS (mode);
+  inner = GET_MODE_INNER (mode);
+
+  v = rtvec_alloc (units);
+
+  for (i = 0; i < VECTOR_CST_NELTS (exp); ++i)
+{
+  elt = VECTOR_CST_ELT (exp, i);
+
+  gcc_assert (TREE_CODE (elt) == INTEGER_CST);
+  if (integer_zerop (elt))
+   RTVEC_ELT (v, i) = CONST0_RTX (inner);
+  else if (integer_onep (elt)
+  || integer_minus_onep (elt))
+   RTVEC_ELT (v, i) = CONSTM1_RTX (inner);
+  else
+   gcc_unreachable ();
+}
+
+  return gen_rtx_CONST_VECTOR (mode, v);
+}
+
 /* Return a CONST_VECTOR rtx for a VECTOR_CST tree.  */
 static rtx
 const_vector_from_tree (tree exp)
@@ -11335,6 +11369,9 @@ const_vector_from_tree (tree exp)
   if (initializer_zerop (exp))
 return CONST0_RTX (mode);
 
+  if (VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (exp)))
+  return const_vector_mask_from_tree (exp);
+
   units = GET_MODE_NUNITS (mode);
   inner = GET_MODE_INNER (mode);
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 6949c71..337ea7b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1308,27 +1308,61 @@ vect_init_vector_1 (gimple *stmt, gimple *new_stmt, 
gimple_stmt_iterator *gsi)
 tree
 vect_init_vector (gimple *stmt, tree val, tree type, gimple_stmt_iterator *gsi)
 {
+  tree val_type = TREE_TYPE (val);
+  machine_mode mode = TYPE_MODE (type);
+  machine_mode val_mode = TYPE_MODE(val_type);
   tree new_var;
   gimple *init_stmt;
   tree vec_oprnd;
   tree new_temp;
 
   if (TREE_CODE (type) == VECTOR_TYPE
-  && TREE_CODE (TREE_TYPE (val)) != VECTOR_TYPE)
-{
-  if (!types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
+  && TREE_CODE (val_type) != VECTOR_TYPE)
+{
+  /* Handle vector of bool represented as a vector of
+integers here rather than on expand because it is
+a default mask type for targets.  Vector mask is
+built in a following way:
+
+tmp = (int)val
+vec_tmp = {tmp, ..., tmp}
+vec_cst = VIEW_CONVERT_EXPR(vec_tmp);  */
+  if (TREE_CODE (val_type) == BOOLEAN_TYPE
+ && VECTOR_MODE_P (mode)
+ && SCALAR_INT_MODE_P (GET_MODE_INNER (mode))
+ && GET_MODE_INNER (mode) != val_mode)
{
- if (CONSTANT_CLASS_P (val))
-   val = fold_unary (VIEW_CONVERT_EXPR, TREE_TYPE (type), val);
- else
+ unsigned size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
+ tree stype = build_nonstandard_integer_type (size, 1);
+ tree vectype = get_vectype_for_scalar_type (stype);
+
+ new_temp = make_ssa_name (stype);
+ init_stmt = gimple_build_assign (new_temp, NOP_EXPR, val);
+ vect_init_vector_1 (stmt, init_stmt, gsi);
+
+ val = make_ssa_name (vectype);
+ new_temp = build_vector_from_val (vectype, new_temp);
+ init_stmt = gimple_build_assign (val, new_temp);
+ vect_init_vector_1 (stmt, init_stmt, gsi);
+
+ val = build1 (VIEW_CONVERT_EXPR, type, val);
+   }
+  else
+   {
+ if (!types_compatible_p (TREE_TYPE (type), val_type))
{
- new_temp = make_ssa_name (TREE_TYPE (type));
- init_stmt = gimple_build_assign (new_temp, NOP_EXPR, val);
- vect_init_vector_1 (stmt, init_stmt, gsi);
- val = new_temp;
+ if (CONSTANT_CLASS_P (val))
+   val = fold_unary (VIEW_CONVERT_EXPR, TREE_TYPE (type), val);
+ else
+   {
+ new_temp = make_ssa_name (TREE_TYPE (type));
+ init_stmt = gimple_build_assign (new_temp, NOP_EXPR, val);
+ vect_init_vector_

[vec-cmp, patch 3/6] Vectorize comparison

2015-10-08 Thread Ilya Enkovich
Hi,

This patch supports vectorization of comparison statements based on the 
introduced optabs.

Thanks,
Ilya
--
gcc/

2015-10-08  Ilya Enkovich  

* tree-vect-data-refs.c (vect_get_new_vect_var): Support vect_mask_var.
(vect_create_destination_var): Likewise.
* tree-vect-stmts.c (vectorizable_comparison): New.
(vect_analyze_stmt): Add vectorizable_comparison.
(vect_transform_stmt): Likewise.
* tree-vectorizer.h (enum vect_var_kind): Add vect_mask_var.
(enum stmt_vec_info_type): Add comparison_vec_info_type.
(vectorizable_comparison): New.


diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 3befa38..9edc663 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3849,6 +3849,9 @@ vect_get_new_vect_var (tree type, enum vect_var_kind 
var_kind, const char *name)
   case vect_scalar_var:
 prefix = "stmp";
 break;
+  case vect_mask_var:
+prefix = "mask";
+break;
   case vect_pointer_var:
 prefix = "vectp";
 break;
@@ -4403,7 +4406,11 @@ vect_create_destination_var (tree scalar_dest, tree 
vectype)
   tree type;
   enum vect_var_kind kind;
 
-  kind = vectype ? vect_simple_var : vect_scalar_var;
+  kind = vectype
+? VECTOR_BOOLEAN_TYPE_P (vectype)
+? vect_mask_var
+: vect_simple_var
+: vect_scalar_var;
   type = vectype ? vectype : TREE_TYPE (scalar_dest);
 
   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 8eda8e9..6949c71 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -7525,6 +7525,211 @@ vectorizable_condition (gimple *stmt, 
gimple_stmt_iterator *gsi,
   return true;
 }
 
+/* vectorizable_comparison.
+
+   Check if STMT is comparison expression that can be vectorized.
+   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
+   comparison, put it in VEC_STMT, and insert it at GSI.
+
+   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
+
+bool
+vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
+gimple **vec_stmt, tree reduc_def,
+slp_tree slp_node)
+{
+  tree lhs, rhs1, rhs2;
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vec_rhs1 = NULL_TREE, vec_rhs2 = NULL_TREE;
+  tree vec_compare;
+  tree new_temp;
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  tree def;
+  enum vect_def_type dt, dts[4];
+  unsigned nunits;
+  int ncopies;
+  enum tree_code code;
+  stmt_vec_info prev_stmt_info = NULL;
+  int i, j;
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
+  vec vec_oprnds0 = vNULL;
+  vec vec_oprnds1 = vNULL;
+  tree mask_type;
+  tree mask;
+
+  if (!VECTOR_BOOLEAN_TYPE_P (vectype))
+return false;
+
+  mask_type = vectype;
+  nunits = TYPE_VECTOR_SUBPARTS (vectype);
+
+  if (slp_node || PURE_SLP_STMT (stmt_info))
+ncopies = 1;
+  else
+ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
+
+  gcc_assert (ncopies >= 1);
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
+return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
+  && !(STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+  && reduc_def))
+return false;
+
+  if (STMT_VINFO_LIVE_P (stmt_info))
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"value used after loop.\n");
+  return false;
+}
+
+  if (!is_gimple_assign (stmt))
+return false;
+
+  code = gimple_assign_rhs_code (stmt);
+
+  if (TREE_CODE_CLASS (code) != tcc_comparison)
+return false;
+
+  rhs1 = gimple_assign_rhs1 (stmt);
+  rhs2 = gimple_assign_rhs2 (stmt);
+
+  if (TREE_CODE (rhs1) == SSA_NAME)
+{
+  gimple *rhs1_def_stmt = SSA_NAME_DEF_STMT (rhs1);
+  if (!vect_is_simple_use_1 (rhs1, stmt, loop_vinfo, bb_vinfo,
+&rhs1_def_stmt, &def, &dt, &vectype1))
+   return false;
+}
+  else if (TREE_CODE (rhs1) != INTEGER_CST && TREE_CODE (rhs1) != REAL_CST
+  && TREE_CODE (rhs1) != FIXED_CST)
+return false;
+
+  if (TREE_CODE (rhs2) == SSA_NAME)
+{
+  gimple *rhs2_def_stmt = SSA_NAME_DEF_STMT (rhs2);
+  if (!vect_is_simple_use_1 (rhs2, stmt, loop_vinfo, bb_vinfo,
+&rhs2_def_stmt, &def, &dt, &vectype2))
+   return false;
+}
+  else if (TREE_CODE (rhs2) != INTEGER_CST && TREE_CODE (rhs2) != REAL_CST
+  && TREE_CODE (rhs2) != FIXED_CST)
+return false;
+
+  if (vectype1 && vectype2
+  && TYPE_VECTOR_SUBPARTS (vectype1) != TYPE_VECTOR_SUBPARTS (vectype2))
+return false;
+
+  vectype = vectype1 ? vectype1 : vectype2;
+
+  /* Invariant comparison.  */
+  if (!vectype)
+{
+  vectype = build_vector_type (TREE_TYPE (rhs1), nunits);
+  if (tree_to_shwi 

[vec-cmp, patch 2/6] Vectorization factor computation

2015-10-08 Thread Ilya Enkovich
Hi,

This patch handles statements with boolean result in vectorization factor 
computation.  For a comparison, its operand types are used instead of the result 
type to compute VF.  Other boolean statements are ignored for VF.

Vectype for comparison is computed using type of compared values.  Computed 
type is propagated into other boolean operations.

Thanks,
Ilya
--
gcc/

2015-10-08  Ilya Enkovich  

* tree-vect-loop.c (vect_determine_vectorization_factor):  Ignore mask
operations for VF.  Add mask type computation.
* tree-vect-stmts.c (get_mask_type_for_scalar_type): New.
* tree-vectorizer.h (get_mask_type_for_scalar_type): New.


diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 63e29aa..c7e8067 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -183,19 +183,21 @@ vect_determine_vectorization_factor (loop_vec_info 
loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
-  int nbbs = loop->num_nodes;
+  unsigned nbbs = loop->num_nodes;
   unsigned int vectorization_factor = 0;
   tree scalar_type;
   gphi *phi;
   tree vectype;
   unsigned int nunits;
   stmt_vec_info stmt_info;
-  int i;
+  unsigned i;
   HOST_WIDE_INT dummy;
   gimple *stmt, *pattern_stmt = NULL;
   gimple_seq pattern_def_seq = NULL;
   gimple_stmt_iterator pattern_def_si = gsi_none ();
   bool analyze_pattern_stmt = false;
+  bool bool_result;
+  auto_vec mask_producers;
 
   if (dump_enabled_p ())
 dump_printf_loc (MSG_NOTE, vect_location,
@@ -414,6 +416,8 @@ vect_determine_vectorization_factor (loop_vec_info 
loop_vinfo)
  return false;
}
 
+ bool_result = false;
+
  if (STMT_VINFO_VECTYPE (stmt_info))
{
  /* The only case when a vectype had been already set is for stmts
@@ -434,6 +438,32 @@ vect_determine_vectorization_factor (loop_vec_info 
loop_vinfo)
scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
  else
scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
+
+ /* Bool ops don't participate in vectorization factor
+computation.  For comparison use compared types to
+compute a factor.  */
+ if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
+   {
+ mask_producers.safe_push (stmt_info);
+ bool_result = true;
+
+ if (gimple_code (stmt) == GIMPLE_ASSIGN
+ && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
+== tcc_comparison
+ && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
+!= BOOLEAN_TYPE)
+   scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
+ else
+   {
+ if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
+   {
+ pattern_def_seq = NULL;
+ gsi_next (&si);
+   }
+ continue;
+   }
+   }
+
  if (dump_enabled_p ())
{
  dump_printf_loc (MSG_NOTE, vect_location,
@@ -456,7 +486,8 @@ vect_determine_vectorization_factor (loop_vec_info 
loop_vinfo)
  return false;
}
 
- STMT_VINFO_VECTYPE (stmt_info) = vectype;
+ if (!bool_result)
+   STMT_VINFO_VECTYPE (stmt_info) = vectype;
 
  if (dump_enabled_p ())
{
@@ -469,8 +500,9 @@ vect_determine_vectorization_factor (loop_vec_info 
loop_vinfo)
  /* The vectorization factor is according to the smallest
 scalar type (or the largest vector size, but we only
 support one vector size per loop).  */
- scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
-  &dummy);
+ if (!bool_result)
+   scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
+&dummy);
  if (dump_enabled_p ())
{
  dump_printf_loc (MSG_NOTE, vect_location,
@@ -545,6 +577,100 @@ vect_determine_vectorization_factor (loop_vec_info 
loop_vinfo)
 }
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 
+  for (i = 0; i < mask_producers.length (); i++)
+{
+  tree mask_type = NULL;
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (mask_producers[i]);
+
+  stmt = STMT_VINFO_STMT (mask_producers[i]);
+
+  if (gimple_code (stmt) == GIMPLE_ASSIGN
+ && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
+ && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
+   {
+ scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
+ mask_type = get_mask_type_for_scalar_type (scalar_type);
+
+  

[vec-cmp, patch 1/6] Add optabs for vector comparison

2015-10-08 Thread Ilya Enkovich
Hi,

This series introduces autogeneration of vector comparison and its support on 
the i386 target.  It lets comparison statements be vectorized into a vector 
comparison instead of a VEC_COND_EXPR.  This allows avoiding some restrictions 
implied by boolean patterns.  This series applies on top of the boolean vectors 
series [1].

This patch introduces optabs for vector comparison.

[1] https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00215.html

Thanks,
Ilya
--
gcc/

2015-10-08  Ilya Enkovich  

* expr.c (do_store_flag): Use expand_vec_cmp_expr for mask results.
* optabs-query.h (get_vec_cmp_icode): New.
* optabs-tree.c (expand_vec_cmp_expr_p): New.
* optabs-tree.h (expand_vec_cmp_expr_p): New.
* optabs.c (vector_compare_rtx): Add OPNO arg.
(expand_vec_cond_expr): Adjust to vector_compare_rtx change.
(expand_vec_cmp_expr): New.
* optabs.def (vec_cmp_optab): New.
(vec_cmpu_optab): New.
* optabs.h (expand_vec_cmp_expr): New.
* tree-vect-generic.c (expand_vector_comparison): Add vector
comparison optabs check.


diff --git a/gcc/expr.c b/gcc/expr.c
index 0bbfccd..88da8cb 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -11025,9 +11025,15 @@ do_store_flag (sepops ops, rtx target, machine_mode 
mode)
   if (TREE_CODE (ops->type) == VECTOR_TYPE)
 {
   tree ifexp = build2 (ops->code, ops->type, arg0, arg1);
-  tree if_true = constant_boolean_node (true, ops->type);
-  tree if_false = constant_boolean_node (false, ops->type);
-  return expand_vec_cond_expr (ops->type, ifexp, if_true, if_false, 
target);
+  if (VECTOR_BOOLEAN_TYPE_P (ops->type))
+   return expand_vec_cmp_expr (ops->type, ifexp, target);
+  else
+   {
+ tree if_true = constant_boolean_node (true, ops->type);
+ tree if_false = constant_boolean_node (false, ops->type);
+ return expand_vec_cond_expr (ops->type, ifexp, if_true,
+  if_false, target);
+   }
 }
 
   /* Get the rtx comparison code to use.  We know that EXP is a comparison
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 73f2729..81ac362 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -74,6 +74,16 @@ trapv_binoptab_p (optab binoptab)
  || binoptab == smulv_optab);
 }
 
+/* Return insn code for a comparison operator with VMODE
+   resulting in MASK_MODE, unsigned if UNS is true.  */
+
+static inline enum insn_code
+get_vec_cmp_icode (machine_mode vmode, machine_mode mask_mode, bool uns)
+{
+  optab tab = uns ? vec_cmpu_optab : vec_cmp_optab;
+  return convert_optab_handler (tab, vmode, mask_mode);
+}
+
 /* Return insn code for a conditional operator with a comparison in
mode CMODE, unsigned if UNS is true, resulting in a value of mode VMODE.  */
 
diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
index 3b03338..aa863cf 100644
--- a/gcc/optabs-tree.c
+++ b/gcc/optabs-tree.c
@@ -320,6 +320,19 @@ supportable_convert_operation (enum tree_code code,
   return false;
 }
 
+/* Return TRUE if appropriate vector insn is available
+   for vector comparison expr with vector type VALUE_TYPE
+   and resulting mask with MASK_TYPE.  */
+
+bool
+expand_vec_cmp_expr_p (tree value_type, tree mask_type)
+{
+  enum insn_code icode = get_vec_cmp_icode (TYPE_MODE (value_type),
+   TYPE_MODE (mask_type),
+   TYPE_UNSIGNED (value_type));
+  return (icode != CODE_FOR_nothing);
+}
+
 /* Return TRUE iff appropriate vector insns are available
for vector cond expr with vector type VALUE_TYPE and a comparison
with operand vector types in CMP_OP_TYPE.  */
diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
index bf6c9e3..5b966ca 100644
--- a/gcc/optabs-tree.h
+++ b/gcc/optabs-tree.h
@@ -39,6 +39,7 @@ optab optab_for_tree_code (enum tree_code, const_tree, enum 
optab_subtype);
 optab scalar_reduc_to_vector (optab, const_tree);
 bool supportable_convert_operation (enum tree_code, tree, tree, tree *,
enum tree_code *);
+bool expand_vec_cmp_expr_p (tree, tree);
 bool expand_vec_cond_expr_p (tree, tree);
 void init_tree_optimization_optabs (tree);
 
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 8d9d742..ca1a6e7 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -5100,11 +5100,13 @@ get_rtx_code (enum tree_code tcode, bool unsignedp)
 }
 
 /* Return comparison rtx for COND. Use UNSIGNEDP to select signed or
-   unsigned operators. Do not generate compare instruction.  */
+   unsigned operators.  OPNO holds an index of the first comparison
+   operand in insn with code ICODE.  Do not generate compare instruction.  */
 
 static rtx
 vector_compare_rtx (enum tree_code tcode, tree t_op0, tree t_op1,
-   bool unsignedp, enum insn_code icode)
+   bool unsignedp, enum insn_code icode,
+   unsigned int opno)
 {
   struct expand_operand ops[2];
   rtx rtx_op0

C++ PATCH for c++/67557 (copy elision clobbering tail padding)

2015-10-08 Thread Jason Merrill
In this testcase, the FontTag constructor was storing the result of 
fontToStartTag into a stack temporary and then bitwise copying the 
temporary into the StartTag base subobject, which is broken for a class 
with a non-trivial copy constructor.  This bogus copy turns out to have 
been introduced by store_field, because the base field is smaller than 
the type of the call.


In fact, the copy elision that the front end is expecting here is 
unsafe, because we can't be sure that the called function won't clobber 
tail padding in the base subobject; we need to force a real copy 
constructor call from the function return value to the base subobject.


I also want to add an assert to store_field to avoid inappropriate 
bitwise copying of TREE_ADDRESSABLE types, but that is waiting for my 
patch for the CONSTRUCTOR case.


Tested x86_64-pc-linux-gnu, applying to trunk.

commit 80057a9d21415f5ccd29328183a2e3d6b3a0c5e1
Author: Jason Merrill 
Date:   Wed Oct 7 16:39:13 2015 -0400

	PR c++/67557

	* call.c (is_base_field_ref): New.
	(unsafe_copy_elision_p): New.
	(build_over_call): Use it.

diff --git a/gcc/cp/call.c b/gcc/cp/call.c
index 93e28dc..f8db2df 100644
--- a/gcc/cp/call.c
+++ b/gcc/cp/call.c
@@ -7094,6 +7094,39 @@ call_copy_ctor (tree a, tsubst_flags_t complain)
   return r;
 }
 
+/* Return true iff T refers to a base field.  */
+
+static bool
+is_base_field_ref (tree t)
+{
+  STRIP_NOPS (t);
+  if (TREE_CODE (t) == ADDR_EXPR)
+t = TREE_OPERAND (t, 0);
+  if (TREE_CODE (t) == COMPONENT_REF)
+t = TREE_OPERAND (t, 1);
+  if (TREE_CODE (t) == FIELD_DECL)
+return DECL_FIELD_IS_BASE (t);
+  return false;
+}
+
+/* We can't elide a copy from a function returning by value to a base
+   subobject, as the callee might clobber tail padding.  Return true iff this
+   could be that case.  */
+
+static bool
+unsafe_copy_elision_p (tree target, tree exp)
+{
+  tree type = TYPE_MAIN_VARIANT (TREE_TYPE (exp));
+  if (type == CLASSTYPE_AS_BASE (type))
+return false;
+  if (!is_base_field_ref (target)
+  && resolves_to_fixed_type_p (target, NULL))
+return false;
+  tree init = TARGET_EXPR_INITIAL (exp);
+  return (TREE_CODE (init) == AGGR_INIT_EXPR
+	  && !AGGR_INIT_VIA_CTOR_P (init));
+}
+
 /* Subroutine of the various build_*_call functions.  Overload resolution
has chosen a winning candidate CAND; build up a CALL_EXPR accordingly.
ARGS is a TREE_LIST of the unconverted arguments to the call.  FLAGS is a
@@ -7513,7 +7546,9 @@ build_over_call (struct z_candidate *cand, int flags, tsubst_flags_t complain)
 	  else if (trivial)
 	return force_target_expr (DECL_CONTEXT (fn), arg, complain);
 	}
-  else if (TREE_CODE (arg) == TARGET_EXPR || trivial)
+  else if (trivial
+	   || (TREE_CODE (arg) == TARGET_EXPR
+		   && !unsafe_copy_elision_p (fa, arg)))
 	{
 	  tree to = stabilize_reference (cp_build_indirect_ref (fa, RO_NULL,
 complain));
diff --git a/gcc/testsuite/g++.dg/init/elide3.C b/gcc/testsuite/g++.dg/init/elide3.C
new file mode 100644
index 000..7eb0389
--- /dev/null
+++ b/gcc/testsuite/g++.dg/init/elide3.C
@@ -0,0 +1,50 @@
+// PR c++/67557
+// { dg-do run }
+
+namespace std
+{
+  struct string
+  {
+typedef unsigned long size_type;
+const char* _M_p;
+char _M_local_buf[1];
+
+string(const char* s) : _M_p(_M_local_buf)
+{
+  __builtin_printf("%p constructed\n", this);
+}
+
+string(const string& s) : _M_p(_M_local_buf)
+{
+  __builtin_printf("%p copied from %p\n", this, &s);
+}
+
+~string()
+{
+  __builtin_printf("%p destroyed\n", this);
+  if (_M_p != _M_local_buf)
+	__builtin_abort();
+}
+  };
+}
+
+struct StartTag
+{
+  explicit StartTag(std::string const & tag) : tag_(tag), keepempty_(false) {}
+  std::string tag_;
+  bool keepempty_;
+};
+
+StartTag fontToStartTag() { return StartTag(""); }
+
+struct FontTag : public StartTag
+{
+  FontTag() : StartTag(fontToStartTag()) {}
+};
+
+int main()
+{
+  FontTag x;
+  __builtin_printf("%p x.tag_ in main()\n", &x.tag_);
+  return 0;
+}


Re: [PATCH ARM]: PR67745: Fix function alignment after __attribute__ 2/2

2015-10-08 Thread Bernd Schmidt

On 10/08/2015 03:50 PM, Christian Bruel wrote:

Humm, I don't know what kind of alignment optimization for functions we
have based on a declaration only. greping DECL_ALIGN on functions there
are some bits in the ipa-icf code that seems to merge code using this
information, but I think we have a definition at that point.
but honestly, I'm very unfamiliar with this pass. Do you have something
else in mind ?


I had a vague memory of us optimizing that, but I can't find the code 
either and maybe it's just not there. That doesn't mean someone isn't 
going to add it in the future, and I'm uncomfortable leaving incorrect 
DECL_ALIGN values around.


It looks like rest_of_decl_compilation may be a good place to take care 
of declarations, but using FUNCTION_BOUNDARY is probably going to give 
the wrong results. So maybe a target hook, function_boundary_for_decl, 
defaulting to just returning FUNCTION_BOUNDARY? Eventually it could 
replace the macro entirely.



Bernd




[PATCH] Disable AIX DWARF debug frame section

2015-10-08 Thread David Edelsohn
The standard DWARF2 unwind information is not compatible with AIX
DWARF debug unwind section, so this patch disables that section
completely.

bootstrapped on powerpc-ibm-aix7.1.2.0

Thanks, David

* config/rs6000/rs6000.c (rs6000_xcoff_debug_unwind_info): Always
return UI_NONE.

Index: config/rs6000/rs6000.c
===
--- config/rs6000/rs6000.c  (revision 228599)
+++ config/rs6000/rs6000.c  (working copy)
@@ -30709,10 +30709,7 @@ rs6000_elf_file_end (void)
 static enum unwind_info_type
 rs6000_xcoff_debug_unwind_info (void)
 {
-  if (HAVE_XCOFF_DWARF_EXTRAS)
-return UI_DWARF2;
-  else
-return UI_NONE;
+  return UI_NONE;
 }

 static void


Re: [gomp4.1] Doacross library implementation

2015-10-08 Thread Aldy Hernandez

On 10/08/2015 05:48 AM, Torvald Riegel wrote:

On Thu, 2015-09-24 at 20:32 +0200, Jakub Jelinek wrote:

Torvald, can you please have a look at it, if I got all the atomics / memory
models right?


More detailed comments below, but in general, I'd really suggest to add
more code comments for the synchronization parts.  In the end, the level
of detail of documentation of libgomp is your decision, but, for
example, the lack of comments in synchronization code in glibc has made
maintaining this code and fixing issues in it very costly.  It has also
been hard to understand for many.

My suggestion would be both to (1) document the high-level, abstract
synchronization scheme and (2) how that scheme is implemented.  The
first point is important in my experience because typically, the
high-level scheme and the actual thinking behind it (or, IOW, the intent
of the original author) is much harder to reconstruct in case of
concurrent code than it is for sequential code; you can't just simply
follow the program along line by line, but have to consider
interleavings.


I couldn't agree more.  After having spent the last month trying to make 
sense of libgomp/task.c, I can honestly say that we need better internal 
documentation.  I know this isn't Jakub's fault, as Richard started the 
non-documenting party, but clearly defined descriptions, functions, and 
implementation go a long way.  APIs and abstractions also make things a 
_lot_ easier to follow.


It could also be that I'm very new to runtime work, specifically 
parallel runtime work, but it was hard to understand.  I think I finally 
have a firm grasp on it (I hope), but it did take me until early this 
week.  Consequently, I took it upon myself to documenting big pieces of 
task.c this week.  I assume anyone not jakub/rth coming after me will 
benefit from it.  So yeah, my upcoming patch will have some variables 
renamed, many more functions with better descriptions (or descriptions 
at all, etc), and a clearly defined API.


Maybe my brain is small; but this stuff is hard.  Every little bit helps :).

p.s. Ironically, it seems that the longer I spend looking at this code, 
the less I feel I need to comment because things are now "obvious", 
which perhaps is an indication that either putting newbies on the 
projects is a good thing, or documenting things early is good practice.


Aldy


[Patch PR target/67366 2/2] [gimple-fold.c] Support movmisalign optabs in gimple-fold.c

2015-10-08 Thread Ramana Radhakrishnan
This patch by Richard allows for movmisalign optabs to be supported
in gimple-fold.c. This caused a bit of pain in the testsuite with strlenopt-8.c
in conjunction with the ARM support for movmisalign_optabs as the test
was coded up to do different things depending on whether the target
supported misaligned access or not. However now with unaligned access
being allowed for different levels of the architecture in the arm backend,
the concept of the helper function non_strict_align mapping identically
to the definition of STRICT_ALIGNMENT disappears.

Adjusted thusly for ARM. The testsuite/lib changes were tested with an
arm-none-eabi multilib that included architecture variants that did not
support unaligned access and architecture variants that did.

The testing matrix for this patch was:

1. x86_64 bootstrap and regression test - no regressions.
2. armhf bootstrap and regression test - no regressions.
3. arm-none-eabi cross build and regression test for

{-marm/-march=armv7-a/-mfpu=vfpv3-d16/-mfloat-abi=softfp}
{-mthumb/-march=armv8-a/-mfpu=crypto-neon-fp-armv8/-mfloat-abi=hard}
{-marm/-mcpu=arm7tdmi/-mfloat-abi=soft}
{-mthumb/-mcpu=arm7tdmi/-mfloat-abi=soft}

with no regressions.

Ok to apply ?

Ramana

2015-10-08  Richard Biener  

* gimple-fold.c (optabs-query.h): Include
(gimple_fold_builtin_memory_op): Allow unaligned stores
when movmisalign_optabs are available.

2015-10-08  Ramana Radhakrishnan  

PR target/67366
* lib/target-supports.exp (check_effective_target_non_strict_align):
Adjust for arm*-*-*.
* gcc.target/arm/pr67366.c: New test.
---
 gcc/gimple-fold.c  | 11 +--
 gcc/testsuite/gcc.target/arm/pr67366.c | 14 ++
 gcc/testsuite/lib/target-supports.exp  |  9 +
 3 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/pr67366.c

diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c
index a6caaa4..59d496b 100644
--- a/gcc/gimple-fold.c
+++ b/gcc/gimple-fold.c
@@ -63,6 +63,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-eh.h"
 #include "gimple-match.h"
 #include "gomp-constants.h"
+#include "optabs-query.h"
+
 
 /* Return true when DECL can be referenced from current unit.
FROM_DECL (if non-null) specify constructor of variable DECL was taken from.
@@ -709,7 +711,9 @@ gimple_fold_builtin_memory_op (gimple_stmt_iterator *gsi,
  /* If the destination pointer is not aligned we must be able
 to emit an unaligned store.  */
  && (dest_align >= GET_MODE_ALIGNMENT (TYPE_MODE (type))
- || !SLOW_UNALIGNED_ACCESS (TYPE_MODE (type), dest_align)))
+ || !SLOW_UNALIGNED_ACCESS (TYPE_MODE (type), dest_align)
+ || (optab_handler (movmisalign_optab, TYPE_MODE (type))
+ != CODE_FOR_nothing)))
{
  tree srctype = type;
  tree desttype = type;
@@ -721,7 +725,10 @@ gimple_fold_builtin_memory_op (gimple_stmt_iterator *gsi,
srcmem = tem;
  else if (src_align < GET_MODE_ALIGNMENT (TYPE_MODE (type))
   && SLOW_UNALIGNED_ACCESS (TYPE_MODE (type),
-src_align))
+src_align)
+  && (optab_handler (movmisalign_optab,
+ TYPE_MODE (type))
+  == CODE_FOR_nothing))
srcmem = NULL_TREE;
  if (srcmem)
{
diff --git a/gcc/testsuite/gcc.target/arm/pr67366.c 
b/gcc/testsuite/gcc.target/arm/pr67366.c
new file mode 100644
index 000..1e8b672
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr67366.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+typedef unsigned int u32;
+u32
+read32 (const void* ptr)
+{
+  u32 v;
+  __builtin_memcpy (&v, ptr, sizeof(v));
+  return v;
+}
+
+/* { dg-final { scan-assembler "@ unaligned" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 9057a27..4d5b0a3d 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -6262,6 +6262,15 @@ proc check_vect_support_and_set_flags { } {
 # Return 1 if the target does *not* require strict alignment.
 
 proc check_effective_target_non_strict_align {} {
+
+# On ARM, the default is to use STRICT_ALIGNMENT, but there
+# are interfaces defined for misaligned access and thus
+# depending on the architecture levels unaligned access is
+# available.
+if [istarget "arm*-*-*"] {
+   return [check_effective_target_arm_unaligned]
+}
+
 return [check_no_compiler_messages non_strict_align assembly {
char *y;
typ

[Patch PR target/67366 1/2] [ARM] - Add movmisalignhi / si patterns

2015-10-08 Thread Ramana Radhakrishnan
This adds movmisalignhi and movmisalignsi expanders when unaligned
access is allowed by the architecture. This allows the mid-end
to expand to misaligned loads and stores.

Compared code generated for the Linux kernel and
it changes code generation for a handful of files all for the better
basically by reducing the stack usage.

Tested by :

1. armhf bootstrap and regression test - no regressions.
2.. arm-none-eabi cross build and regression test for

{-marm/-march=armv7-a/-mfpu=vfpv3-d16/-mfloat-abi=softfp}
{-mthumb/-march=armv8-a/-mfpu=crypto-neon-fp-armv8/-mfloat-abi=hard}
{-marm/-mcpu=arm7tdmi/-mfloat-abi=soft}
{-mthumb/-mcpu=arm7tdmi/-mfloat-abi=soft}

Will apply to trunk once 2/2 is approved.

regards
Ramana

2015-09-15  Ramana Radhakrishnan  

PR target/67366
* config/arm/arm.md (movmisalign): New.
* config/arm/iterators.md (HSI): New.
---
 gcc/config/arm/arm.md   | 35 +++
 gcc/config/arm/iterators.md |  3 +++
 2 files changed, 38 insertions(+)

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index b4c555b..9a3f7bd 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -11506,6 +11506,41 @@
   }"
 )
 
+;; movmisalign patterns for HImode and SImode.
+(define_expand "movmisalign<mode>"
+  [(match_operand:HSI 0 "general_operand")
+   (match_operand:HSI 1 "general_operand")]
+  "unaligned_access"
+{
+  /* This pattern is not permitted to fail during expansion: if both arguments
+ are non-registers (e.g. memory := constant), force operand 1 into a
+ register.  */
+  rtx (* gen_unaligned_load)(rtx, rtx);
+  rtx tmp_dest = operands[0];
+  if (!s_register_operand (operands[0], <MODE>mode)
+  && !s_register_operand (operands[1], <MODE>mode))
+operands[1] = force_reg (<MODE>mode, operands[1]);
+
+  if (<MODE>mode == HImode)
+   {
+gen_unaligned_load = gen_unaligned_loadhiu;
+tmp_dest = gen_reg_rtx (SImode);
+   }
+  else
+gen_unaligned_load = gen_unaligned_loadsi;
+
+  if (MEM_P (operands[1]))
+   {
+emit_insn (gen_unaligned_load (tmp_dest, operands[1]));
+if (<MODE>mode == HImode)
+  emit_move_insn (operands[0], gen_lowpart (HImode, tmp_dest));
+   }
+  else
+emit_insn (gen_unaligned_store<mode> (operands[0], operands[1]));
+
+  DONE;
+})
+
 ;; Vector bits common to IWMMXT and Neon
 (include "vec-common.md")
 ;; Load the Intel Wireless Multimedia Extension patterns
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 47cc1ee..6a54125 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -33,6 +33,9 @@
 ;; A list of integer modes that are up to one word long
 (define_mode_iterator QHSI [QI HI SI])
 
+;; A list of integer modes that are half and one word long
+(define_mode_iterator HSI [HI SI])
+
 ;; A list of integer modes that are less than a word
 (define_mode_iterator NARROW [QI HI])
 
-- 
1.9.1



[Patch PR target/67366 0/2] Handle misaligned loads and stores for scalars on STRICT_ALIGNMENT targets

2015-10-08 Thread Ramana Radhakrishnan
This set of 2 patches attempts to fix PR target/67366 by introducing
use of the movmisalign optabs in gimple-fold.c to allow detecting
memcpys of the form shown in the testcase in the PR.

Patch 1/2 fixes the ARM backend to produce movmisalign patterns when
unaligned access is supported.

Patch 2/2 fixes gimple-fold.c to handle the appropriate cases
and deals with the testsuite fall out that this caused on ARM.



Re: [PATCH ARM]: PR67745: Fix function alignment after __attribute__ 2/2

2015-10-08 Thread Bernd Schmidt

On 10/08/2015 04:01 PM, Christian Bruel wrote:


OK, Similar pattern occurs at many other places, that changed also in
the attached proposal.
Not fully tested (in particular the java part) and no ChangeLog. Just to
make sure that we agree on the interface first.


That looks like a plain diff rather than context or unified, which is 
very hard to read, but I think this is the approach I was looking for. 
Well done spotting the other places.



Bernd


Re: [PATCH] Simple 2-lines fix for outer-loop vectorization.

2015-10-08 Thread Richard Biener
On Thu, Oct 8, 2015 at 3:52 PM, Yuri Rumyantsev  wrote:
> Hi All,
>
> Here is a simple patch which allows having a phi with virtual operands
> in the inner-loop loop-closed exit phi in outer-loop vectorization (i.e. a
> phi in the tail of the outer-loop).
>
> Bootstrap and regression testing did not show any new failures.
>
> Is it OK for trunk?

Ok.

> ChangeLog:
>
> 2015-10-08  Yuri Rumyantsev  
>
> * tree-vect-loop.c (vect_analyze_loop_operations): Skip virtual phi
> in the tail of outer-loop.
>
>gcc/testsuite/ChangeLog:
> * gcc.dg/vect/vect-outer-simd-3.c: New test.


[PATCH] Simple 2-lines fix for outer-loop vectorization.

2015-10-08 Thread Yuri Rumyantsev
Hi All,

Here is a simple patch which allows having a phi with virtual operands
in the inner-loop loop-closed exit phi in outer-loop vectorization (i.e. a
phi in the tail of the outer-loop).

Bootstrap and regression testing did not show any new failures.

Is it OK for trunk?

ChangeLog:

2015-10-08  Yuri Rumyantsev  

* tree-vect-loop.c (vect_analyze_loop_operations): Skip virtual phi
in the tail of outer-loop.

   gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-outer-simd-3.c: New test.


patch.outer-vec
Description: Binary data


Re: Generalize gimple_val_nonnegative_real_p

2015-10-08 Thread Richard Biener
On Thu, Oct 8, 2015 at 12:10 PM, Richard Sandiford
 wrote:
> Richard Biener  writes:
>> On Mon, Oct 5, 2015 at 5:02 PM, Richard Sandiford
>>  wrote:
>>> The upcoming patch to move sqrt and cbrt simplifications to match.pd
>>> caused a regression because the (abs @0)->@0 simplification didn't
>>> trigger for:
>>>
>>> (abs (convert (abs X)))
>>>
>>> The simplification is based on tree_expr_nonnegative_p, which is
>>> pretty weak for gimple (it gives up if it sees an SSA_NAME).
>>>
>>> We have the stronger gimple_val_nonnegative_real_p, but (a) as its
>>> name implies, it's specific to reals and (b) in its current form it
>>> doesn't handle converts.  This patch:
>>>
>>> - generalises the routine all types
>>> - reuses tree_{unary,binary,call}_nonnegative_warnv_p for the leaf cases
>>> - makes the routine handle CONVERT_EXPR
>>> - allows a nesting depth of 1 for CONVERT_EXPR
>>> - uses the routine instead of tree_expr_nonnegative_p for gimple.
>>>
>>> Limiting the depth to 1 is a little arbitrary but adding a param seemed
>>> over the top.
>>>
>>> Bootstrapped & regression-tested on x86_64-linux-gnu.  I didn't write
>>> a specific test because this is already covered by the testsuite if
>>> the follow-on patch is also applied.  OK to install?
>>
>> Hmm.  I don't like having essentially two copies of the same machinery.
>> Can you instead fold gimple_val_nonnegative_real_p into a
>> tree_ssa_name_nonnegative_warnv_p used by tree_expr_nonnegative_warnv_p?
>> For integers it's also possible to look at SSA name range info.
>> You'd still limit recursion appropriately (by passing down a depth arg
>> everywhere,
>> defaulted to 0 I guess).
>
> OK.  I wanted to combine the functions originally but, with
> gimple_val_nonnegative_real_p being an obvious partial cut-&-paste
> of fold-const.c, I assumed things were the way they were because
> having a single routine would be breaking some abstraction barrier.
>
> This patch moves the vrp code for querying gimple statements to
> gimple-fold.c, adds a function to fold-const.c for querying SSA names,
> and adds a depth parameter to both sets of functions.  As discussed
> on IRC, it has the downside that gimple-fold.c calls fold-const.c
> and fold-const.c calls gimple-fold.c.
>
> Also as discussed on IRC, a knock-on effect is that we can now prove
> _i_589 < 0 is false in sequences like:
>
>   i_1917 = ASSERT_EXPR ;
>   _i_589 = (const int) i_1917;
>   _i_1507 = ASSERT_EXPR <_i_589, _i_589 < 0>;
>
> This defeats an assert in tree-vrp.c that ASSERT_EXPR conditions
> are never known to be false.  Previously the assert only ever used
> local knowledge and so would be limited to cases like x != x for
> integer x.  Now that we use global knowledge it's possible to prove
> the assertion condition is false in blocks that are in practice
> unreachable.  I've removed the assert in the patch below.
>
> (FWIW the first hit was during stage2 of a bootstrap, in cfgcleanup.c)
>
>> Note that the comment in gimple_val_nonnegative_real_p is correct in
>> that we really shouldn't recurse (but maybe handle fixed patterns -
>> like you do here) as the appropriate way is to have a "nonnegative"
>> lattice.  SSA name range info may already provide enough info here
>> (well, not for reals - time to add basic real range support to VRP!).
>
> I retained a form of this comment in tree_ssa_name_nonnegative_warnv_p.
>
> Bootstrapped & regression-tested on x86_64-linux-gnu.  I didn't write
> a specific test because this is already covered by the testsuite if
> the follow-on patch is also applied.  OK to install?

Ok.

Thanks,
Richard.

> Thanks,
> Richard
>
>
> gcc/
> * params.def (PARAM_MAX_SSA_NAME_QUERY_DEPTH): New param.
> * doc/invoke.texi (--param max-ssa-name-query-depth): Document.
> * fold-const.h (tree_unary_nonnegative_warnv_p)
> (tree_single_nonnegative_warnv_p, tree_call_nonnegative_warnv_p)
> (tree_expr_nonnegative_warnv_p): Add depth parameters.
> * fold-const.c: Include gimple-fold.h and params.h.
> (tree_ssa_name_nonnegative_warnv_p): New function.
> (tree_unary_nonnegative_warnv_p, tree_binary_nonnegative_warnv_p)
> (tree_single_nonnegative_warnv_p, tree_call_nonnegative_warnv_p)
> (tree_invalid_nonnegative_warnv_p, tree_expr_nonnegative_warnv_p):
> Add a depth parameter and increment it for recursive calls to
> tree_expr_nonnegative_warnv_p.  Use tree_ssa_name_nonnegative_warnv_p
> to handle SSA names.
> * gimple-fold.h (gimple_val_nonnegative_real_p): Delete.
> (gimple_stmt_nonnegative_warnv_p): Declare.
> * tree-vrp.c (remove_range_assertions): Remove assert that condition
> cannot be proven false.
> (gimple_assign_nonnegative_warnv_p, gimple_call_nonnegative_warnv_p)
> (gimple_stmt_nonnegative_warnv_p): Move to...
> * gimple-fold.c: ...here.  Add depth parameters and pass them
> down to the tree rout

Re: using scratchpads to enhance RTL-level if-conversion: revised patch

2015-10-08 Thread Bernd Schmidt

On 10/08/2015 01:29 AM, Abe wrote:

Attached please find my revised patch to the RTL if converter.  This
patch enables the
if-conversion of half-hammocks with a store in them that the internal
GCC machinery
otherwise considers too hazardous to if-convert.  This is made safe by
using the
"scratchpad" technique, i.e. throwing away the store into a safe
location where nothing
of any importance is currently stored.  The scratchpads are allocated in
the stack frame.


So, one conceptual issue first. Obviously this increases the size of the 
stack frame, which makes the transformation more expensive. The patch 
does not appear to attempt to estimate costs. However, why do we need to 
allocate anything in the first place? If you want to store something 
that will be thrown away, just pick an address below the stack pointer. 
I think some ports may need different strategies due to stack bias or 
red zones, so a target hook is in order, with one safe default to fail, 
and one default implementation that can be used by most targets, and 
then specialized versions in target-dependent code where necessary:


rtx
default_get_scratchpad_fail (HOST_WIDE_INT size)
{
  return NULL_RTX;
}

rtx
default_get_scratchpad (HOST_WIDE_INT size)
{
  /* Possibly also take STACK_BOUNDARY into account so as to not
 make unaligned locations.  */
  if (size >= param (SCRATCHPAD_MAX_SIZE))
return NULL_RTX;
  return plus_constant (stack_pointer_rtx, gen_int_mode (-size, Pmode));
}

With that, I think all the code to keep track of scratchpads can just be 
deleted.


There's this preexisting comment:

/* ??? This is overconservative. Storing to two different mems is
as easy as conditionally computing the address. Storing to a
single mem merely requires a scratch memory to use as one of the
destination addresses; often the memory immediately below the
stack pointer is available for this. */

suggesting that it ought to be possible to generalize the technique to 
stores to different addresses.



To the patch itself. The code still has many stylistic problems and does 
not follow the required guidelines.



+#include "diagnostic-color.h"


Why?



-/* Return true if a write into MEM may trap or fault.  */
+/* Return true if a write into MEM may trap or fault
+   even in the presence of scratchpad support.  */




+/* Return true if a write into MEM may trap or fault
+   without scratchpad support.  */


Please explain the rationale for these changes. What exactly is 
different with scratchpads?



+ /* The next "if": quoting "noce_emit_cmove":
+  If we can't create new pseudos, though, don't bother.  */
+ if (reload_completed)
+   return FALSE;
+
+ if (optimize<2)
+   return FALSE;
+
+ if (optimize_function_for_size_p (cfun))
+   return FALSE;
+
+ if (targetm.have_conditional_execution () || ! HAVE_conditional_move)
+   return FALSE;


Merge the conditions into one if. Watch spacing around operators.


+
+ const bool not_a_scratchpad_candidate =
+   noce_mem_write_may_trap_or_fault_p_1 (orig_x);
+ if (! not_a_scratchpad_candidate)


The = should start a line, but what you really should do is just put the 
condition into the if and eliminate the variable.



+ const size_t size_of_MEM = MEM_SIZE (orig_x);


Identifiers are still too verbose. This is typically just called size, 
or memsize if there are other sizes to keep track of.



+
+   for (rtx_insn* insn = BB_HEAD (then_bb); insn && insn != insn_a
+ && insn != BB_END (then_bb); insn=NEXT_INSN (insn))
+ {
+if (! (NOTE_INSN_BASIC_BLOCK_P (insn) || DEBUG_INSN_P 
(insn)))


There are six different coding style violations in this block. Please 
identify and fix them (elsewhere as well). In addition, I think it would 
be better to start each part of the for statement on its own line for 
clarity.


I still need to figure out what is going on in this insn-copying loop.

> +  /* Done copying the needed insns between the start of the
> + THEN block and the set of 'a', if any.  */
> +
> +  if (CONSTANT_P (XEXP (cond, 0)) && CONSTANT_P (XEXP (cond, 1)))
> +{
> +  end_sequence ();
> +  return FALSE;
> +}

This should be done earlier before you go to the effort of copying insns.


+   MEM_NOTRAP_P (mem) = true;


So I'm still not entirely sure which cases you are trying to optimize 
and which ones not, but couldn't this technique allow a trapping store here?



Bernd


Re: Move some bit and binary optimizations in simplify and match

2015-10-08 Thread Richard Biener
On Wed, Oct 7, 2015 at 11:54 AM, Hurugalawadi, Naveen
 wrote:
> Hi,
>
> Please find attached the patch that moves some more patterns from
> fold-const using simplify and match.
>
> Please review the patch and let me know if any modifications are required.

+/* Fold X + (X / CST) * -CST to X % CST.  */
+(simplify
+ (plus @0 (mult:s (trunc_div:s @0 INTEGER_CST@1) (negate @1)))

that's a bit too literal -- (negate @1) won't match for -@1

+  (if (INTEGRAL_TYPE_P (type) || VECTOR_INTEGER_TYPE_P (type))
+  (trunc_mod @0 @1)))

+/* Fold (A & ~B) - (A & B) into (A ^ B) - B.  */
+(simplify
+ (minus (bit_and:s @0 (bit_not:s @1)) (bit_and:s @0 @1))
+  (if (! FLOAT_TYPE_P (type))
+  (minus (bit_xor @0 @1) @1)))

Likewise the fold code handles both constant and non-constant B.
To mimic this you need a second pattern for the constant case
or add a predicate matching @1 with ~@1.

+/* (-A) * (-B) -> A * B  */
+(simplify
+ (mult:c (negate @0) (negate @1))
+  (mult @0 @1))

the fold-const.c code handles sign-conversions around the negates
so you should add (convert?)s around them and verify useless_type_conversions.

+/* Fold (a * (1 << b)) into (a << b)  */
+(simplify
+ (mult:c @0 (lshift integer_onep@1 @2))
+  (if (! FLOAT_TYPE_P (type))
+  (lshift @0 @2)))

Likewise (sign-conversion on the lshift).  Though I'm not sure
this won't trap ubsan for signed left-shift of negative values.

+/* Fold (C1/X)*C2 into (C1*C2)/X.  */
+(simplify
+ (mult (rdiv REAL_CST@0 @1) REAL_CST@2)
+  (if (FLOAT_TYPE_P (type)
+   && flag_associative_math)
+  (rdiv (mult @0 @2) @1)))

the fold-const.c code avoids the transform if @0 * @2 doesn't simplify
(nans/infs and flag combos).  Not sure if we care though.

+/* Simplify (X & ~Y) | (~X & Y) is X ^ Y.  */
+(simplify
+ (bit_ior (bit_and:s @0 (bit_not:s @1)) (bit_and:s (bit_not:s @0) @1))
+  (bit_xor @0 @1))

fold again handles also constants for X and Y.  I suggest to re-use
the matching predicate you need to add for the above ~ pattern.
fold also handles sign-converted bit-ands.

+/* Simplify ~X & X as zero.  */
+(simplify
+ (bit_and:c @0 (bit_not:s @0))
+  { build_zero_cst (type); })

I was sure we already have this...  looks like I was wrong.  Again fold
handles sign-conversions on @0 resp. the bit_not.

+/* Simplify (X == 0) & X as zero.  */
+(simplify
+ (bit_and:c @0 (eq @0 integer_zerop@1))
+  @1)

I think we have this one see logical_inverted_value and uses:

(simplify
 (bit_and:c @0 (logical_inverted_value @0))
 { build_zero_cst (type); })

+/* Fold X & (X ^ Y) as X & ~Y.  */
+(simplify
+ (bit_and @0 (bit_xor:s @0 @1))
+  (bit_and @0 (bit_not @1)))
+
+/* Fold X & (Y ^ X) as ~Y & X.  */
+(simplify
+ (bit_and @0 (bit_xor:s @1 @0))
+  (bit_and (bit_not @1) @0))

add :c on the bit_and and the bit_xor and then merge the patterns.

Thanks,
Richard.




> Tested the patch on X86 without any regressions.
>
> Thanks,
> Naveen
>
> ChangeLog
>
> 2015-10-07  Naveen H.S  
>
> * fold-const.c (fold_binary_loc) : Move X + (X / CST) * -CST ->
> X % CST to match.pd.
> Move Fold (A & ~B) - (A & B) into (A ^ B) - B to match.pd.
> Move (-A) * (-B) -> A * B to match.pd.
> Move (a * (1 << b)) is (a << b) to match.pd.
> Move convert (C1/X)*C2 into (C1*C2)/X to match.pd.
> Move (X & ~Y) | (~X & Y) is X ^ Y to match.pd.
> Move ~X & X, (X == 0) & X, and !X & X are zero to match.pd.
> Move X & ~X , X & (X == 0), and X & !X are zero to match.pd.
> Move Fold X & (X ^ Y) as X & ~Y to match.pd.
> Move Fold X & (Y ^ X) as ~Y & X to match.pd.
>
> * match.pd (plus @0 (mult:s (trunc_div:s @0 INTEGER_CST@1)
> (negate @1))): New simplifier.
> (minus (bit_and:s @0 (bit_not:s @1)) (bit_and:s @0 @1)) :
> New simplifier.
> (mult:c @0 (lshift integer_onep@1 @2)): New simplifier.
> (mult:c (plus @0 @0) INTEGER_CST@1): New simplifier.
> (mult (rdiv REAL_CST@0 @1) REAL_CST@2): New simplifier.
> (bit_ior (bit_and:s @0 (bit_not:s @1)) (bit_and:s (bit_not:s @0) @1))
> : New simplifier.
> (bit_and:c @0 (bit_not:s @0)): New simplifier.
> (bit_and:c @0 (eq @0 integer_zerop@1)): New simplifier.
> (bit_and @0 (bit_xor:s @0 @1)): New simplifier.
> (bit_and @0 (bit_xor:s @1 @0)): New simplifier.
> (mult:c (negate @0) (negate @1)): New simplifier.


[patch] header file re-ordering.

2015-10-08 Thread Andrew MacLeod

On 10/07/2015 06:02 PM, Jeff Law wrote:

On 10/01/2015 08:33 PM, Andrew MacLeod wrote:

these are all in the main gcc directory. 297 files total.

Everything bootstraps on x86_64-pc-linux-gnu and
powerpc64le-unknown-linux-gnu.  All targets in config-list.mk still
build. Regressions tests also came up clean.

OK for trunk?
So as I look at this and make various spot checks, what really stands 
out is how often something like alias.h gets included, often in places 
that have absolutely no business/need to be looking at that file. 
Cut-n-paste at its worst.  It happens to many others, but alias.h 
seems to have gotten its grubby self into just about everywhere for 
reasons unknown.


I find myself also wondering if a two step approach would make this 
easier.  Step #1 being ordering the headers, step #2 being removal of 
the duplicates.  As you note, the downside is two checkins that would 
affect most files in the tree.  I guess I'll keep slogging through the 
patch as is...


jeff
Here's the patch for reordered headers.  Building as we speak.  Hard to 
fully verify since Ada doesn't seem to bootstrap on trunk at the moment:


+===GNAT BUG DETECTED==+
| 6.0.0 20151008 (experimental) (x86_64-pc-linux-gnu) GCC error:   |
| in gen_lowpart_common, at emit-rtl.c:1399|
| Error detected around s-regpat.adb:1029:22   |

<...>
raised TYPES.UNRECOVERABLE_ERROR : comperr.adb:423
../gcc-interface/Makefile:311: recipe for target 's-regpat.o' failed


However, the tool has been run, and I've made the minor adjustments 
required to the source files to make it work.  (i.e., a few multi-line 
comments and the fact that mul-tables.c is generated on the tile* targets).


So this is what it should look like.  I used -cp.  Other languages are 
bootstrapping, and I have yet to build all the targets... that'll just 
take a day.   It would be nice if Ada worked, though.


I can run the reduction tool over the weekend (its a long weekend here 
:-) on this if you want...  the other patch is a couple of weeks out of 
date anyway now.


Andrew



backend-order.patch.bz2
Description: application/bzip


FE-order.patch.bz2
Description: application/bzip


config-order.patch.bz2
Description: application/bzip


[PATCH] Random shuffle moveable: container size

2015-10-08 Thread Aurelio Remonda
This patch reduces the size of the array A (the array that contains
the values being shuffled) so the test can pass while running the
libstdc++ testsuite.
It also makes some minor changes such as:
*Deleting a useless call to fill_ascending function on test02.
*Changing N from const int to const unsigned int.
I have a company-wide copyright assignment, but I don't have commit access.

---
 ChangeLog   |  6 ++
 moveable.cc | 13 ++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog
index 91d2957..2c4e127 100644
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,3 +1,9 @@
+2015-10-08  Aurelio Remonda  
+
+   * testsuite/25_algorithms/random_shuffle/moveable.cc: Change variable
+   N from const int N = 20 to const unsigned int N = 1.
+   Delete useless fill_ascending function call.
+
 2015-09-07  Jonathan Wakely  
 
* include/bits/shared_ptr_base.h (__shared_ptr::operator->): Change
diff --git a/libstdc++-v3/testsuite/25_algorithms/random_shuffle/moveable.cc 
b/libstdc++-v3/testsuite/25_algorithms/random_shuffle/moveable.cc
index e854c38..dabe9e3 100644
--- a/libstdc++-v3/testsuite/25_algorithms/random_shuffle/moveable.cc
+++ b/libstdc++-v3/testsuite/25_algorithms/random_shuffle/moveable.cc
@@ -34,8 +34,8 @@ using __gnu_test::rvalstruct;
 
 typedef test_container Container;
 
-const int N = 20;
-int A[N];
+const unsigned int N = 1;
+int A[N]; // This is made global because we don't want it on the stack
 
 void fill_ascending()
 {
@@ -56,10 +56,10 @@ test01()
 
   // The chance that random_shuffle leaves the order as is by coincidence
   // is negligible, so we expect it to be permuted
-  VERIFY( !std::equal(rv, rv + N, A) );
+  VERIFY(!std::equal(rv, rv + N, A));
 
   std::sort(con.begin(), con.end());
-  VERIFY( std::equal(rv, rv + N, A) );
+  VERIFY(std::equal(rv, rv + N, A));
 }
 
 int random_generator(int)
@@ -70,14 +70,13 @@ test02()
 {
   bool test __attribute__((unused)) = true;
 
-  fill_ascending();
   rvalstruct rv[10] = {1,2,3,4,5,6,7,8,9,10};
   int result[10] = {10,1,2,3,4,5,6,7,8,9};
   Container con(rv, rv + 10);
   std::random_shuffle(con.begin(), con.end(), random_generator);
   // The above answer was generated by hand. It is not required by the 
standard,
   // but is produced by the current algorithm.
-  VERIFY( std::equal(rv, rv + 10, result) );
+  VERIFY(std::equal(rv, rv + 10, result));
 }
 
 int
@@ -86,4 +85,4 @@ main()
   test01();
   test02();
   return 0;
-}
+}
\ No newline at end of file
-- 
1.9.1



Re: using scratchpads to enhance RTL-level if-conversion: revised patch

2015-10-08 Thread Bernd Schmidt

+ /* We must copy the insns between the start of the THEN block
+   and the set of 'a', if they exist, since they may be needed
+   for the converted code as well, but we must not copy a
+   start-of-BB note if one is present, nor debug "insn"s.  */
+
+ for (rtx_insn* insn = BB_HEAD (then_bb); insn && insn != insn_a
+  && insn != BB_END (then_bb); insn=NEXT_INSN (insn))
+  {

Please remove the braces: the loop body is a single stmt.


Oh, I miscounted. That makes seven then.


Bernd



Re: [PATCH ARM]: PR67745: Fix function alignment after __attribute__ 2/2

2015-10-08 Thread Bernd Schmidt

On 10/08/2015 03:14 PM, Christian Bruel wrote:


Probably at the time of start_decl, because DECL_ALIGN will have the
boundary given by the global target_flags at that time. But this
shouldn't be a problem since what matters is the DECL_ALIGN recomputed
with the definition when there is something to layout.


I'm not so sure. Don't we take DECL_ALIGN into account when optimizing 
things like alignment tests?



Bernd


Re: using scratchpads to enhance RTL-level if-conversion: revised patch

2015-10-08 Thread Sebastian Pop
Abe,

please avoid comments that are not needed.

+ /* We must copy the insns between the start of the THEN block
+   and the set of 'a', if they exist, since they may be needed
+   for the converted code as well, but we must not copy a
+   start-of-BB note if one is present, nor debug "insn"s.  */
+
+ for (rtx_insn* insn = BB_HEAD (then_bb); insn && insn != insn_a
+  && insn != BB_END (then_bb); insn=NEXT_INSN (insn))
+  {

Please remove the braces: the loop body is a single stmt.

+ if (! (NOTE_INSN_BASIC_BLOCK_P (insn) || DEBUG_INSN_P (insn)))
+   duplicate_insn_chain (insn, insn);
+   /* A return of 0 from "duplicate_insn_chain" is _not_
+  a failure; it just returns the "NEXT_INSN" of the
+  last insn it duplicated.  */

Please remove this comment.

+  }
+
+ /* Done copying the needed insns between the start of the
+   THEN block and the set of 'a', if any.  */

This comment duplicates the same content as the comment before the loop.
Please remove.


On Thu, Oct 8, 2015 at 8:08 AM, Sebastian Pop  wrote:
> Hi Abe,
>
> could you please avoid double negations, and
> please use early returns rather than huge right indentations:
>
> +  if (! not_a_scratchpad_candidate)
> +  {
> +if (MEM_SIZE_KNOWN_P (orig_x))
> +{
> +  const size_t size_of_MEM = MEM_SIZE (orig_x);
> +
> +  if (size_of_MEM <= SCRATCHPAD_MAX_SIZE)
> +  {
> [...]
> +  }
> +}
> +  }
> +  return FALSE;
>
> Just rewrite as:
>
> if (not_a_scratchpad_candidate
> || !MEM_SIZE_KNOWN_P (orig_x))
>   return FALSE;
>
> const size_t size_of_MEM = MEM_SIZE (orig_x);
> if (size_of_MEM > SCRATCHPAD_MAX_SIZE)
>   return FALSE;
>
> That will save 3 levels of indent.
>
> Also some of your braces do not seem to be correctly placed.
> Please use clang-format on your patch to solve the indentation issues.
>
> Thanks,
> Sebastian
>
>
> On Wed, Oct 7, 2015 at 6:29 PM, Abe  wrote:
>> Dear all,
>>
>> Attached please find my revised patch to the RTL if converter.  This patch
>> enables the
>> if-conversion of half-hammocks with a store in them that the internal GCC
>> machinery
>> otherwise considers too hazardous to if-convert.  This is made safe by using
>> the
>> "scratchpad" technique, i.e. throwing away the store into a safe location
>> where nothing
>> of any importance is currently stored.  The scratchpads are allocated in the
>> stack frame.
>>
>> Here is an example of code which is newly converted with this patch,
>> at least when targeting AArch64:
>>
>>   int A[10];
>>
>>   void half_hammock() {
>> if (A[0])
>>   A[1] = 2;
>>   }
>>
>>
>> Both tested against trunk and bootstrapped OK with defaults* on
>> AMD64-AKA-"x86_64" GNU/Linux.
>>
>> '*': [except for "--prefix"]
>>
>>
>> I'm sending the patch as an attachment to avoid it
>> being corrupted/reformatted by any e-mail troubles.
>>
>> I look forward to your feedback.
>>
>> Regards,
>>
>> Abe
>>


[PATCH][AArch64] Update patterns to support FP zero

2015-10-08 Thread Wilco Dijkstra
This patch improves support for instructions that allow FP zero immediate. All 
FP compares generated
by various patterns should use aarch64_fp_compare_operand. LDP/STP uses 
aarch64_reg_or_fp_zero.
Passes regression on AArch64.

OK for commit?

ChangeLog:
2015-10-08  Wilco Dijkstra  

* gcc/config/aarch64/aarch64.md (cbranch4):
Use aarch64_fp_compare_operand.
(store_pairsf): Use aarch64_reg_or_fp_zero.
(store_pairdf): Likewise.
(cstore4): Use aarch64_fp_compare_operand.
(cmov6): Likewise.
* gcc/config/aarch64/aarch64-ldpstp.md: Use aarch64_reg_or_fp_zero.

---
 gcc/config/aarch64/aarch64-ldpstp.md | 12 ++--
 gcc/config/aarch64/aarch64.md| 14 +++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldpstp.md 
b/gcc/config/aarch64/aarch64-ldpstp.md
index 8d6d882..54cf34c 100644
--- a/gcc/config/aarch64/aarch64-ldpstp.md
+++ b/gcc/config/aarch64/aarch64-ldpstp.md
@@ -80,9 +80,9 @@
 
 (define_peephole2
   [(set (match_operand:GPF 0 "aarch64_mem_pair_operand" "")
-   (match_operand:GPF 1 "register_operand" ""))
+   (match_operand:GPF 1 "aarch64_reg_or_fp_zero" ""))
(set (match_operand:GPF 2 "memory_operand" "")
-   (match_operand:GPF 3 "register_operand" ""))]
+   (match_operand:GPF 3 "aarch64_reg_or_fp_zero" ""))]
   "aarch64_operands_ok_for_ldpstp (operands, false, mode)"
   [(parallel [(set (match_dup 0) (match_dup 1))
  (set (match_dup 2) (match_dup 3))])]
@@ -308,13 +308,13 @@
 (define_peephole2
   [(match_scratch:DI 8 "r")
(set (match_operand:GPF 0 "memory_operand" "")
-   (match_operand:GPF 1 "aarch64_reg_or_zero" ""))
+   (match_operand:GPF 1 "aarch64_reg_or_fp_zero" ""))
(set (match_operand:GPF 2 "memory_operand" "")
-   (match_operand:GPF 3 "aarch64_reg_or_zero" ""))
+   (match_operand:GPF 3 "aarch64_reg_or_fp_zero" ""))
(set (match_operand:GPF 4 "memory_operand" "")
-   (match_operand:GPF 5 "aarch64_reg_or_zero" ""))
+   (match_operand:GPF 5 "aarch64_reg_or_fp_zero" ""))
(set (match_operand:GPF 6 "memory_operand" "")
-   (match_operand:GPF 7 "aarch64_reg_or_zero" ""))
+   (match_operand:GPF 7 "aarch64_reg_or_fp_zero" ""))
(match_dup 8)]
   "aarch64_operands_adjust_ok_for_ldpstp (operands, false, mode)"
   [(const_int 0)]
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f2d1be1..67ce01b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -240,7 +240,7 @@
 (define_expand "cbranch4"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
[(match_operand:GPF 1 "register_operand" "")
-(match_operand:GPF 2 "aarch64_reg_or_zero" "")])
+(match_operand:GPF 2 "aarch64_fp_compare_operand" 
"")])
   (label_ref (match_operand 3 "" ""))
   (pc)))]
   ""
@@ -1336,9 +1336,9 @@
 ;; fairly lax checking on the second memory operation.
 (define_insn "store_pairsf"
   [(set (match_operand:SF 0 "aarch64_mem_pair_operand" "=Ump,Ump")
-   (match_operand:SF 1 "register_operand" "w,*r"))
+   (match_operand:SF 1 "aarch64_reg_or_fp_zero" "w,*rY"))
(set (match_operand:SF 2 "memory_operand" "=m,m")
-   (match_operand:SF 3 "register_operand" "w,*r"))]
+   (match_operand:SF 3 "aarch64_reg_or_fp_zero" "w,*rY"))]
   "rtx_equal_p (XEXP (operands[2], 0),
plus_constant (Pmode,
   XEXP (operands[0], 0),
@@ -1352,9 +1352,9 @@
 
 (define_insn "store_pairdf"
   [(set (match_operand:DF 0 "aarch64_mem_pair_operand" "=Ump,Ump")
-   (match_operand:DF 1 "register_operand" "w,*r"))
+   (match_operand:DF 1 "aarch64_reg_or_fp_zero" "w,*rY"))
(set (match_operand:DF 2 "memory_operand" "=m,m")
-   (match_operand:DF 3 "register_operand" "w,*r"))]
+   (match_operand:DF 3 "aarch64_reg_or_fp_zero" "w,*rY"))]
   "rtx_equal_p (XEXP (operands[2], 0),
plus_constant (Pmode,
   XEXP (operands[0], 0),
@@ -2901,7 +2901,7 @@
   [(set (match_operand:SI 0 "register_operand" "")
(match_operator:SI 1 "aarch64_comparison_operator"
 [(match_operand:GPF 2 "register_operand" "")
- (match_operand:GPF 3 "register_operand" "")]))]
+ (match_operand:GPF 3 "aarch64_fp_compare_operand" "")]))]
   ""
   "
   operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2],
@@ -2971,7 +2971,7 @@
(if_then_else:GPF
 (match_operator 1 "aarch64_comparison_operator"
  [(match_operand:GPF 2 "register_operand" "")
-  (match_operand:GPF 3 "register_operand" "")])
+  (match_operand:GPF 3 "aarch64_fp_compare_operand" "")])
 (match_operand:GPF 4 "register_operand" "")
 (match_operand:GPF 5 "register_operand" "")))]
   ""
-- 
1.9.1




Re: using scratchpads to enhance RTL-level if-conversion: revised patch

2015-10-08 Thread Sebastian Pop
Hi Abe,

could you please avoid double negations, and
please use early returns rather than huge right indentations:

+  if (! not_a_scratchpad_candidate)
+  {
+if (MEM_SIZE_KNOWN_P (orig_x))
+{
+  const size_t size_of_MEM = MEM_SIZE (orig_x);
+
+  if (size_of_MEM <= SCRATCHPAD_MAX_SIZE)
+  {
[...]
+  }
+}
+  }
+  return FALSE;

Just rewrite as:

if (not_a_scratchpad_candidate
|| !MEM_SIZE_KNOWN_P (orig_x))
  return FALSE;

const size_t size_of_MEM = MEM_SIZE (orig_x);
if (size_of_MEM > SCRATCHPAD_MAX_SIZE)
  return FALSE;

That will save 3 levels of indent.

Also some of your braces do not seem to be correctly placed.
Please use clang-format on your patch to solve the indentation issues.

Thanks,
Sebastian


On Wed, Oct 7, 2015 at 6:29 PM, Abe  wrote:
> Dear all,
>
> Attached please find my revised patch to the RTL if converter.  This patch
> enables the
> if-conversion of half-hammocks with a store in them that the internal GCC
> machinery
> otherwise considers too hazardous to if-convert.  This is made safe by using
> the
> "scratchpad" technique, i.e. throwing away the store into a safe location
> where nothing
> of any importance is currently stored.  The scratchpads are allocated in the
> stack frame.
>
> Here is an example of code which is newly converted with this patch,
> at least when targeting AArch64:
>
>   int A[10];
>
>   void half_hammock() {
> if (A[0])
>   A[1] = 2;
>   }
>
>
> Both tested against trunk and bootstrapped OK with defaults* on
> AMD64-AKA-"x86_64" GNU/Linux.
>
> '*': [except for "--prefix"]
>
>
> I'm sending the patch as an attachment to avoid it
> being corrupted/reformatted by any e-mail troubles.
>
> I look forward to your feedback.
>
> Regards,
>
> Abe
>


Re: [gomp4.1] Doacross library implementation

2015-10-08 Thread Torvald Riegel
On Thu, 2015-09-24 at 20:32 +0200, Jakub Jelinek wrote:
> Torvald, can you please have a look at it, if I got all the atomics / memory
> models right?

More detailed comments below, but in general, I'd really suggest to add
more code comments for the synchronization parts.  In the end, the level
of detail of documentation of libgomp is your decision, but, for
example, the lack of comments in synchronization code in glibc has made
maintaining this code and fixing issues in it very costly.  It has also
been hard to understand for many.

My suggestion would be both to (1) document the high-level, abstract
synchronization scheme and (2) how that scheme is implemented.  The
first point is important in my experience because typically, the
high-level scheme and the actual thinking behind it (or, IOW, the intent
of the original author) is much harder to reconstruct in case of
concurrent code than it is for sequential code; you can't just simply
follow the program along line by line, but have to consider
interleavings.

Even if the synchronization problem to solve is relatively
straight-forward as in thise case (ie, one-directional waiting), it's
worth IMO to do point (1).  If it is simple, the high-level description
will be simple, and it will assure others that one really has to just
solve that and that the original author wasn't aware of any other
issues.

Regarding point (2), what we're doing in glibc now is basically to
document how the specific things we do in the code make sure we
implement the high-level scheme.  So we'd say things like "this CAS here
now ensures consensus among the threads A, B, C".  For memory orders
specifically, it helps to document why they are sufficient and
necessary; this helps others understand the code, so that they don't
need to go hunting through all of the code looking for other accesses to
the same memory locations to be able to reconstruct the intended
happens-before relations.  I have some examples below.
Also, given that you don't use explicit atomic types but just atomic
operations, it's IMO a good idea to document which variables are
supposed to be accessed atomically so it becomes easier to not violate
the data-race-freedom requirement accidentally.

> The testcase obviously is not a good benchmark, we'll need
> some more realistic one.  But obviously when asking for oversubscription, it
> is quite expensive.  The question is how to implement a non-busy waiting
> fallback, whether we put some mutex and queue guarded by the mutex into the
> same (or some other?) cache-line, or just use atomics to queue it and how to
> make it cheap for the case where busy waiting is sufficient.

Atomics and futexes is probably the best approach if you want
performance; at least we need some efficient way for post() to figure
out that there are indeed waiters, and we don't want to use a lock for
that.

What specific approach to use is a question of how much time we want to
spend on this.  It's hard to estimate how often we'd really need a
blocking wait in practice, though.

> I'd say
> it should be sufficient to implement non-busy waiting in the flattened
> variant.

Sounds fine to me.

> --- libgomp/ordered.c.jj  2015-09-18 18:36:42.0 +0200
> +++ libgomp/ordered.c 2015-09-24 18:20:28.286244397 +0200
> @@ -252,14 +254,146 @@ GOMP_ordered_end (void)
>  {
>  }
>  
> +/* DOACROSS initialization.  */
> +
> +#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
> +
> +void
> +gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
> +{
> +  struct gomp_thread *thr = gomp_thread ();
> +  struct gomp_team *team = thr->ts.team;
> +  struct gomp_work_share *ws = thr->ts.work_share;
> +  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
> +  unsigned long ent, num_ents, elt_sz, shift_sz;
> +  struct gomp_doacross_work_share *doacross;
> +
> +  if (team == NULL || team->nthreads == 1)
> +return;
> +
> +  for (i = 0; i < ncounts; i++)
> +{
> +  /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
> +  if (counts[i] == 0)
> + return;
> +
> +  if (num_bits <= MAX_COLLAPSED_BITS)
> + {
> +   unsigned int this_bits;
> +   if (counts[i] == 1)
> + this_bits = 1;
> +   else
> + this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
> + - __builtin_clzl (counts[i] - 1);
> +   if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
> + {
> +   bits[i] = this_bits;
> +   num_bits += this_bits;
> + }
> +   else
> + num_bits = MAX_COLLAPSED_BITS + 1;
> + }
> +}
> +
> +  if (ws->sched == GFS_STATIC)
> +num_ents = team->nthreads;
> +  else
> +num_ents = (counts[0] - 1) / chunk_size + 1;
> +  if (num_bits <= MAX_COLLAPSED_BITS)
> +{
> +  elt_sz = sizeof (unsigned long);
> +  shift_sz = ncounts * sizeof (unsigned int);
> +}
> +  else
> +{
> +  elt_sz = sizeof (unsigned long) * ncounts;
> +  shift_sz = 0;
> +}
> +  e

Re: [PATCH] Unswitching outer loops.

2015-10-08 Thread Richard Biener
On Wed, Oct 7, 2015 at 5:26 PM, Yuri Rumyantsev  wrote:
> Richard,
>
> I noticed that 'gimple' type was changed and send you updated patch.

Ok.

Thanks,
Richard.

> Thanks.
> Yuri.
>
> 2015-10-07 12:53 GMT+03:00 Yuri Rumyantsev :
>> Richard,
>>
>> I've fixed adding virtual phi argument and add check on irreducible basic 
>> block.
>> New patch is attached.
>>
>> I checked it for bootstrap and regression testing, no new failures.
>>
>> ChangeLog:
>> 2015-10-07  Yuri Rumyantsev  
>>
>> * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
>> "cfghooks.h", add prototypes for introduced new functions.
>> (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
>> checks on ability of loop unswitching to tree_unswitch_single_loop;
>> invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
>> on innermost loop check.
>> (tree_unswitch_single_loop): Add all required checks on ability of
>> loop unswitching under zero recursive level guard.
>> (tree_unswitch_outer_loop): New function.
>> (find_loop_guard): Likewise.
>> (empty_bb_without_guard_p): Likewise.
>> (used_outside_loop_p): Likewise.
>> (get_vop_from_header): Likewise.
>> (hoist_guard): Likewise.
>> (check_exit_phi): Likewise.
>>
>>gcc/testsuite/ChangeLog:
>> * gcc.dg/loop-unswitch-2.c: New test.
>> * gcc.dg/loop-unswitch-3.c: Likewise.
>> * gcc.dg/loop-unswitch-4.c: Likewise.
>>
>>
>> 2015-10-06 15:21 GMT+03:00 Richard Biener :
>>> On Tue, Oct 6, 2015 at 1:41 PM, Yuri Rumyantsev  wrote:
 Richard,

 Here is updated patch which reflects almost all your remarks:
 1. Use ordinary get_loop_body.
 2. Delete useless asserts.
 3. Use check on iterated loop instead of finite_loop_p.
 4. Do not update CFG by adjusting the CONDs condition to always true/false.
 5. Add couple tests.
>>>
>>> +  /* Add NEW_ADGE argument for all phi in post-header block.  */
>>> +  bb = exit->dest;
>>> +  for (gphi_iterator gsi = gsi_start_phis (bb);
>>> +   !gsi_end_p (gsi); gsi_next (&gsi))
>>> +{
>>> +  gphi *phi = gsi.phi ();
>>> +  /* edge_iterator ei; */
>>> +  tree arg;
>>> +  if (virtual_operand_p (gimple_phi_result (phi)))
>>> +   {
>>> + arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
>>> + add_phi_arg (phi, arg, new_edge, UNKNOWN_LOCATION);
>>>
>>> now I know what confused me - here you are looking at a loop exit PHI
>>> but querying with the preheader edge index.  I think you need to walk
>>> the loop header PHIs to find the PHI for the virtual operand and use that
>>> to get the PHI arg from?
>>>
>>> The side-effect / used-outside code is still the same.  What matters
>>> is side-effects outside of the loop-header protected code region, not
>>> blocks excluding the inner loop.  Say,
>>>
>>>   for (;;)
>>> {
>>>   if (invariant-guard)
>>> {
>>>printf ("Blah");
>>>for (;;)
>>>  ;
>>> }
>>> }
>>>
>>> would still ok to be unswitched.  So instead of
>>>
>>> +  if (body[i]->loop_father != loop)
>>> +   continue;
>>>
>>> it would be
>>>
>>>if (dominated_by_p (CDI_DOMINATORS, body[i], header)
>>>&& !dominated_by_p (CDI_DOMINATORS, body[i], fe->dest))
>>>
>>> with the obvious improvement to the patch to not only consider header checks
>>> in the outer loop header but in the pre-header block of the inner loop.
>>>
>>> And I still think you should walk the exit PHIs args to see whether they
>>> are defined in the non-guarded region of the outer loop instead of walking
>>> all uses of all defs.
>>>
>>> Note that I think you miss endless loops as side-effects if that endless
>>> loop occurs through a irreducible region (thus not reflected in the
>>> loop tree).  Thus you should reject BB_IRREDUCIBLE_LOOP blocks
>>> in the non-guarded region as well.
>>>
>>> It seems to me that protecting adjacent loops with a single guard is
>>> also eligible for hoisting thus the restriction on loop->inner->next
>>> should become a restriction on no loops (or irreducible regions)
>>> in the non-guarded region.
>>>
>>> Most things can be improved as followup, but at least the
>>> virtual PHI arg thing needs to be sorted out.
>>>
>>> Thanks,
>>> Richard.
>>>
>>>
 ChangeLog:
 2015-10-06  Yuri Rumyantsev  

 * tree-ssa-loop-unswitch.c: Include "gimple-iterator.h" and
 "cfghooks.h", add prototypes for introduced new functions.
 (tree_ssa_unswitch_loops): Use from innermost loop iterator, move all
 checks on ability of loop unswitching to tree_unswitch_single_loop;
 invoke tree_unswitch_single_loop or tree_unswitch_outer_loop depending
 on innermost loop check.
 (tree_unswitch_single_loop): Add all required checks on ability of
 loop unswitching under zero recursive level guard.
 (tree_unswitch_outer_loop): New function.
 (find_loop_guard): Likewise.
 (empty_bb_without_guard_p): Likewise.
 (used_outside_loop_p): Likewise.
 (hoi

Re: [PATCH V3][GCC] Algorithmic optimization in match and simplify

2015-10-08 Thread Richard Biener
On Wed, Oct 7, 2015 at 10:21 AM, Andre Vieira
 wrote:
> On 25/09/15 12:42, Richard Biener wrote:
>>
>> On Fri, Sep 25, 2015 at 1:30 PM, Andre Vieira
>>  wrote:
>>>
>>> On 17/09/15 10:46, Richard Biener wrote:


 On Thu, Sep 3, 2015 at 1:11 PM, Andre Vieira
  wrote:
>
>
> On 01/09/15 15:01, Richard Biener wrote:
>>
>>
>>
>> On Tue, Sep 1, 2015 at 3:40 PM, Andre Vieira
>>  wrote:
>>>
>>>
>>>
>>> Hi Marc,
>>>
>>> On 28/08/15 19:07, Marc Glisse wrote:




 (not a review, I haven't even read the whole patch)

 On Fri, 28 Aug 2015, Andre Vieira wrote:

> 2015-08-03  Andre Vieira  
>
>  * match.pd: Added new patterns:
>((X {&,<<,>>} C0) {|,^} C1) {^,|} C2)
>(X {|,^,&} C0) {<<,>>} C1 -> (X {<<,>>} C1) {|,^,&} (C0
> {<<,>>}
> C1)





 +(for op0 (rshift rshift lshift lshift bit_and bit_and)
 + op1 (bit_ior bit_xor bit_ior bit_xor bit_ior bit_xor)
 + op2 (bit_xor bit_ior bit_xor bit_ior bit_xor bit_ior)

 You can nest for-loops, it seems clearer as:
 (for op0 (rshift lshift bit_and)
   (for op1 (bit_ior bit_xor)
op2 (bit_xor bit_ior)
>>>
>>>
>>>
>>>
>>>
>>> Will do, thank you for pointing it out.





 +(simplify
 + (op2:c
 +  (op1:c
 +   (op0 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)

 I suspect you will want more :s (single_use) and less :c
 (canonicalization
 should put constants in second position).

>>> I can't find the definition of :s (single_use).
>>
>>
>>
>>
>> Sorry for that - didn't get along updating it yet :/  It restricts
>> matching to
>> sub-expressions that have a single-use.  So
>>
>> +  a &= 0xd123;
>> +  unsigned short tem = a ^ 0x6040;
>> +  a = tem | 0xc031; /* Simplify _not_ to ((a & 0xd123) | 0xe071).  */
>> ... use of tem ...
>>
>> we shouldn't do the simplifcation here because the expression
>> (a & 0x123) ^ 0x6040 is kept live by 'tem'.
>>
>>> GCC internals do point out
>>> that canonicalization does put constants in the second position,
>>> didnt
>>> see
>>> that first. Thank you for pointing it out.
>>>
 +   C1 = wi::bit_and_not (C1,C2);

 Space after ','.

>>> Will do.
>>>
 Having wide_int_storage in many places is surprising, I can't find
 similar
 code anywhere else in gcc.


>>>
>>> I tried looking for examples of something similar, I think I ended up
>>> using
>>> wide_int because I was able to convert easily to and from it and it
>>> has
>>> the
>>> "mask" and "wide_int_to_tree" functions. I welcome any suggestions on
>>> what I
>>> should be using here for integer constant transformations and
>>> comparisons.
>>
>>
>>
>>
>> Using wide-ints is fine, but you shouldn't need 'wide_int_storage'
>> constructors - those
>> are indeed odd.  Is it just for the initializers of wide-ints?
>>
>> +wide_int zero_mask = wi::zero (prec);
>> +wide_int C0 = wide_int_storage (@1);
>> +wide_int C1 = wide_int_storage (@2);
>> +wide_int C2 = wide_int_storage (@3);
>> ...
>> +   zero_mask = wide_int_storage (wi::mask (C0.to_uhwi (), false,
>> prec));
>>
>> tree_to_uhwi (@1) should do the trick as well
>>
>> +   C1 = wi::bit_and_not (C1,C2);
>> +   cst_emit = wi::bit_or (C1, C2);
>>
>> the ops should be replacable with @2 and @3, the store to C1 obviously
>> not
>> (but you can use a tree temporary and use wide_int_to_tree here to
>> avoid
>> the back-and-forth for the case where C1 is not assigned to).
>>
>> Note that transforms only doing association are prone to endless
>> recursion
>> in case some other pattern does the reverse op...
>>
>> Richard.
>>
>>
>>> BR,
>>> Andre
>>>
>>
> Thank you for all the comments, see reworked version:
>
> Two new algorithmic optimisations:
> 1.((X op0 C0) op1 C1) op2 C2)
>   with op0 = {&, >>, <<}, op1 = {|,^}, op2 = {|,^} and op1 != op2
>   zero_mask has 1's for all bits that are sure to be 0 in (X op0
> C0)
>   and 0's otherwise.
>   if (op1 == '^') C1 &= ~C2 (Only changed if actually emitted)
>   if ((C1 & ~zero_mask) == 0) then emit (X op0 C0) op2 (C1 op2 C2)
>   if ((C2 & ~zero_mask) == 0) then emit (X op0 C0) op1 (C1 op2 C2)
> 2. (X {|,^,&} C0) {<<,>>} C1 -> (X {<<,>>} C1) {|,^,&} (C0 {<<,>>}
> C1)
>
>
> This patch

PING: [PATCH] PR target/35514: Gcc shoud generate symbol type for undefined symbol

2015-10-08 Thread H.J. Lu
On Wed, Jul 8, 2015 at 1:35 PM, H.J. Lu  wrote:
> On Sun, Jul 5, 2015 at 2:54 PM, H.J. Lu  wrote:
>> On Sun, Jul 5, 2015 at 11:14 AM, H.J. Lu  wrote:
>>> Update default_elf_asm_output_external to also output symbol type to
>>> help ELF linker to properly issue diagnostic message.  We don't output
>>> symbol type for reference to external TLS symbol since assembler will
>>> generate TLS symbol type based on TLS relocation and Solaris assembler
>>> only supports the @tls_obj type directive, not the @tls_object type
>>> directive used by GNU assembler, which doesn't understand the @tls_obj
>>> type directive.
>>>
>>> Tested on Linux/x86-64.  OK for trunk?
>>>
>>> Thanks.
>>>
>>>
>>> H.J.
>>> ---
>>> gcc/
>>>
>>> PR target/35514
>>> * varasm.c (default_elf_asm_output_external): Also output symbol
>>> type.
>>>
>>> gcc/testsuite/
>>>
>>> PR target/35514
>>> * lib/target-supports.exp (check_effective_target_elf): New.
>>> * gcc.dg/pr35514-1.c: New file.
>>> * gcc.dg/pr35514-2.c: Likewise.

>
> Here is the updated patch to adjust those tests by counting in
> the .type directive for ELF targets.  There is no regression on
> Linux/x86-64.  OK for trunk?
>

PING:

https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00652.html

-- 
H.J.


Re: [ARM] Use vector wide add for mixed-mode adds

2015-10-08 Thread Kyrill Tkachov

Hi Michael,

On 01/10/15 11:05, Michael Collison wrote:

Kyrill,

I have modified the patch to address your comments. I also modified
check_effective_target_vect_widen_sum_hi_to_si_pattern in
target-supports.exp to
indicate that arm neon supports vector widen sum of HImode to SImode.
This resolved
several test suite failures.

Successfully tested on arm-none-eabi, arm-none-linux-gnueabihf. I have
four related execution failure
tests on armeb-non-linux-gnueabihf with -flto only.

gcc.dg/vect/vect-outer-4f.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4g.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4k.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4l.c -flto -ffat-lto-objects execution test


We'd want to get to the bottom of these before committing.
Does codegen before and after the patch show anything?
When it comes to big-endian and NEON, the fiddly parts are
usually lane numbers. Do you need to select the proper lanes with
ENDIAN_LANE_N like Charles in his patch at:
https://gcc.gnu.org/ml/gcc-patches/2015-10/msg00656.html?

Thanks,
Kyrill



I am debugging but have not tracked down the root cause yet. Feedback?

2015-07-22  Michael Collison  

  * config/arm/neon.md (widen_sum): New patterns
  where mode is VQI to improve mixed mode vectorization.
  * config/arm/neon.md (vec_sel_widen_ssum_lo3): New
  define_insn to match low half of signed vaddw.
  * config/arm/neon.md (vec_sel_widen_ssum_hi3): New
  define_insn to match high half of signed vaddw.
  * config/arm/neon.md (vec_sel_widen_usum_lo3): New
  define_insn to match low half of unsigned vaddw.
  * config/arm/neon.md (vec_sel_widen_usum_hi3): New
  define_insn to match high half of unsigned vaddw.
  * testsuite/gcc.target/arm/neon-vaddws16.c: New test.
  * testsuite/gcc.target/arm/neon-vaddws32.c: New test.
  * testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
  * testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
  * testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
  * testsuite/lib/target-supports.exp
  (check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
  that arm neon supports vector widen sum of HImode to SImode.


Note that the testsuite changes should have their own ChangeLog entry
with the paths there starting relative to gcc/testsuite/



On 09/23/2015 01:49 AM, Kyrill Tkachov wrote:

Hi Michael,

On 23/09/15 00:52, Michael Collison wrote:

This is a modified version of the previous patch that removes the
documentation and read-md.c fixes. These patches have been submitted
separately and approved.

This patch is designed to address code that was not being vectorized due
to missing widening patterns in the ARM backend. Code such as:

int t6(int len, void * dummy, short * __restrict x)
{
 len = len & ~31;
 int result = 0;
 __asm volatile ("");
 for (int i = 0; i < len; i++)
   result += x[i];
 return result;
}

Validated on arm-none-eabi, arm-none-linux-gnueabi,
arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.

2015-09-22  Michael Collison 

   * config/arm/neon.md (widen_sum): New patterns
   where mode is VQI to improve mixed mode add vectorization.


Please list all the new define_expands and define_insns
in the changelog. Also, please add an ChangeLog entry for
the testsuite additions.

The approach looks ok to me with a few comments on some
parts of the patch itself.


+(define_insn "vec_sel_widen_ssum_hi3"
+  [(set (match_operand: 0 "s_register_operand" "=w")
+(plus: (sign_extend: (vec_select:VW
(match_operand:VQI 1 "s_register_operand" "%w")
+   (match_operand:VQI 2
"vect_par_constant_high" "")))
+(match_operand: 3 "s_register_operand"
"0")))]
+  "TARGET_NEON"
+  "vaddw.\t%q0, %q3, %f1"
+  [(set_attr "type" "neon_add_widen")
+  (set_attr "length" "8")]
+)


This is a single instruction, and it has a length of 4, so no need to
override the length attribute.
Same with the other define_insns in this patch.


diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 000..ed10669
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */

The arm_neon_hw check is usually used when you want to run the tests.
Since this is a compile-only tests you just need arm_neon_ok.

  +/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
+
+
+

Stray trailing newlines. Similar comments for the other testcases.

Thanks,
Kyrill





Re: [ARM] Add ARMv8.1 command line options.

2015-10-08 Thread Matthew Wahab

Ping.

Updated patch attached, I've broken the over-long lines added to arm-arches.def and 
arm-fpus.def.


Matthew

On 17/09/15 18:54, Matthew Wahab wrote:

Hello,

ARMv8.1 is a set of architectural extensions to ARMv8. Support has been
enabled in binutils for ARMv8.1 for the architecture, using the name
"armv8.1-a".

This patch adds support to gcc for specifying an ARMv8.1 architecture
using options "-march=armv8.1-a" and "-march=armv8.1-a+crc". It also
adds the FPU options "-mfpu=neon-fp-armv8.1" and
"-mfpu=crypto-neon-fp-armv8.1", to specify the ARMv8.1 Adv.SIMD
instruction set.  The changes set the appropriate architecture and fpu
options for binutils but don't otherwise change the code generated by
gcc.

Tested for arm-none-linux-gnueabihf with native bootstrap and make
check.

Ok for trunk?
Matthew

2015-09-17  Matthew Wahab  

 * config/arm/arm-arches.def: Add "armv8.1-a" and "armv8.1-a+crc".
 * config/arm/arm-fpus.def: Add "neon-fp-armv8.1" and
 "crypto-neon-fp-armv8.1".
 * config/arm/arm-protos.h (FL2_ARCH8_1): New.
 (FL2_FOR_ARCH8_1A): New.
 * config/arm/arm-tables.opt: Regenerate.
 * config/arm/arm.h (FPU_FL_RDMA): New.
 * doc/invoke.texi (ARM -march): Add "armv8.1-a" and
 "armv8.1-a+crc".
 (ARM -mfpu): Add "neon-fp-armv8.1" and "crypto-neon-fp-armv8.1".


diff --git a/gcc/config/arm/arm-arches.def b/gcc/config/arm/arm-arches.def
index ddf6c3c..2635c7b 100644
--- a/gcc/config/arm/arm-arches.def
+++ b/gcc/config/arm/arm-arches.def
@@ -57,6 +57,11 @@ ARM_ARCH("armv7-m", cortexm3,	7M,	ARM_FSET_MAKE_CPU1 (FL_CO_PROC |	  FL_FOR_
 ARM_ARCH("armv7e-m", cortexm4,  7EM,	ARM_FSET_MAKE_CPU1 (FL_CO_PROC |	  FL_FOR_ARCH7EM))
 ARM_ARCH("armv8-a", cortexa53,  8A,	ARM_FSET_MAKE_CPU1 (FL_CO_PROC | FL_FOR_ARCH8A))
 ARM_ARCH("armv8-a+crc",cortexa53, 8A,   ARM_FSET_MAKE_CPU1 (FL_CO_PROC | FL_CRC32  | FL_FOR_ARCH8A))
+ARM_ARCH("armv8.1-a", cortexa53,  8A,
+	 ARM_FSET_MAKE (FL_CO_PROC | FL_FOR_ARCH8A,  FL2_FOR_ARCH8_1A))
+ARM_ARCH("armv8.1-a+crc",cortexa53, 8A,
+	 ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A,
+			FL2_FOR_ARCH8_1A))
 ARM_ARCH("iwmmxt",  iwmmxt, 5TE,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT))
 ARM_ARCH("iwmmxt2", iwmmxt2,5TE,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT | FL_IWMMXT2))
 
diff --git a/gcc/config/arm/arm-fpus.def b/gcc/config/arm/arm-fpus.def
index efd5896..2c7b82e 100644
--- a/gcc/config/arm/arm-fpus.def
+++ b/gcc/config/arm/arm-fpus.def
@@ -44,5 +44,9 @@ ARM_FPU("fp-armv8",	ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_FP16)
 ARM_FPU("neon-fp-armv8",ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16)
 ARM_FPU("crypto-neon-fp-armv8",
 			ARM_FP_MODEL_VFP, 8, VFP_REG_D32, FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_CRYPTO)
+ARM_FPU("neon-fp-armv8.1", ARM_FP_MODEL_VFP, 8, VFP_REG_D32,
+	FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_RDMA)
+ARM_FPU("crypto-neon-fp-armv8.1", ARM_FP_MODEL_VFP, 8, VFP_REG_D32,
+	FPU_FL_NEON | FPU_FL_FP16 | FPU_FL_RDMA | FPU_FL_CRYPTO)
 /* Compatibility aliases.  */
 ARM_FPU("vfp3",		ARM_FP_MODEL_VFP, 3, VFP_REG_D32, FPU_FL_NONE)
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index f9b1276..9631ac9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -387,6 +387,8 @@ extern bool arm_is_constant_pool_ref (rtx);
 #define FL_IWMMXT2(1 << 30)   /* "Intel Wireless MMX2 technology".  */
 #define FL_ARCH6KZ(1 << 31)   /* ARMv6KZ architecture.  */
 
+#define FL2_ARCH8_1   (1 << 0)	  /* Architecture 8.1.  */
+
 /* Flags that only effect tuning, not available instructions.  */
 #define FL_TUNE		(FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \
 			 | FL_CO_PROC)
@@ -415,6 +417,7 @@ extern bool arm_is_constant_pool_ref (rtx);
 #define FL_FOR_ARCH7M	(FL_FOR_ARCH7 | FL_THUMB_DIV)
 #define FL_FOR_ARCH7EM  (FL_FOR_ARCH7M | FL_ARCH7EM)
 #define FL_FOR_ARCH8A	(FL_FOR_ARCH7VE | FL_ARCH8)
+#define FL2_FOR_ARCH8_1A	FL2_ARCH8_1
 
 /* There are too many feature bits to fit in a single word so the set of cpu and
fpu capabilities is a structure.  A feature set is created and manipulated
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 87c9f90..4037933 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -320,6 +320,7 @@ typedef unsigned long arm_fpu_feature_set;
 #define FPU_FL_NEON	(1 << 0)	/* NEON instructions.  */
 #define FPU_FL_FP16	(1 << 1)	/* Half-precision.  */
 #define FPU_FL_CRYPTO	(1 << 2)	/* Crypto extensions.  */
+#define FPU_FL_RDMA	(1 << 3)	/* ARMv8.1 extensions.  */
 
 /* Which floating point model to use.  */
 enum arm_fp_model
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 3a9594c..2dd89a3 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13372,8 +13372,8 @@ of the @option{-mcpu=} option.  Permissible names are: @samp{armv2},
 @samp{armv6}, @samp{armv6j},
 @samp{armv6t2}, @samp{armv6z}, @samp{armv6kz}, @samp{

Re: Do not use TYPE_CANONICAL in useless_type_conversion

2015-10-08 Thread Eric Botcazou
> Thank you! I commited the patch.

It breaks the Ada build on x86-64 though:

eric@polaris:~/build/gcc/native/gcc/ada/rts> 
/home/eric/build/gcc/native/./gcc/xgcc -B/home/eric/build/gcc/native/./gcc/ -
B/home/eric/install/gcc/x86_64-suse-linux/bin/ -
B/home/eric/install/gcc/x86_64-suse-linux/lib/ -isystem 
/home/eric/install/gcc/x86_64-suse-linux/include -isystem 
/home/eric/install/gcc/x86_64-suse-linux/sys-include -c -g -O2  -fpic  -W -
Wall -gnatpg -nostdinc   s-regpat.adb -o s-regpat.o
+===GNAT BUG DETECTED==+
| 6.0.0 20151008 (experimental) [trunk revision 228597] (x86_64-suse-linux) 
GCC error:|
| in gen_lowpart_common, at emit-rtl.c:1399|
| Error detected around s-regpat.adb:1029:22   |
| Please submit a bug report; see http://gcc.gnu.org/bugs.html.|
| Use a subject line meaningful to you and us to track the bug.|
| Include the entire contents of this bug box in the report.   |
| Include the exact command that you entered.  |
| Also include sources listed below.   |
+==+

-- 
Eric Botcazou


  1   2   >