[PATCH, gomp4] Propagate independent clause for OpenACC kernels pass

2015-07-13 Thread Chung-Lin Tang
Hi Tom,
this patch provides a 'bool marked_independent' field in struct loop, which
will be switched on by an "independent" clause in a #pragma acc loop directive.
I assume you'll be wiring it to the kernels parloops pass in a followup patch.

Note: there are already a few other similar fields in struct loop, namely
'safelen' and 'can_be_parallel', used by OMP simd safelen and GRAPHITE 
respectively.
The intention and/or setting of these fields are all a bit different, so I've
decided to add a new bool for OpenACC.

Tested and committed to gomp-4_0-branch.

Chung-Lin

2015-07-14  Chung-Lin Tang  

* cfgloop.h (struct loop): Add 'bool marked_independent' field.
* gimplify.c (gimplify_scan_omp_clauses): Keep OMP_CLAUSE_INDEPENDENT.
* omp-low.c (struct omp_region): Add 'int kind' and
'bool independent' fields.
(expand_omp_for): Set 'marked_independent' field for loop
corresponding to region.
(find_omp_for_region_data): New function.
(find_omp_target_region_data): Set kind field.
(build_omp_regions_1): Call find_omp_for_region_data() for
GIMPLE_OMP_FOR statements.
Index: cfgloop.h
===
--- cfgloop.h	(revision 225758)
+++ cfgloop.h	(working copy)
@@ -194,6 +194,10 @@ struct GTY ((chain_next ("%h.next"))) loop {
   /* True if the loop is part of an oacc kernels region.  */
   bool in_oacc_kernels_region;
 
+  /* True if loop is tagged as having independent iterations by user,
+ e.g. the OpenACC independent clause.  */
+  bool marked_independent;
+
   /* For SIMD loops, this is a unique identifier of the loop, referenced
  by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE
  builtins.  */
Index: gimplify.c
===
--- gimplify.c	(revision 225758)
+++ gimplify.c	(working copy)
@@ -6602,7 +6602,6 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_se
 	  break;
 
 	case OMP_CLAUSE_DEVICE_RESIDENT:
-	case OMP_CLAUSE_INDEPENDENT:
 	  remove = true;
 	  break;
 
@@ -6612,6 +6611,7 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_se
 	case OMP_CLAUSE_COLLAPSE:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
+	case OMP_CLAUSE_INDEPENDENT:
 	case OMP_CLAUSE_MERGEABLE:
 	case OMP_CLAUSE_PROC_BIND:
 	case OMP_CLAUSE_SAFELEN:
Index: omp-low.c
===
--- omp-low.c	(revision 225758)
+++ omp-low.c	(working copy)
@@ -136,8 +136,16 @@ struct omp_region
   /* True if this is nested inside an OpenACC kernels construct.  */
   bool inside_kernels_p;
 
+  /* Records a generic kind field.  */
+  int kind;
+
   /* For an OpenACC loop, the level of parallelism requested.  */
   int gwv_this;
+
+  /* For an OpenACC loop directive, true if has the 'independent' clause.  */
+  bool independent;
+
+  tree broadcast_array;
 };
 
 /* Context structure.  Used to store information about each parallel
@@ -8273,8 +8281,15 @@ expand_omp_for (struct omp_region *region, gimple
 loops_state_set (LOOPS_NEED_FIXUP);
 
   if (region->inside_kernels_p)
-expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
-			inner_stmt);
+{
+  expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+			  inner_stmt);
+  if (region->independent && region->cont->loop_father)
+	{
+	  struct loop *loop = region->cont->loop_father; 
+	  loop->marked_independent = true;
+	}
+}
   else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
 expand_omp_simd (region, &fd);
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_CILKFOR)
@@ -9943,6 +9958,34 @@ find_omp_for_region_gwv (gimple stmt)
   return tmp;
 }
 
+static void
+find_omp_for_region_data (struct omp_region *region, gomp_for *stmt)
+{
+  region->gwv_this = find_omp_for_region_gwv (stmt);
+  region->kind = gimple_omp_for_kind (stmt);
+
+  if (region->kind == GF_OMP_FOR_KIND_OACC_LOOP)
+{
+  struct omp_region *target_region = region->outer;
+  while (target_region
+	 && target_region->type != GIMPLE_OMP_TARGET)
+	target_region = target_region->outer;
+  if (!target_region)
+	return;
+
+  tree clauses = gimple_omp_for_clauses (stmt);
+
+  if (target_region->kind == GF_OMP_TARGET_KIND_OACC_PARALLEL
+	  && !find_omp_clause (clauses, OMP_CLAUSE_SEQ))
+	/* In OpenACC parallel constructs, 'independent' is implied on all
+	   loop directives without a 'seq' clause.  */
+	region->independent = true;
+  else if (target_region->kind == GF_OMP_TARGET_KIND_OACC_KERNELS
+	   && find_omp_clause (clauses, OMP_CLAUSE_INDEPENDENT))
+	region->independent = true;
+}
+}
+
 /* Fill in additional data for a region REGION associated with an
OMP_TARGET STMT.  */
 
@@ -9960,6 +10003,7 @@ find_omp_target_region_data (struct omp_region *re
 region->gwv_this |= OACC_LOOP_MASK (OACC_worker);
   if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LE

Re: [PATCH][4/n] Remove GENERIC stmt combining from SCCVN

2015-07-13 Thread Jeff Law

On 07/13/2015 03:32 AM, Richard Biener wrote:

On Mon, 13 Jul 2015, Richard Biener wrote:


On Sun, 12 Jul 2015, Jeff Law wrote:


On 06/29/2015 01:58 AM, Richard Biener wrote:


In principle the following works for the testcase (even w/o fixing
the VRP part).

Index: gcc/tree-ssa-dom.c
===
--- gcc/tree-ssa-dom.c  (revision 225007)
+++ gcc/tree-ssa-dom.c  (working copy)
@@ -1409,6 +1409,14 @@ simplify_stmt_for_jump_threading (gimple
 return lookup_avail_expr (stmt, false);
   }

+static tree
+dom_valueize (tree t)
+{
+  if (TREE_CODE (t) == SSA_NAME)
+return SSA_NAME_VALUE (t);
+  return t;
+}
+
   /* Record into the equivalence tables any equivalences implied by
  traversing edge E (which are cached in E->aux).

@@ -1429,7 +1437,33 @@ record_temporary_equivalences (edge e)

 /* If we have a simple NAME = VALUE equivalence, record it.  */
 if (lhs && TREE_CODE (lhs) == SSA_NAME)
-   const_and_copies->record_const_or_copy (lhs, rhs);
+   {
+ gimple use_stmt;
+ imm_use_iterator iter;
+ const_and_copies->record_const_or_copy (lhs, rhs);
+ FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
+   {
+ /* Only bother to record more equivalences for lhs that
+can be directly used by e->dest.
+???  If the code gets re-organized to a worklist to
+catch more indirect opportunities and it is made to
+handle PHIs then this should only consider use_stmts
+in basic-blocks we have already visited.  */
+ if (!dominated_by_p (CDI_DOMINATORS,
+  e->dest, gimple_bb (use_stmt)))
+   continue;
+ tree lhs = gimple_get_lhs (use_stmt);
+ if (lhs && TREE_CODE (lhs) == SSA_NAME)
+   {
+ tree res = gimple_fold_stmt_to_constant_1 (use_stmt,
+dom_valueize,
+
no_follow_ssa_edges);
+ if (TREE_CODE (res) == SSA_NAME
+ || is_gimple_min_invariant (res))
+   const_and_copies->record_const_or_copy (lhs, res);
+   }
+   }
+   }

 /* If we have 0 = COND or 1 = COND equivalences, record them
   into our expression hash tables.  */


it's not using DOMs own stmt visiting machinery as that always modifies
stmts in-place.  As stated in the comment it doesn't catch secondary
opportunities.  That would be possible by using a work-list seeded
by LHS we recorded new const/copies for and re-visiting their uses.
You can get extra fancy here by properly handling PHIs and
conditionals.  But it's a question of cost here, of course.

Right, the code you're modifying is only used by jump threading to record
temporary equivalences, particularly equivalences that are specific to a path.




Note that I think this isn't really "backward propagation" but
just context sensitive value-numbering.

I think that's because we're looking at the problem differently.  It's
certainly not backward propagation in the traditional dataflow sense, so I'm
probably being too loose with terminology here.

When we discover something about X by means other than the definition of X, we
can look at how X was set and possibly discover a value for source operands of
that statement.  Similarly we can look at uses of X and possibly discover a
value for the destination of those statement(s).  In both cases we're going
backwards from an order-of-execution point of view and recording additional
equivalences.

The existing code did the former (look at X's defining statement and try to
discover an equivalence for a source operand in that statement). What we need
to optimize this case is the latter.

I *think* these are closely enough related that some code can be factored out
a bit and reused in both r_e_f_i_e and r_t_e to discover both types of
equivalences for DOM and for jump threading.


Indeed - the odd thing here is that one function uses
const_and_copies->record_const_or_copy directly while the other one
record_equality (this function is _solely_ used by
record_equivalences_from_incoming_edge).  I didn't want to introduce
a callback to commonize the code (though in principle we could use
a template function with a function template parameter...)

That said, I don't see that record_equality does sth not suitable
if called from record_temporary_equivalences.  So if we make
use of that function we could simply call record_temporary_equivalences
from record_equivalences_from_incoming_edge.


So, like the following.

Bootstrapped on x86_64-unknown-linux-gnu - ok if testing succeeds?

Thanks,
Richard.

2015-07-13  Richard Biener  

* tree-ssa-dom.c (record_temporary_equivalences): Merge
widening type conversion case from record_equivalences_from_incoming_edge
and use record_equality to record eq

[Patch wwwdocs] gcc-6/changes.html : Document AMD monitorx and mwaitx

2015-07-13 Thread Kumar, Venkataramanan
Hi Richard and Gerald,

This patch adds the documentation in changes.html for the GCC trunk (gcc-6).

Please let me know if it is good to commit.

Index: htdocs/gcc-6/changes.html
===
RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-6/changes.html,v
retrieving revision 1.13
diff -r1.13 changes.html
100,101c100,112
< 
< 
---
> IA-32/x86-64
>
>  
>Support for new AMD instructions monitorx and
>mwaitx has been added. This includes new intrinsic
>and built-in support. It is enabled through option 
> -mmwaitx.
>The instructions monitorx and mwaitx
>implement the same functionality as the old monitor
>and mwait instructions. In addition mwaitx
>adds a configurable timer. The timer value is received as third
>argument and stored in register %ebx.
>  
>

Regards,
Venkat.


Re: [PATCH, rtl-optimization]: Fix PR66838, Calling multiple SYSV AMD64 ABI functions from MS x64 ABI one results in clobbered parameters

2015-07-13 Thread Jeff Law

On 07/12/2015 12:35 PM, Uros Bizjak wrote:

Another missing case of CALL_INSN_FUNCTION_USAGE, where clobbered
registers are also marked, this time in postreload/
reload_cse_move2add.

Fixed compiler now generates following code

        call    sysv_abi_func
        movl    $global, %esi
        movl    $.LC2, %edi
        call    sysv_abi_func
        movl    $global, %esi
        movl    $.LC3, %edi
        call    sysv_abi_func

which correctly reloads %esi for every sysv_abi function call.

2015-07-12  Uros Bizjak  

 PR rtl-optimization/66838
 * postreload.c (reload_cse_move2add): Also process
 CALL_INSN_FUNCTION_USAGE when resetting information of
 call-clobbered registers.

testsuite/ChangeLog:

2015-07-12  Uros Bizjak  

 PR rtl-optimization/66838
 * gcc.target/i386/pr66838.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

OK for mainline and gcc-5 branch?

OK for the trunk.

jeff


Re: [patch, driver] Ignore -ftree-parallelize-loops={0,1}

2015-07-13 Thread Jeff Law

On 07/13/2015 04:58 AM, Tom de Vries wrote:

On 07/07/15 09:53, Tom de Vries wrote:

Hi,

currently, we have these spec strings in gcc/gcc.c involving
ftree-parallelize-loops:
...
%{fopenacc|fopenmp|ftree-parallelize-loops=*:%:include(libgomp.spec)%(link_gomp)}


%{fopenacc|fopenmp|ftree-parallelize-loops=*:-pthread}"
...

Actually, ftree-parallelize-loops={0,1} means that no parallelization is
done, but these spec strings still get activated for these values.


Attached patch fixes that, by introducing a spec function gt (short for
greater than), and using it in the spec lines.



Attached (untested) patch manages the same, without introducing the spec
function 'gt'. But the solution is a bit convoluted, so I prefer the one
with the gt function.

I prefer the one with the gt function :-)

jeff


Re: [PATCH, RFC] combine: Don't create insv insns unless HAVE_insv

2015-07-13 Thread Jeff Law

On 07/12/2015 07:56 AM, Segher Boessenkool wrote:

Currently combine tries to make assignments to bitfields (of a register)
whenever it can.  If the target has no insv pattern, the result will not
ever match (if the MD is sane at all).  Doing insv on registers generates
worse code than what you get if you express things directly (with and/ior),
so many targets do not _want_ to have insv patterns.

This patch changes combine to not generate insv patterns if the target
does not have any.

Bootstrapped and regression checked on powerpc64-linux (with and without
insv patterns there).  Also built on many other targets, for many months.

I'm vaguely aware there have been changes to extzv etc. so there now are
extzv; I'll investigate if that means anything for insv as well.
It's also a new #ifdef HAVE_xxx.  But we're not clean there yet so I hope
to get away with that ;-)

Comments?  Complaints?
Well, I'd rather avoid the #ifdef.  Just because we aren't clean yet 
doesn't mean we need to introduce more stuff to clean up later.


It'd also be nice to have a testcase or two.  Guessing they'd be target 
dependent, but that's OK with me.


jeff



Re: [PATCH 1/2, rtl-optimization]: Fix PR 58066, __tls_get_addr is called with misaligned stack on x86-64

2015-07-13 Thread Jeff Law

On 07/13/2015 11:03 AM, Uros Bizjak wrote:

This is rtl-optimization part of a two-part patch series.

As discussed in the PR, we have to precompute register parameters
before stack alignment is performed, otherwise eventual call to
__tls_get_addr can be called with unaligned stack. When compiling the
testcase from the PR, anti_adjust_stack is called just before
precompute starts expanding function parameters.

The solution is to move precomputation before stack pointer is adjusted.

2015-07-13  Uros Bizjak  

 PR rtl-optimization/58066
 * calls.c (expand_call): Precompute register parameters before stack
 alignment is performed.

Patch was bootstrapped and regression tested on x86_64-linux-gnu
{,-m32} for all default languages, obj-c++ and go.

OK for mainline?
OK once a comment is added indicating why we have to precompute before 
the anti-adjust-stack.


jeff



[PING] Re: [PATCH] c/66516 - missing diagnostic on taking the address of a builtin function

2015-07-13 Thread Martin Sebor

[CC Jason since the patch also touches the C++ front end]

The updated patch is here:
  https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00258.html

Thanks
Martin

On 07/04/2015 04:32 PM, Martin Sebor wrote:

I don't think c_validate_addressable is a good name - given that it's
called for lots of things that aren't addressable, in contexts in which
there is no need for them to be addressable, and doesn't do checks of
addressability in contexts where they are actually needed and done
elsewhere (e.g. checks for not taking the address of a register
variable).
The question seems to be something more like: is the expression used
as an
operand something it's OK to use as an operand at all?


Thank you for the review.

I've changed the name to c_reject_gcc_builtin. If you would prefer
a different name still please suggest one. I'm not partial to any
one in particular.


What is the logic for the list of functions above being a complete
list of
the places that need changes?


The logic by which I arrived at the changes was by constructing
test cases exercising expressions where a function is a valid
operand and where its address might need to be obtained when
it's not available, and stepping through the code and modifying
it until I found a suitable place to change to reject it.


How can ifexp be of pointer type?  It's undergone truthvalue conversion
and should always be of type int at this point (but in any case, you
can't
refer to TREE_OPERAND (ifexp, 0) without knowing what sort of expression
it is).


I've removed the redundant test from this function.




+/* For EXPR that is an ADDR_EXPR or whose type is a FUNCTION_TYPE,
+   determines whether its operand can have its address taken issues
+   an error pointing to the location LOC.
+   Operands that cannot have their address taken are builtin functions
+   that have no library fallback (no other kinds of expressions are
+   considered).
+   Returns true when the expression can have its address taken and
+   false otherwise.  */


Apart from the naming issue, the comment says nothing about the semantics
of the function for an argument that's not of that form.


I've reworded the comment to hopefully make the semantics of
the function more clear.

Attached is an updated patch with these changes.

Martin




[PING 4] Re: [PATCH] warn for unsafe calls to __builtin_return_address

2015-07-13 Thread Martin Sebor

Still looking for a review of this small patch to help detect
potentially unsafe calls to __builtin_{frame,return}_address
(with an argument > 2) that tend to either return bogus values
or lead to crashes at runtime.

The problem the function can cause was originally reported in
pr8743 and resolved by adding a note to the documentation. This
patch tweaks the documentation further to make the risks of
using the function that much clearer.

  https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00886.html

Thanks
Martin


Go patch committed: Check for possible pointers at run time

2015-07-13 Thread Ian Lance Taylor
This patch by Chris Manghane changes the Go frontend to check whether
a type contains pointers at run time, rather than using two different
functions, one with possible pointers, one not.  Bootstrapped and ran
Go testsuite on x86_64-unknown-linux-gnu.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===
--- gcc/go/gofrontend/MERGE (revision 225756)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-83191e8e2cb9f47f7c1e6bcb9997f21163292612
+2c985e4781691fea3eb4171de85265bfbc4e4997
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: gcc/go/gofrontend/gogo.cc
===
--- gcc/go/gofrontend/gogo.cc   (revision 225756)
+++ gcc/go/gofrontend/gogo.cc   (working copy)
@@ -4391,15 +4391,7 @@ Gogo::allocate_memory(Type* type, Locati
   Expression* td = Expression::make_type_descriptor(type, location);
   Expression* size =
 Expression::make_type_info(type, Expression::TYPE_INFO_SIZE);
-
-  // If this package imports unsafe, then it may play games with
-  // pointers that look like integers.  We should be able to determine
-  // whether or not to use new pointers in libgo/go-new.c.  FIXME.
-  bool use_new_pointers = this->imported_unsafe_ || type->has_pointer();
-  return Runtime::make_call((use_new_pointers
-? Runtime::NEW
-: Runtime::NEW_NOPOINTERS),
-   location, 2, td, size);
+  return Runtime::make_call(Runtime::NEW, location, 2, td, size);
 }
 
 // Traversal class used to check for return statements.
Index: gcc/go/gofrontend/runtime.def
===
--- gcc/go/gofrontend/runtime.def   (revision 225750)
+++ gcc/go/gofrontend/runtime.def   (working copy)
@@ -223,10 +223,6 @@ DEF_GO_RUNTIME(REGISTER_GC_ROOTS, "__go_
 // Allocate memory.
 DEF_GO_RUNTIME(NEW, "__go_new", P2(TYPE, UINTPTR), R1(POINTER))
 
-// Allocate memory which can not contain pointers.
-DEF_GO_RUNTIME(NEW_NOPOINTERS, "__go_new_nopointers", P2(TYPE, UINTPTR), 
R1(POINTER))
-
-
 // Start a new goroutine.
 DEF_GO_RUNTIME(GO, "__go_go", P2(FUNC_PTR, POINTER), R0())
 
Index: libgo/runtime/go-new.c
===
--- libgo/runtime/go-new.c  (revision 225752)
+++ libgo/runtime/go-new.c  (working copy)
@@ -8,19 +8,12 @@
 #include "runtime.h"
 #include "arch.h"
 #include "malloc.h"
+#include "go-type.h"
 
 void *
 __go_new (const struct __go_type_descriptor *td, uintptr_t size)
 {
   return runtime_mallocgc (size,
   (uintptr) td | TypeInfo_SingleObject,
-  0);
-}
-
-void *
-__go_new_nopointers (const struct __go_type_descriptor *td,  uintptr_t size)
-{
-  return runtime_mallocgc (size,
-  (uintptr) td | TypeInfo_SingleObject,
-  FlagNoScan);
+  td->__code & GO_NO_POINTERS ? FlagNoScan : 0);
 }


Go patch committed: Fix location of function parameters

2015-07-13 Thread Ian Lance Taylor
This patch by Chris Manghane fixes the location of function parameters
in the Go frontend.  Bootstrapped and ran Go testsuite on
x86_64-unknown-linux-gnu.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===
--- gcc/go/gofrontend/MERGE (revision 225752)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-c8cb74e70fbe87b14bbd083730f52a68c79ec6bb
+83191e8e2cb9f47f7c1e6bcb9997f21163292612
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: gcc/go/gofrontend/gogo.cc
===
--- gcc/go/gofrontend/gogo.cc   (revision 225750)
+++ gcc/go/gofrontend/gogo.cc   (working copy)
@@ -1694,7 +1694,7 @@ Gogo::start_function(const std::string&
   ++p)
{
  Variable* param = new Variable(p->type(), NULL, false, true, false,
-location);
+p->location());
  if (is_varargs && p + 1 == parameters->end())
param->set_is_varargs_parameter();
 


Re: [PATCH][AArch64] Handle -|x| case using a single csneg

2015-07-13 Thread Segher Boessenkool
On Mon, Jul 13, 2015 at 10:48:19AM +0100, Kyrill Tkachov wrote:
> For the testcase in the patch we were generating an extra neg instruction:
> cmp w0, wzr
> csneg   w0, w0, w0, ge
> neg w0, w0
> ret
> 
> instead of the optimal:
> cmp w0, wzr
> csneg   w0, w0, w0, lt
> ret
> 
> The reason is that combine tries to merge the operation into a negation of 
> an abs.

Before combine, you have two insns, a negation and an abs, so that is
not so very strange :-)

Some archs have actual nabs insns btw (for floating point, anyway).

Archs without abs or conditional assignment, and with cheap branches,
get a branch around a neg followed by another neg, at expand time.
This then isn't optimised away either.

So I'd say expand should be made a bit smarter for this.  Failing
that, your approach looks fine to me -- assuming you want to have a
fake "abs" insn at all.

On to the patch...


> +;; Combine will try merging (c > 0 ? -x : x) into (-|x|).  This isn't a good

"x > 0" here.


Segher


Go patch committed: Analyze multiple-result conversions

2015-07-13 Thread Ian Lance Taylor
This patch from Chris Manghane fixes escape analysis to correctly
handle calls to interface conversion functions that return multiple
results.  Bootstrapped and ran Go testsuite on
x86_64-unknown-linux-gnu.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===
--- gcc/go/gofrontend/MERGE (revision 225715)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-a5122ab435cf40c22b110487eb5f189ee28e77f4
+1b3d945d201bcb1238f15ef154c6e4671e4c6f5c
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: gcc/go/gofrontend/escape.cc
===
--- gcc/go/gofrontend/escape.cc (revision 225715)
+++ gcc/go/gofrontend/escape.cc (working copy)
@@ -547,6 +547,41 @@ Build_connection_graphs::resolve_var_ref
expr = expr->type_guard_expression()->expr();
break;
 
+  case Expression::EXPRESSION_UNSAFE_CONVERSION:
+   {
+ Expression* e = expr->unsafe_conversion_expression()->expr();
+ if (e->call_result_expression() != NULL
+ && e->call_result_expression()->index() == 0)
+   {
+ // a, ok := p.(T) gets lowered into a call to one of the interface
+ // to type conversion functions instead of a type guard 
expression.
+ // We only want to make a connection between a and p, the bool
+ // result should not escape because p escapes.
+ e = e->call_result_expression()->call();
+
+ Named_object* fn =
+   e->call_expression()->fn()->func_expression()->named_object();
+ std::string fn_name = fn->name();
+ if (fn->package() == NULL
+ && fn->is_function_declaration()
+ && !fn->func_declaration_value()->asm_name().empty())
+   {
+ if (fn_name == "ifaceI2E2"
+ || fn_name == "ifaceI2I2")
+   e = e->call_expression()->args()->at(0);
+ else if (fn_name == "ifaceE2I2"
+  || fn_name == "ifaceI2I2"
+  || fn_name == "ifaceE2T2P"
+  || fn_name == "ifaceI2T2P"
+  || fn_name == "ifaceE2T2"
+  || fn_name == "ifaceI2T2")
+   e = e->call_expression()->args()->at(1);
+   }
+   }
+ expr = e;
+   }
+   break;
+
   default:
done = true;
break;
Index: gcc/go/gofrontend/expressions.cc
===
--- gcc/go/gofrontend/expressions.cc(revision 225715)
+++ gcc/go/gofrontend/expressions.cc(working copy)
@@ -3391,52 +3391,7 @@ Expression::make_cast(Type* type, Expres
   return new Type_conversion_expression(type, val, location);
 }
 
-// An unsafe type conversion, used to pass values to builtin functions.
-
-class Unsafe_type_conversion_expression : public Expression
-{
- public:
-  Unsafe_type_conversion_expression(Type* type, Expression* expr,
-   Location location)
-: Expression(EXPRESSION_UNSAFE_CONVERSION, location),
-  type_(type), expr_(expr)
-  { }
-
- protected:
-  int
-  do_traverse(Traverse* traverse);
-
-  bool
-  do_is_immutable() const;
-
-  Type*
-  do_type()
-  { return this->type_; }
-
-  void
-  do_determine_type(const Type_context*)
-  { this->expr_->determine_type_no_context(); }
-
-  Expression*
-  do_copy()
-  {
-return new Unsafe_type_conversion_expression(this->type_,
-this->expr_->copy(),
-this->location());
-  }
-
-  Bexpression*
-  do_get_backend(Translate_context*);
-
-  void
-  do_dump_expression(Ast_dump_context*) const;
-
- private:
-  // The type to convert to.
-  Type* type_;
-  // The expression to convert.
-  Expression* expr_;
-};
+// Class Unsafe_type_conversion_expression.
 
 // Traversal.
 
Index: gcc/go/gofrontend/expressions.h
===
--- gcc/go/gofrontend/expressions.h (revision 225715)
+++ gcc/go/gofrontend/expressions.h (working copy)
@@ -32,6 +32,7 @@ class Temporary_reference_expression;
 class Set_and_use_temporary_expression;
 class String_expression;
 class Type_conversion_expression;
+class Unsafe_type_conversion_expression;
 class Unary_expression;
 class Binary_expression;
 class Call_expression;
@@ -571,6 +572,15 @@ class Expression
   conversion_expression()
   { return this->convert(); 
}
 
+  // If this is an unsafe conversion expression, return the
+  // Unsafe_type_conversion_expression structure.  Otherwise, return NULL.
+  Unsafe_type_conversion_expression*
+  unsafe_conversion_expression()
+  {
+return this->convert();
+  }
+
   // Return whether this is the express

Re: [PATCH][4/n] Remove GENERIC stmt combining from SCCVN

2015-07-13 Thread Jeff Law

On 07/13/2015 01:55 AM, Richard Biener wrote:

I *think* these are closely enough related that some code can be factored out
a bit and reused in both r_e_f_i_e and r_t_e to discover both types of
equivalences for DOM and for jump threading.


Indeed - the odd thing here is that one function uses
const_and_copies->record_const_or_copy directly while the other one
record_equality (this function is _solely_ used by
record_equivalences_from_incoming_edge).   I didn't want to introduce
a callback to commonize the code (though in principle we could use
a template function with a function template parameter...)
Funny you should mention that -- I poked at a bit at refactoring the 
code last night, saw this issue and decided to sleep on it a bit as I 
didn't want to introduce the callback either.


The code for handling the equivalency tables is in a bit of a state of 
flux in preparation for handling 47679.  That work got pushed down on 
the stack.  There's still untangling to do in this space.




That said, I don't see that record_equality does sth not suitable
if called from record_temporary_equivalences.  So if we make
use of that function we could simply call record_temporary_equivalences
from record_equivalences_from_incoming_edge.

Agreed.

jeff


Re: [PATCH][4/n] Remove GENERIC stmt combining from SCCVN

2015-07-13 Thread Jeff Law

On 07/13/2015 01:47 AM, Richard Biener wrote:


The path duplication to expose redundancies is one of the things I'd like to
get out of a Bodik-esque scheme.  One of the things Bodik's work does is
identify the minimal set of blocks that need to be copied to expose each path
specific redundancy that it finds.


I see - so this could be then also used to compute a better idea of
a cost to duplicate such a path then?  That is, you don't commit
to creating all of those paths.

Correct.

The identification step with Bodik is a backwards substitution with a 
form of GVN to capture a secondary effect.  Once the redundancies are 
identified, you run a totally separate analysis to identify the minimal 
region of blocks that need to be copied to expose the redundancy.  Then 
you can look at the size of those regions vs the redundancy and make a 
decision about the cost/benefit.




So are you fine with the idea (or even implementation) of the patch
to go forward with moving fold_{widening,sign_change}_comparison to
match.pd?
Yes -- I'm pretty sure we can fix the immediate issue of the 
missed-optimization in the threader without major surgery.


jeff


RE: [PATCH, MIPS] Fix restoration of hi/lo in MIPS64R2 interrupt handlers

2015-07-13 Thread Moore, Catherine


> -Original Message-
> From: Robert Suchanek [mailto:robert.sucha...@imgtec.com]
> Sent: Wednesday, July 08, 2015 6:43 AM
> To: Moore, Catherine; Matthew Fortune; gcc-patches@gcc.gnu.org
> Subject: [PATCH, MIPS] Fix restoration of hi/lo in MIPS64R2 interrupt
> handlers
> 
> Hi,
> 
> The attached patch fixes an ICE (unrecognizable insn) when accumulators are
> used in interrupt handlers for MIPS64R2. There was just a typo in the
> function name.
> 
> Ok to apply?
> 
> gcc/
>   * config/mips/mips.c (mips_emit_save_slot_move): Fix typo.
> 
> gcc/testsuite/
>   * gcc.target/mips/20150707.c: New test.

Hi Robert,
The patch is OK, but will you please name the test something other than the 
date?
Thanks,
Catherine


RE: [PATCH, MIPS] Support new interrupt handler options

2015-07-13 Thread Moore, Catherine


> -Original Message-
> From: Robert Suchanek [mailto:robert.sucha...@imgtec.com]
> Sent: Wednesday, July 08, 2015 6:43 AM
> To: Matthew Fortune; Moore, Catherine; gcc-patches@gcc.gnu.org
> Subject: [PATCH, MIPS] Support new interrupt handler options
> 
> Hi,
> 
> This patch adds support for optional arguments for interrupt and
> use_shadow_register_set attributes.  The patch also fixes an ICE if both
> interrupt and use_shadow_register_set are enabled and compiled with -
> mips64r2 -mabi=64 discovered during testing of the attached test.
> 
> The interrupt attribute accepts new arguments: "eic" and
> "vector=[sw0|sw1|hw0|hw1|hw2|hw3|hw4|hw5]".  The former is the
> default if no argument is given and the latter changes the behaviour of GCC
> and masks interrupts from sw0 up to and including the specified vector.  As
> part of this change, the EPC is now saved and restored unconditionally to
> recover the state in nested interrupts.  Only K1 register is clobbered for
> masked interrupts but for non-masked interrupts K0 is still used.
> 
> The use_shadow_register_set attribute has a new option, "intstack", to
> indicate that the shadow register set has a valid stack pointer.  With this
> option "rdpgpr $sp, $sp" will not be generated for an ISR.
> 
> Tested with mips-img-elf, mips-img-linux-gnu and mips64el-linux-gnu cross
> compilers. Ok to apply?
> 
> Regards,
> Robert
> 
> 2015-07-07  Matthew Fortune  
> Robert Suchanek  
> 
> gcc/
>   * config/mips/mips.c (mips_int_mask): New enum.
>   (mips_shadow_set): Likewise.
>   (int_mask): New variable.
>   (use_shadow_register_set_p): Change type to enum
> mips_shadow_set.
>   (machine_function): Add int_mask and use_shadow_register_set.
>   (mips_attribute_table): Add attribute handlers for interrupt and
>   use_shadow_register_set.
>   (mips_interrupt_mask): New static function.
>   (mips_handle_interrupt_attr): Likewise.
>   (mips_handle_use_shadow_register_set_attr): Likewise.
>   (mips_use_shadow_register_set): Change return type to enum
>   mips_shadow_set.  Add argument handling for
> use_shadow_register_set
>   attribute.
>   (mips_interrupt_extra_called_saved_reg_p): Update the conditional
> to
>   compare with mips_shadow_set enum.
>   (mips_compute_frame_info): Add interrupt mask and
>   use_shadow_register_set to per-function information structure.
>   Add a stack slot for EPC unconditionally.
>   (mips_expand_prologue): Compare use_shadow_register_set value
>   with mips_shadow_set enum.  Save EPC always in K1, clobber only K1
> for
>   masked interrupt register but in EIC mode use K0 and save Cause in
> K0.
>   EPC saved and restored unconditionally.  Use PMODE_INSN macro
> when
>   copying the stack pointer from the shadow register set.
>   * config/mips/mips.h (SR_IM0): New define.
>   * config/mips/mips.md (mips_rdpgpr): Rename to...
>   (mips_rdpgpr_): ...this.  Use the Pmode iterator.
>   * doc/extend.texi (Declaring Attributes of Functions): Document
>   optional arguments for interrupt and use_shadow_register_set
>   attributes.
> 
> gcc/testsuite/
>   * gcc.target/mips/interrupt_handler-4.c: New test.

Hi Robert,
I'm getting build errors with the current TOT and your patch.

The first errors that I encounter are:
gcc/config/mips/mips.c:1355:1: warning: 'mips_int_mask 
mips_interrupt_mask(tree)' defined but not used [-Wunused-function]
gcc/config/mips/mips.c:1392:1: warning: 'mips_shadow_set 
mips_use_shadow_register_set(tree)' defined but not used [-Wunused-function]

Removing these two functions results in further errors that I have not 
investigated.
Will you try applying and building your patch again?

I have a couple of further comments on the existing patch, see below.

Thanks,
Catherine

> diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index
> ce21a0f..b6ad7db 100644
> --- a/gcc/config/mips/mips.c
> +++ b/gcc/config/mips/mips.c
> @@ -1325,13 +1359,62 @@ mips_interrupt_type_p (tree type)
>return lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type)) != NULL;  }
> 
> +static enum mips_int_mask
> +mips_interrupt_mask (tree type)

This function requires a comment.

> +static enum mips_shadow_set
> +mips_use_shadow_register_set (tree type)

Likewise.

>  {
> @@ -1537,6 +1620,87 @@ mips_can_inline_p (tree caller, tree callee)
>  return false;
>return default_target_can_inline_p (caller, callee);  }
> +
> +static tree
> +mips_handle_interrupt_attr (tree *node, tree name, tree args,
> + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
> {
Likewise.

> +
> +static tree
> +mips_handle_use_shadow_register_set_attr (tree *node, tree name, tree
> args,

And here as well.
> 



RE: [PATCH, MIPS] Support interrupt handlers with hard-float

2015-07-13 Thread Moore, Catherine


> -Original Message-
> From: Robert Suchanek [mailto:robert.sucha...@imgtec.com]
> Sent: Wednesday, July 08, 2015 6:42 AM
> To: Matthew Fortune; Moore, Catherine; gcc-patches@gcc.gnu.org
> Subject: [PATCH, MIPS] Support interrupt handlers with hard-float
> 
> Hi Matthew/Catherine,
> 
> The attached patch removes the restriction to compile a TU with an ISR with -
> mhard-float. Instead of forcing -msoft-float, the coprocessor 1 is disabled in
> an ISR for -mhard-float.
> 
> Ok to apply?

Yes, this one is OK.

> 
> gcc/
>   * config/mips/mips.c (mips_compute_frame_info): Allow -mhard-
> float in
>   interrupt attribute.
>   (mips_expand_prologue): Disable the floating point unit in an ISR for
>   -mhard-float.
>   * config/mips/mips.h (SR_COP1): New define.



[patch] libstdc++/66855 ignore endianness in codecvt_utf8_utf16

2015-07-13 Thread Jonathan Wakely

This facet is not meant to pay attention to mode|little_endian, so
explicitly override the value to output with the target's native
endianness.

Tested powerpc64-linux and powerpc64le-linux, committed to trunk.

I'll commit it to the gcc-5-branch after the 5.2 release.

commit e35e03773955a618cba43fcef4b1559ba9a27d8c
Author: Jonathan Wakely 
Date:   Mon Jul 13 19:38:58 2015 +0100

	PR libstdc++/66855
	* src/c++11/codecvt.cc (__codecvt_utf8_utf16_base::do_in): Override
	endianness bit in mode.
	* testsuite/22_locale/codecvt/codecvt_utf8_utf16/66855.cc: New.

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 6b82aa8..a454064 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -1264,7 +1264,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 {
   range from{ __from, __from_end };
   range to{ __to, __to_end };
-  auto res = utf16_in(from, to, _M_maxcode, _M_mode);
+  codecvt_mode mode = codecvt_mode(_M_mode | (consume_header|generate_header));
+#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+  mode = codecvt_mode(mode | little_endian);
+#endif
+  auto res = utf16_in(from, to, _M_maxcode, mode);
   __from_next = from.next;
   __to_next = to.next;
   return res;
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/66855.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/66855.cc
new file mode 100644
index 000..3f99cb4
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/66855.cc
@@ -0,0 +1,52 @@
+// Copyright (C) 2015 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-options "-std=gnu++11" }
+
+#include 
+#include 
+
+void
+test01()
+{
+  std::codecvt_utf8_utf16 cvt;
+  char16_t utf16[] = u"\ub098\ub294\ud0dc\uc624";
+  const char16_t* nf16;
+  char utf8[16];
+  char* nt8;
+  std::mbstate_t st{};
+  auto res = cvt.out(st, utf16, utf16+4, nf16, utf8, utf8+16, nt8);
+  VERIFY( res == std::codecvt_base::ok );
+
+  st = {};
+  char16_t buf[4] = {};
+  const char* nf8 = nt8;
+  char16_t* nt16;
+  res = cvt.in(st, utf8, nf8, nf8, buf, buf+4, nt16);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( nt16 == buf+4 );
+  VERIFY( buf[0] == utf16[0] );
+  VERIFY( buf[1] == utf16[1] );
+  VERIFY( buf[2] == utf16[2] );
+  VERIFY( buf[3] == utf16[3] );
+}
+
+int
+main()
+{
+  test01();
+}


Re: [Patch, fortran] PR 37131, inline matmul

2015-07-13 Thread Thomas Schwinge
Hi!

On Wed, 06 May 2015 22:26:31 +0200, Thomas Koenig  wrote:
> thanks for the review.  I have committed the following [...]

For nvptx-none targets, I'm seeing some odd behavior with this
commit.

Background: the nvptx backend has some special handling for varargs
functions, and this only works if the function has been declared before
being called; in gcc/config/nvptx.c:nvptx_expand_call, if there hasn't
been a declaration, there is a fallback code path to come up with a
function declaration based on the actual parameters of the call site,
gcc/config/nvptx.c:write_func_decl_from_insn, but that doesn't work with
varargs functions.

Anyway, code that is generated by GCC itself, such as the matmul checking
code in the Fortran front end that has been added in this commit,
gcc/fortran/frontend-passes.c:runtime_error_ne, should be able to
properly declare its helper functions, but here, this doesn't seem to be
happening.  Being new to it, I'm overwhelmed ;-) by the symbol handling
code in the Fortran front end.  I came up with a hack that then makes the
right thing happen in the nvptx backend, but this looks too much of a
hack...

In particular, I've been thinking that the instantiation of
fe_runtime_error in gcc/fortran/intrinsic.c:add_subroutines must be meant
to provide the declaration for the usage in
gcc/fortran/iresolve.c:gfc_resolve_fe_runtime_error, but there seems to
be some missing link.  Any ideas what's really going on, and how to fix
it, properly?

Original situation; _gfortran_runtime_error is not being properly
declared (invoked via gcc/fortran/frontend-passes.c:runtime_error_ne),
but, for example, _gfortran_runtime_error_at is being properly declared
(invoked from elsewhere):

$ build-gcc/gcc/gfortran -Bbuild-gcc/gcc/ 
source-gcc/gcc/testsuite/gfortran.dg/matmul_bounds_2.f90 -fbounds-check 
-fno-realloc-lhs -S -O1
$ grep runtime_error < matmul_bounds_2.s 
call _gfortran_runtime_error_at, (%out_arg0, %out_arg1, 
%out_arg2);
call _gfortran_runtime_error, (%out_arg0, %out_arg1, 
%out_arg2, %out_arg3);
call _gfortran_runtime_error_at, (%out_arg0, %out_arg1, 
%out_arg2);
call _gfortran_runtime_error_at, (%out_arg0, %out_arg1, 
%out_arg2);
call _gfortran_runtime_error_at, (%out_arg0, %out_arg1, 
%out_arg2);
call _gfortran_runtime_error_at, (%out_arg0, %out_arg1, 
%out_arg2);
// BEGIN GLOBAL FUNCTION DECL: _gfortran_runtime_error
.extern .func _gfortran_runtime_error (.param.u64 %arg0, .param.u64 
%arg1, .param.u64 %arg2, .param.u32 %arg3);
// BEGIN GLOBAL FUNCTION DECL: _gfortran_runtime_error_at
.extern .func _gfortran_runtime_error_at(.param.u64 %in_ar1, .param.u64 
%in_ar2, .param.u64 %in_argp);

With the following hack:

--- gcc/fortran/iresolve.c
+++ gcc/fortran/iresolve.c
@@ -2207,6 +2207,9 @@ gfc_resolve_fe_runtime_error (gfc_code *c)
 a->name = "%VAL";
 
   c->resolved_sym = gfc_get_intrinsic_sub_symbol (name);
+  //TODO
+  extern tree gfor_fndecl_runtime_error;
+  c->resolved_sym->backend_decl = gfor_fndecl_runtime_error;
 }
 
 void

..., we get the correct declarations (and also the code gets optimized
much better):

$ grep runtime_error < matmul_bounds_2.s
call _gfortran_runtime_error_at, (%out_arg0, %out_arg1, 
%out_arg2);
call _gfortran_runtime_error, (%out_arg0, %out_arg1);
// BEGIN GLOBAL FUNCTION DECL: _gfortran_runtime_error_at
.extern .func _gfortran_runtime_error_at(.param.u64 %in_ar1, .param.u64 
%in_ar2, .param.u64 %in_argp);
// BEGIN GLOBAL FUNCTION DECL: _gfortran_runtime_error
.extern .func _gfortran_runtime_error(.param.u64 %in_ar1, .param.u64 
%in_argp);

Help?


For reference, the whole patch:

> 2015-05-06  Thomas Koenig  
> 
>   PR fortran/37131
>   * gfortran.h (gfc_isym_id):  Add GFC_ISYM_FE_RUNTIME_ERROR.
>   (gfc_intrinsic_sym):  Add vararg.
>   * intrinsic.h (gfc_check_fe_runtime_error):  Add prototype.
>   (gfc_resolve_fe_runtime_error):  Likewise.
>   Add prototype for gfc_is_reallocatable_lhs.
>   * trans-array.h (gfc_is_reallocatable_lhs):  Remove prototype.
>   * check.c (gfc_check_fe_runtime_error):  New function.
>   * intrinsic.c (add_sym_1p):  New function.
>   (make_vararg):  New function.
>   (add_subroutines):  Add fe_runtime_error.
>   (gfc_intrinsic_sub_interface): Skip sorting for variable number
>   of arguments.
>   * iresolve.c (gfc_resolve_fe_runtime_error):  New function.
>   * lang.opt (inline-matmul-limit):  New option.
>   (gfc_post_options): If no inline matmul limit has been set and
>   BLAS is called externally, use the BLAS limit.
>   * frontend-passes.c:  Include intrinsic.h.
>   (var_num):  New global counter for naming temporary variables.
>   (matrix_case):  Enum for differentiating the different matmul
>   cases.
>   (realloc_string_call

Re: [gomp4.1 WIP] omp_target_* libgomp APIs

2015-07-13 Thread Jakub Jelinek
On Mon, Jul 13, 2015 at 10:06:24PM +0300, Ilya Verbin wrote:

> libgomp/
>   * libgomp.h (struct gomp_device_descr): Add dev2dev_func.
>   * target.c (omp_target_memcpy): Support device-to-device.
>   (omp_target_memcpy_rect_worker): Likewise.
>   (omp_target_memcpy_rect): Likewise.
>   (gomp_load_plugin_for_device): Check for GOMP_OFFLOAD_dev2dev.
>   * testsuite/libgomp.c/target-12.c (main): Extend for testing
>   device-to-device memcpy.
> liboffloadmic/
>   * plugin/libgomp-plugin-intelmic.cpp (GOMP_OFFLOAD_dev2dev): New
>   function.
>   * plugin/offload_target_main.cpp (__offload_target_tgt2tgt): New static
>   function, register it in liboffloadmic.

Ok, with a small change:

> @@ -1437,10 +1449,6 @@ omp_target_memcpy_rect (void *dst, void *src, size_t 
> element_size,
>   src_devicep = NULL;
>  }
>  
> -  /* FIXME: Support device-to-device somehow?  */
> -  if (src_devicep != NULL && dst_devicep != NULL)
> -return EINVAL;
> -

Please do here
  if (src_devicep != NULL && dst_devicep != NULL && src_devicep != dst_devicep)
return EINVAL;
anyway.
No need to do all the locking and computations just to find out it will fail
anyway.

Jakub


Re: [PATCH] Fix typo in rtl.c

2015-07-13 Thread Jeff Law

On 07/13/2015 11:00 AM, Marek Polacek wrote:

While working on a new warning I found this typo.

Bootstrapped/regtested on x86_64-linux, ok for trunk?

2015-07-13  Marek Polacek  

* rtl.c (rtx_equal_p_cb): Fix typo.

OK.
jeff



RE: [PATCH] [aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math

2015-07-13 Thread Evandro Menezes
FWIW, I was curious about the precision of the results using such instructions 
for the standard sqrt{,f} functions.  This is not a wide sample, but it does 
point to a floor of 3 series iterations for DP and 2 for SP:

x            sqrt(x)      1 Step (ulps)       2 Steps (ulps)      3 Steps (ulps)
2.2251e-308  1.4917e-154  1.4917e-154 (999)   1.4917e-154 (999)   1.4917e-154 (000)
1.6022e-19   4.0027e-10   4.0027e-10 (999)    4.0027e-10 (999)    4.0027e-10 (000)
1.e+00       1.e+00       1.e+00 (001)        1.e+00 (001)        1.e+00 (001)
1.e+00       1.e+00       9.e-01 (999)        1.e+00 (999)        1.e+00 (000)
1.e+00       1.e+00       9.e-01 (999)        1.e+00 (999)        1.e+00 (000)
2.e+00       1.4142e+00   1.4142e+00 (999)    1.4142e+00 (999)    1.4142e+00 (000)
2.2500e+00   1.5000e+00   1.5000e+00 (999)    1.5000e+00 (999)    1.5000e+00 (000)
2.5600e+00   1.6000e+00   1.6000e+00 (000)    1.6000e+00 (000)    1.6000e+00 (000)
3.1416e+00   1.7725e+00   1.7725e+00 (999)    1.7725e+00 (999)    1.7725e+00 (000)
6.0221e+23   7.7602e+11   7.7602e+11 (999)    7.7602e+11 (999)    7.7602e+11 (000)
1.7977e+308  1.3408e+154  1.3408e+154 (000)   1.3408e+154 (000)   1.3408e+154 (000)

x            sqrtf(x)     1 Step (ulps)       2 Steps (ulps)      3 Steps (ulps)
1.1755e-38   1.0842e-19   1.0842e-19 (096)    1.0842e-19 (000)    1.0842e-19 (000)
1.6022e-19   4.0027e-10   4.0027e-10 (008)    4.0027e-10 (000)    4.0027e-10 (000)
1.e+00       1.e+00       1.e+00 (001)        1.e+00 (001)        1.e+00 (001)
1.e+00       1.e+00       9.e-01 (096)        1.e+00 (000)        1.e+00 (000)
1.e+00       1.e+00       9.e-01 (094)        1.e+00 (001)        1.e+00 (000)
2.e+00       1.4142e+00   1.4142e+00 (146)    1.4142e+00 (001)    1.4142e+00 (000)
2.2500e+00   1.5000e+00   1.5000e+00 (018)    1.5000e+00 (000)    1.5000e+00 (001)
2.5600e+00   1.6000e+00   1.6000e+00 (001)    1.6000e+00 (001)    1.6000e+00 (001)
3.1416e+00   1.7725e+00   1.7725e+00 (006)    1.7725e+00 (001)    1.7725e+00 (001)
6.0221e+23   7.7602e+11   7.7602e+11 (069)    7.7602e+11 (001)    7.7602e+11 (000)
3.4028e+38   1.8447e+19   1.8447e+19 (000)    1.8447e+19 (000)    1.8447e+19 (000)

The error in ULPs saturates at 999 above.

The result of having to use so many iterations to achieve accuracy would defeat 
using the Newton series, as it would likely be slower than the FSQRT 
instruction.

Unlike in x86, I have the impression that the initial estimate in AArch64 is 
meant to be used in applications that do not require precision, like graphics, 
etc.  Then, a single series iteration for SP would perhaps be good enough.

-- 
Evandro Menezes  Austin, TX


> -Original Message-
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-ow...@gcc.gnu.org] On
> Behalf Of Dr. Philipp Tomsich
> Sent: Monday, June 29, 2015 6:45
> To: James Greenhalgh
> Cc: Kumar, Venkataramanan; pins...@gmail.com; Benedikt Huber; gcc-
> patc...@gcc.gnu.org; Marcus Shawcroft; Ramana Radhakrishnan; Richard Earnshaw
> Subject: Re: [PATCH] [aarch64] Implemented reciprocal square root (rsqrt)
> estimation in -ffast-math
> 
> James,
> 
> On 29 Jun 2015, at 13:36, James Greenhalgh  wrote:
> >
> > On Mon, Jun 29, 2015 at 10:18:23AM +0100, Kumar, Venkataramanan wrote:
> >>
> >>> -Original Message-
> >>> From: Dr. Philipp Tomsich
> >>> [mailto:philipp.toms...@theobroma-systems.com]
> >>> Sent: Monday, June 29, 2015 2:17 PM
> >>> To: Kumar, Venkataramanan
> >>> Cc: pins...@gmail.com; Benedikt Huber; gcc-patches@gcc.gnu.org
> >>> Subject: Re: [PATCH] [aarch64] Implemented reciprocal square root
> >>> (rsqrt) estimation in -ffast-math
> >>>
> >>> Kumar,
> >>>
> >>> This does not come unexpected, as the initial estimation and each
> >>> iteration will add an architecturally-defined number of bits of
> >>> precision (ARMv8 guarantees only a minimum number of bits provided
> >>> per operation… the exact number is specific to each micro-arch, though).
> >>> Depending on your architecture and on the required number of precise
> >>> bits by any given benchmark, one may see miscompares.
> >>
> >> True.
> >
> > I would be very uncomfortable with this approach.
> 
> Same here. The default must be safe. Always.
> Unlike other architectures, we don’t have a problem with making the proper
> defaults for “safety”, as the ARMv8 ISA guarantees a minimum number of
> precise bits per iteration.
> 
> > From Richard Biener's post in the thread Michael Matz linked earlier
> > in the thread:
> >
> >It would follow existing practice of things we allow in
> >-funsafe-math-o

Re: [gomp4.1 WIP] omp_target_* libgomp APIs

2015-07-13 Thread Ilya Verbin
On Mon, Jul 13, 2015 at 18:50:29 +0300, Ilya Verbin wrote:
> On Mon, Jul 13, 2015 at 17:26:43 +0200, Jakub Jelinek wrote:
> > > > > > +  /* FIXME: Support device-to-device somehow?  */
> > > > > 
> > > > > Should libgomp copy data device-host-device if device-device is not 
> > > > > supported by
> > > > > target?  Current liboffloadmic doesn't support this.  I'll find out 
> > > > > if there are
> > > > > any plans.
> > > > 
> > > > There is also the option to spawn an offloaded function that will just 
> > > > call
> > > > memcpy, or have such a function next to the main () of the program that 
> > > > we link
> > > > in.
> > > 
> > > Do you mean the case when src_devicep == dst_devicep ?  It's easy to 
> > > support
> > > this by adding new func into plugin, without any changes in 
> > > liboffloadmic.
> > > I thought about memcpy between different devices...
> > 
> > Well, even src_devicep == dst_devicep does not guarantee it is the same
> > device, that is the case only if also src_devicep->target_id ==
> > dst_devicep->target_id, right?
> 
> Why?  Devices of one type with different target_id's have different entries in
> devices[].
> 
> > I wouldn't worry about that and just return EINVAL when copying in between
> > different devices.
> 
> I'll prepare a patch, which will add an interface for copying within one 
> device,
> covered by GOMP_OFFLOAD_CAP_OPENMP_400.

Here it is.  make check-target-libgomp passed.


libgomp/
* libgomp.h (struct gomp_device_descr): Add dev2dev_func.
* target.c (omp_target_memcpy): Support device-to-device.
(omp_target_memcpy_rect_worker): Likewise.
(omp_target_memcpy_rect): Likewise.
(gomp_load_plugin_for_device): Check for GOMP_OFFLOAD_dev2dev.
* testsuite/libgomp.c/target-12.c (main): Extend for testing
device-to-device memcpy.
liboffloadmic/
* plugin/libgomp-plugin-intelmic.cpp (GOMP_OFFLOAD_dev2dev): New
function.
* plugin/offload_target_main.cpp (__offload_target_tgt2tgt): New static
function, register it in liboffloadmic.


diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 8ed1abd..a64b98c 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -768,6 +768,7 @@ struct gomp_device_descr
   void (*free_func) (int, void *);
   void *(*dev2host_func) (int, void *, const void *, size_t);
   void *(*host2dev_func) (int, void *, const void *, size_t);
+  void *(*dev2dev_func) (int, void *, const void *, size_t);
   void (*run_func) (int, void *, void *);
 
   /* Splay tree containing information about mapped memory regions.  */
diff --git a/libgomp/target.c b/libgomp/target.c
index 024a9c8..2bfc019 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1329,7 +1329,15 @@ omp_target_memcpy (void *dst, void *src, size_t length, 
size_t dst_offset,
   gomp_mutex_unlock (&src_devicep->lock);
   return 0;
 }
-  /* FIXME: Support device-to-device somehow?  */
+  if (src_devicep == dst_devicep)
+{
+  gomp_mutex_lock (&src_devicep->lock);
+  src_devicep->dev2dev_func (src_devicep->target_id,
+(char *) dst + dst_offset,
+(char *) src + src_offset, length);
+  gomp_mutex_unlock (&src_devicep->lock);
+  return 0;
+}
   return EINVAL;
 }
 
@@ -1364,6 +1372,10 @@ omp_target_memcpy_rect_worker (void *dst, void *src, 
size_t element_size,
src_devicep->dev2host_func (src_devicep->target_id,
(char *) dst + dst_off,
(char *) src + src_off, length);
+  else if (src_devicep == dst_devicep)
+   src_devicep->dev2dev_func (src_devicep->target_id,
+  (char *) dst + dst_off,
+  (char *) src + src_off, length);
   else
return EINVAL;
   return 0;
@@ -1437,10 +1449,6 @@ omp_target_memcpy_rect (void *dst, void *src, size_t 
element_size,
src_devicep = NULL;
 }
 
-  /* FIXME: Support device-to-device somehow?  */
-  if (src_devicep != NULL && dst_devicep != NULL)
-return EINVAL;
-
   if (src_devicep)
 gomp_mutex_lock (&src_devicep->lock);
   else if (dst_devicep)
@@ -1601,10 +1609,10 @@ gomp_load_plugin_for_device (struct gomp_device_descr 
*device,
 }  \
   while (0)
   /* Similar, but missing functions are not an error.  */
-#define DLSYM_OPT(f, n)\
+#define DLSYM_OPT(f, n)
\
   do   \
 {  \
-  const char *tmp_err; 
\
+  const char *tmp_err; \
   device->f##_func = dlsym (plugin_handle, "GOMP_OFFLOAD_" #n);\
  

libgo patch committed: Remove unnecessary Entersyscall (RMs: OK for GCC 5 branch?)

2015-07-13 Thread Ian Lance Taylor
This patch by Lynn Boger removes unnecessary duplicate calls to
Entersyscall and Exitsyscall from the GNU/Linux Getdents function.
The calls are duplicates because they are called by Syscall, also
called by Getdents.  These duplicate calls sometimes cause the
deadlock detector to fire incorrectly, reported as
http://golang.org/issue/11406 .  This patch fixes that problem.
Bootstrapped and ran Go tests on mainline, GCC 4.9 branch, and GCC 5
branch.  Committed to mainline and 4.9 branch.

Release managers: OK to commit to GCC 5 branch?

Ian
Index: libgo/go/syscall/libcall_linux.go
===
--- libgo/go/syscall/libcall_linux.go   (revision 225715)
+++ libgo/go/syscall/libcall_linux.go   (working copy)
@@ -223,7 +223,6 @@ func Getdents(fd int, buf []byte) (n int
} else {
p = (*byte)(unsafe.Pointer(&_zero))
}
-   Entersyscall()
s := SYS_GETDENTS64
if s == 0 {
s = SYS_GETDENTS
@@ -233,7 +232,6 @@ func Getdents(fd int, buf []byte) (n int
if n < 0 {
err = errno
}
-   Exitsyscall()
return
 }
 


Re: [gomp4.1] depend(sink) and depend(source) parsing for C

2015-07-13 Thread Jakub Jelinek
On Mon, Jul 13, 2015 at 10:11:35AM -0700, Aldy Hernandez wrote:
> On 07/13/2015 06:56 AM, Jakub Jelinek wrote:
> >On Sat, Jul 11, 2015 at 11:35:36AM -0700, Aldy Hernandez wrote:
> 
> >On the C++ FE side, please also try a testcase in g++.dg/gomp/ where
> >the ordered(n) loop with #pragma omp ordered depend({source,sink}) will be
> >in a template, to make sure pt.c does the right thing with it.
> 
> I assume you mean something like:
> 
> void bar (int, int, int);
> 
> template
> T baz (T arg)
> {
>   int i, j, k;

Yeah, or even better T i, j, k;
As you don't use the argument, it can be just
template
void baz ()
{

> Also, was this supposed to work?:
> 
> template
> int foo()
> {
>   int i, j, k;
> #pragma omp parallel for ordered(N)

It is not 100% clear, but we don't support collapse(N)
either when N is a template parameter, as it affects
parsing of the code, we require that it is a non-dependent
constant expression.

Whether depend(sink:) should allow template parameters
depends on whether it will be required to be integer constant
or integer constant expression, right now it should be the former.

> >If you want to spend time on something still in the FE, it would be nice to
> >resolve the C++ iteration var issue (i.e. increase OMP_FOR number of
> >arguments, so that it could have yet another (optional) vector, say
> >OMP_FOR_ORIG_DECLS.  If that vector would be NULL, the gimplifier would
> >assume that all the decls in OMP_FOR_INIT are the ones present in the
> >source, if it would be present, you'd use them for the variable checking
> >instead of the ones from OMP_FOR_INIT (but, replace them with the
> >decls from OMP_FOR_INIT after the checking).
> >
> >There is another issue - if some iterator var has pointer type, supposedly
> >we want somewhere in the FEs already multiply it by the size of what they
> >point to (and convert to sizetype).  For C FE, it can be done already during
> >parsing, we should know the type of the iterator var already at that point,
> >for C++ FE it needs to be done only in finish_omp_clauses if
> >!processing_template_decl, because in templates we might not know the type.
> 
> Sure.  As follow-ups?

Of course.

Jakub


Re: [RFC, Fortran, (pr66775)] Allocatable function result

2015-07-13 Thread Mike Stump
On Jul 11, 2015, at 4:58 AM, Dan Nagle  wrote:
> The standard is written in standardese, not English.

While what you say is true, sorry, shall _is_ English:

  used in laws, regulations, or directives to express what is mandatory


[v3 patch] Remove __gnu_cxx::__alloc_traits::_S_nothrow_swap()

2015-07-13 Thread Jonathan Wakely

As I suggested recently, this removes the _S_nothrow_swap() function
because it's redundant: allocators are not allowed to throw exceptions
when swapped, so we can assume they don't.

So now exception specifications for swapping containers don't depend
on allocators.

This also adds a _GLIBCXX_NOEXCEPT_IF macro for helping to define
conditional noexcept guarantees.

Tested powerpc64le-linux, committed to trunk.

commit 54d85a21c3b1c94d1bf28bc095ae9d373e180802
Author: Jonathan Wakely 
Date:   Sat Jul 11 12:13:09 2015 +0100

	* include/bits/c++config (_GLIBCXX_NOEXCEPT_IF): Define.
	* include/bits/forward_list.h (forward_list::swap): Make noexcept
	unconditional.
	* include/bits/hashtable.h (_Hashtable::swap): Do not use
	_S_nothrow_swap().
	* include/bits/stl_bvector.h (vector::swap): Make noexcept
	unconditional.
	* include/bits/stl_deque.h (deque::swap): Likewise.
	(swap(deque&, deque&)): Use _GLIBCXX_NOEXCEPT_IF.
	* include/bits/stl_list.h (list::swap): Make noexcept unconditional.
	(swap(list&, list&)): Use _GLIBCXX_NOEXCEPT_IF.
	* include/bits/stl_map.h (map::swap, swap(map&, map&)): Use
	_GLIBCXX_NOEXCEPT_IF, do not depend on _S_nothrow_swap.
	* include/bits/stl_multimap.h (multimap::swap,
	swap(multimap&, multimap&)): Likewise.
	* include/bits/stl_multiset.h (multiset::swap,
	swap(multiset&, multiset&)): Likewise.
	* include/bits/stl_set.h (set::swap, swap(set&, set&)): Likewise.
	* include/bits/stl_tree.h (_Rb_tree::swap,
	swap(_Rb_tree&, _Rb_tree&)): Likewise.
	* include/bits/stl_vector.h (vector::swap): Make noexcept
	unconditional.
	(swap(vector&, vector&)): Use _GLIBCXX_NOEXCEPT_IF.
	* include/debug/deque (deque::swap, swap): Likewise.
	* include/debug/forward_list (swap): Add noexcept.
	* include/debug/list (list::swap, swap): Use _GLIBCXX_NOEXCEPT_IF.
	* include/debug/map.h (map::swap, swap): Likewise.
	* include/debug/multimap.h (multimap::swap, swap): Likewise.
	* include/debug/multiset.h (multiset::swap, swap): Likewise.
	* include/debug/set.h (set::swap, swap): Likewise.
	* include/debug/unordered_map (unordered_map::swap,
	unordered_multimap::swap, swap): Likewise.
	* include/debug/unordered_set (unordered_set::swap,
	unordered_multiset::swap, swap): Likewise.
	* include/debug/vector (vector::swap, swap): Likewise.
	* include/ext/alloc_traits.h (__alloc_traits::_S_nothrow_swap()):
	Remove.
	* include/profile/deque (deque::swap, swap): Use _GLIBCXX_NOEXCEPT_IF.
	* include/profile/forward_list (swap): Add noexcept.
	* include/profile/list (list::swap, swap): Use _GLIBCXX_NOEXCEPT_IF.
	* include/profile/map.h (map::swap, swap): Likewise.
	* include/profile/multimap.h (multimap::swap, swap): Likewise.
	* include/profile/multiset.h (multiset::swap, swap): Likewise.
	* include/profile/set.h (set::swap, swap): Likewise.
	* include/profile/unordered_map (swap): Likewise.
	* include/profile/unordered_set (swap): Likewise.
	* include/profile/vector (vector::swap, swap): Likewise. Remove
	overloads for swapping rvalues.
	* testsuite/23_containers/deque/allocator/noexcept.cc: Update tests
	for noexcept on swap.
	* testsuite/23_containers/forward_list/allocator/noexcept.cc:
	Likewise.
	* testsuite/23_containers/list/allocator/noexcept.cc: Likewise.
	* testsuite/23_containers/map/allocator/noexcept.cc: Likewise.
	* testsuite/23_containers/multimap/allocator/noexcept.cc: Likewise.
	* testsuite/23_containers/multiset/allocator/noexcept.cc: Likewise.
	* testsuite/23_containers/set/allocator/noexcept.cc: Likewise.
	* testsuite/23_containers/unordered_map/allocator/noexcept.cc:
	Likewise.
	* testsuite/23_containers/unordered_multimap/allocator/noexcept.cc:
	Likewise.
	* testsuite/23_containers/unordered_multiset/allocator/noexcept.cc:
	Likewise.
	* testsuite/23_containers/unordered_set/allocator/noexcept.cc:
	Likewise.
	* testsuite/23_containers/vector/allocator/noexcept.cc: Likewise.
	* testsuite/23_containers/vector/bool/allocator/noexcept.cc: Likewise.
	* testsuite/ext/profile/mutex_extensions_neg.cc: Adjust dg-error line
	number.

diff --git a/libstdc++-v3/include/bits/c++config b/libstdc++-v3/include/bits/c++config
index ae3065f..34acca3 100644
--- a/libstdc++-v3/include/bits/c++config
+++ b/libstdc++-v3/include/bits/c++config
@@ -115,10 +115,12 @@
 #ifndef _GLIBCXX_NOEXCEPT
 # if __cplusplus >= 201103L
 #  define _GLIBCXX_NOEXCEPT noexcept
+#  define _GLIBCXX_NOEXCEPT_IF(_COND) noexcept(_COND)
 #  define _GLIBCXX_USE_NOEXCEPT noexcept
 #  define _GLIBCXX_THROW(_EXC)
 # else
 #  define _GLIBCXX_NOEXCEPT
+#  define _GLIBCXX_NOEXCEPT_IF(_COND)
 #  define _GLIBCXX_USE_NOEXCEPT throw()
 #  define _GLIBCXX_THROW(_EXC) throw(_EXC)
 # endif
diff --git a/libstdc++-v3/include/bits/forward_list.h b/libstdc++-v3/include/bits/forward_list.h

Re: [gomp4.1] depend(sink) and depend(source) parsing for C

2015-07-13 Thread Aldy Hernandez

On 07/13/2015 06:56 AM, Jakub Jelinek wrote:

On Sat, Jul 11, 2015 at 11:35:36AM -0700, Aldy Hernandez wrote:



On the C++ FE side, please also try a testcase in g++.dg/gomp/ where
the ordered(n) loop with #pragma omp ordered depend({source,sink}) will be
in a template, to make sure pt.c does the right thing with it.


I assume you mean something like:

void bar (int, int, int);

template <typename T>
T baz (T arg)
{
  int i, j, k;
#pragma omp parallel for ordered(2)
  for (i=0; i < 100; ++i)
for (j=0; j < 100; ++j)
  {
#pragma omp ordered depend(sink:i-3,j)
bar (i, j, 0);
  }
  return arg;
}

int main()
{
  return baz(5);
}

??

Also, was this supposed to work?:

template <int N>
int foo()
{
  int i, j, k;
#pragma omp parallel for ordered(N)
  for (i=0; i < 100; ++i)
for (j=0; j < 100; ++j)
  {
extern void bark();
bark();
  }
}

The above was broken before I arrived.

And if this last example is supposed to work, I should probably address 
the same thing for sink offsets.



If you want to spend time on something still in the FE, it would be nice to
resolve the C++ iteration var issue (i.e. increase OMP_FOR number of
arguments, so that it could have yet another (optional) vector, say
OMP_FOR_ORIG_DECLS.  If that vector would be NULL, the gimplifier would
assume that all the decls in OMP_FOR_INIT are the ones present in the
source, if it would be present, you'd use them for the variable checking
instead of the ones from OMP_FOR_INIT (but, replace them with the
decls from OMP_FOR_INIT after the checking).

There is another issue - if some iterator var has pointer type, supposedly
we want somewhere in the FEs already multiply it by the size of what they
point to (and convert to sizetype).  For C FE, it can be done already during
parsing, we should know the type of the iterator var already at that point,
for C++ FE it needs to be done only in finish_omp_clauses if
!processing_template_decl, because in templates we might not know the type.


Sure.  As follow-ups?

Aldy



[PATCH 2/2, i386]: Fix PR 58066, __tls_get_addr is called with misaligned stack on x86-64

2015-07-13 Thread Uros Bizjak
This is the target-dependent part of a two-patch series.

The scheduler is free to move stack adjustments through the
tls_global_dynamic_64 and tls_local_dynamic_base_64 patterns,
misaligning the stack for the embedded call to __tls_get_addr.

The patch makes these patterns dependent on SP_REG.

Patch was bootstrapped and regression tested on x86_64-linux-gnu
{,-m32} for all default languages, obj-c++ and go.

Patch will be committed to mainline after patch 1/2 from the series.

Uros.

Index: config/i386/i386.md
===
--- config/i386/i386.md	(revision 225727)
+++ config/i386/i386.md	(working copy)
@@ -13158,7 +13158,8 @@
 (call:P
  (mem:QI (match_operand 2 "constant_call_address_operand" "Bz"))
  (match_operand 3)))
-   (unspec:P [(match_operand 1 "tls_symbolic_operand")]
+   (unspec:P [(match_operand 1 "tls_symbolic_operand")
+  (reg:P SP_REG)]
  UNSPEC_TLS_GD)]
   "TARGET_64BIT"
 {
@@ -13182,8 +13183,9 @@
  (mem:QI (plus:DI (match_operand:DI 2 "register_operand" "b")
   (match_operand:DI 3 "immediate_operand" "i")))
  (match_operand 4)))
-   (unspec:DI [(match_operand 1 "tls_symbolic_operand")]
- UNSPEC_TLS_GD)]
+   (unspec:DI [(match_operand 1 "tls_symbolic_operand")
+   (reg:DI SP_REG)]
+  UNSPEC_TLS_GD)]
   "TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF
&& GET_CODE (operands[3]) == CONST
&& GET_CODE (XEXP (operands[3], 0)) == UNSPEC
@@ -13204,7 +13206,8 @@
   (call:P
(mem:QI (match_operand 2))
(const_int 0)))
- (unspec:P [(match_operand 1 "tls_symbolic_operand")]
+ (unspec:P [(match_operand 1 "tls_symbolic_operand")
+(reg:P SP_REG)]
UNSPEC_TLS_GD)])]
   "TARGET_64BIT"
   "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
@@ -13254,7 +13257,7 @@
 (call:P
  (mem:QI (match_operand 1 "constant_call_address_operand" "Bz"))
  (match_operand 2)))
-   (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)]
+   (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE)]
   "TARGET_64BIT"
 {
   output_asm_insn
@@ -13272,7 +13275,7 @@
  (mem:QI (plus:DI (match_operand:DI 1 "register_operand" "b")
   (match_operand:DI 2 "immediate_operand" "i")))
  (match_operand 3)))
-   (unspec:DI [(const_int 0)] UNSPEC_TLS_LD_BASE)]
+   (unspec:DI [(reg:DI SP_REG)] UNSPEC_TLS_LD_BASE)]
   "TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF
&& GET_CODE (operands[2]) == CONST
&& GET_CODE (XEXP (operands[2], 0)) == UNSPEC
@@ -13293,7 +13296,7 @@
(call:P
 (mem:QI (match_operand 1))
 (const_int 0)))
-  (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)])]
+  (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE)])]
   "TARGET_64BIT"
   "ix86_tls_descriptor_calls_expanded_in_cfun = true;")


[PATCH 1/2, rtl-optimization]: Fix PR 58066, __tls_get_addr is called with misaligned stack on x86-64

2015-07-13 Thread Uros Bizjak
This is rtl-optimization part of a two-part patch series.

As discussed in the PR, we have to precompute register parameters
before stack alignment is performed, otherwise an eventual call to
__tls_get_addr can be made with an unaligned stack. When compiling the
testcase from the PR, anti_adjust_stack is called just before
precompute starts expanding function parameters.

The solution is to move the precomputation before the stack pointer is adjusted.

2015-07-13  Uros Bizjak  

PR rtl-optimization/58066
* calls.c (expand_call): Precompute register parameters before stack
alignment is performed.

Patch was bootstrapped and regression tested on x86_64-linux-gnu
{,-m32} for all default languages, obj-c++ and go.

OK for mainline?

Uros.

Index: calls.c
===
--- calls.c	(revision 225727)
+++ calls.c	(working copy)
@@ -3144,6 +3144,10 @@ expand_call (tree exp, rtx target, int ignore)

   compute_argument_addresses (args, argblock, num_actuals);

+  /* Precompute all register parameters.  It isn't safe to compute anything
+ once we have started filling any specific hard regs.  */
+  precompute_register_parameters (num_actuals, args, &reg_parm_seen);
+
   /* Perform stack alignment before the first push (the last arg).  */
   if (argblock == 0
   && adjusted_args_size.constant > reg_parm_stack_space
@@ -3184,10 +3188,6 @@ expand_call (tree exp, rtx target, int ignore)

   funexp = rtx_for_function_call (fndecl, addr);

-  /* Precompute all register parameters.  It isn't safe to compute anything
- once we have started filling any specific hard regs.  */
-  precompute_register_parameters (num_actuals, args, &reg_parm_seen);
-
   if (CALL_EXPR_STATIC_CHAIN (exp))
 static_chain_value = expand_normal (CALL_EXPR_STATIC_CHAIN (exp));
   else


[PATCH] Fix typo in rtl.c

2015-07-13 Thread Marek Polacek
While working on a new warning I found this typo.

Bootstrapped/regtested on x86_64-linux, ok for trunk?

2015-07-13  Marek Polacek  

* rtl.c (rtx_equal_p_cb): Fix typo.

diff --git gcc/rtl.c gcc/rtl.c
index dccf298..b1b485e 100644
--- gcc/rtl.c
+++ gcc/rtl.c
@@ -441,7 +441,7 @@ rtx_equal_p_cb (const_rtx x, const_rtx y, 
rtx_equal_p_callback_function cb)
 
 case DEBUG_PARAMETER_REF:
   return DEBUG_PARAMETER_REF_DECL (x)
-== DEBUG_PARAMETER_REF_DECL (x);
+== DEBUG_PARAMETER_REF_DECL (y);
 
 case ENTRY_VALUE:
   return rtx_equal_p_cb (ENTRY_VALUE_EXP (x), ENTRY_VALUE_EXP (y), cb);

Marek


Re: [PATCH] PR target/66824: Allow software FP SFmode in FP splitter

2015-07-13 Thread H.J. Lu
On Sun, Jul 12, 2015 at 10:56 AM, Uros Bizjak  wrote:
> On Sat, Jul 11, 2015 at 9:23 PM, H.J. Lu  wrote:
>> On Thu, Jul 09, 2015 at 01:58:22PM -0700, H.J. Lu wrote:
>>> On Thu, Jul 09, 2015 at 12:13:38PM -0700, H.J. Lu wrote:
>>> > ix86_split_long_move can optimize floating point constant move, which
>>> > can be used to optimize SFmode move for IA MCU.
>>> >
>>> > OK for trunk if there is no regression?
>>> >
>>> >
>>> > H.J.
>>> > ---
>>> > gcc/
>>> >
>>> > PR target/66824
>>> > * config/i386/i386.c (ix86_split_to_parts): Allow SFmode move
>>> > for IA MCU.
>>> > (ix86_split_long_move): Support single move.
>>> > * config/i386/i386.md (FP splitter): Allow SFmode for IA MCU.
>>> >
>>> > gcc/testsuite/
>>> >
>>> > PR target/66824
>>> > * gcc.target/i386/pr66824.c: New test.
>>> > ---
>>>
>>>
>>> I missed the testcase.  Here is the updated patch.
>>>
>>
>> ix86_split_long_move can optimize floating point constant move, which
>> can be used to optimize SFmode move with software floating point.
>>
>> OK for trunk if there are no regressions?
>
> No, this patch is wrong. Please investigate why "*movsf_internal"
> doesn't use "?r/rmF" alternative in case FP regs are unavailable.
> Perhaps you should add new alternative with a conditional constraint,
> but without "?". And... please use:
>

I couldn't figure a way to add conditional constraints for "?r/rmF" and
"r/rmF".   I simply disabled *movsf_internal if TARGET_HARD_FP_REGS
is false and added a new "*movsf_internal_soft_fp" pattern.

OK for trunk if there are no regressions?

> #define TARGET_HARD_FP_REGS (TARGET_80387 || TARGET_MMX || TARGET_SSE)
>
> Uros.

Thanks.

-- 
H.J.
From 87a92b1168c37e7607d0c839860f2eecf0f34345 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Thu, 9 Jul 2015 12:06:40 -0700
Subject: [PATCH] Add *movsf_internal_soft_fp pattern

Without hard floating point registers, general purpose registers are
used and we should generate mov with general purpose registers for
SFmode load/store.

gcc/

	PR target/66824
	* config/i386/i386.h (TARGET_HARD_FP_REGS): New.
	* config/i386/i386.md (*movsf_internal): Enable only if
	TARGET_HARD_FP_REGS is true.
	(*movsf_internal_soft_fp): New pattern.  Enable only if
	TARGET_HARD_FP_REGS is false.

gcc/testsuite/

	PR target/66824
	* gcc.target/i386/pr66824.c: New test.

Revert ix86_split_to_parts/FP splitter

pr66824
---
 gcc/config/i386/i386.h  |  2 ++
 gcc/config/i386/i386.md | 12 
 gcc/testsuite/gcc.target/i386/pr66824.c | 29 +
 3 files changed, 43 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr66824.c

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 0fcf391..3b7cf92 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -164,6 +164,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_16BIT	TARGET_CODE16
 #define TARGET_16BIT_P(x)	TARGET_CODE16_P(x)
 
+#define TARGET_HARD_FP_REGS	(TARGET_80387 || TARGET_MMX || TARGET_SSE)
+
 /* SSE4.1 defines round instructions */
 #define	OPTION_MASK_ISA_ROUND	OPTION_MASK_ISA_SSE4_1
 #define	TARGET_ISA_ROUND	((ix86_isa_flags & OPTION_MASK_ISA_ROUND) != 0)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 354532a..c83cf6d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3325,6 +3325,7 @@
 	(match_operand:SF 1 "general_operand"
 	  "Yf*fm,Yf*f,G   ,rmF,rF,C,v,m,v,Yj,r  ,*y ,m  ,*y,*Yn,r"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && TARGET_HARD_FP_REGS
&& (!can_create_pseudo_p ()
|| (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
|| !CONST_DOUBLE_P (operands[1])
@@ -3444,6 +3445,17 @@
 	  ]
 	  (const_string "SF")))])
 
+(define_insn "*movsf_internal_soft_fp"
+  [(set (match_operand:SF 0 "nonimmediate_operand"
+	  "=r ,m")
+	(match_operand:SF 1 "general_operand"
+	  "rmF,rF"))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && !TARGET_HARD_FP_REGS"
+  "mov{l}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "imov")
+   (set_attr "mode" "SI")])
+
 (define_split
   [(set (match_operand 0 "any_fp_register_operand")
 	(match_operand 1 "memory_operand"))]
diff --git a/gcc/testsuite/gcc.target/i386/pr66824.c b/gcc/testsuite/gcc.target/i386/pr66824.c
new file mode 100644
index 0000000..3511e4c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr66824.c
@@ -0,0 +1,29 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -mno-sse -mno-mmx -mno-80387" } */
+/* { dg-final { scan-assembler-not "\.LC\[0-9\]" } } */
+
+double foo (float);
+
+double
+f1 (void)
+{
+  return foo (1.0);
+}
+
+double
+f2 (void)
+{
+  return foo (0.0);
+}
+
+void
+f3 (float *x, float t)
+{
+  *x = 0.0 + t;
+}
+
+float
+f4 (void)
+{
+  return 1.0;
+}
-- 
2.4.3



Re: [PATCH][ARM][testsuite] Fix FAIL: gcc.target/arm/macro_defs0.c and macro_defs1.c when -marm forced

2015-07-13 Thread Kyrill Tkachov

Hi Mantas,

On 05/03/15 10:14, Mantas Mikaitis wrote:

Hello,

Tests gcc.target/arm/macro_defs0.c and gcc.target/arm/macro_defs1.c fail
in multilib which forces -marm as pointed out in this message:
https://gcc.gnu.org/ml/gcc-patches/2015-02/msg00483.html .

This patch will cause these tests to be classified as unsupported rather
than FAIL.

Ok for trunk?

Kind regards,
Mantas M.

2015-03-05  Mantas Mikaitis  

  * gcc.target/arm/macro_defs0.c: added directive to skip
  test if -marm is present.
  * gcc.target/arm/macro_defs1.c: added directive to skip
  test if -marm is present.


Ok for trunk, sorry for the delay.
I have committed this to trunk for you as r225742 with the
ChangeLog entry:

2015-07-13  Mantas Mikaitis  

* gcc.target/arm/macro_defs0.c: Add directive to skip
test if -marm is present.
* gcc.target/arm/macro_defs1.c: Likewise.



Re: [gomp4.1 WIP] omp_target_* libgomp APIs

2015-07-13 Thread Ilya Verbin
On Mon, Jul 13, 2015 at 17:26:43 +0200, Jakub Jelinek wrote:
> > > > > +  /* FIXME: Support device-to-device somehow?  */
> > > > 
> > > > Should libgomp copy data device-host-device if device-device is not 
> > > > supported by
> > > > target?  Current liboffloadmic doesn't support this.  I'll find out if 
> > > > there are
> > > > any plans.
> > > 
> > > There is also the option to spawn an offloaded function that will just 
> > > call
> > > memcpy, or have such a function next to the main () of the program that 
> > > we link
> > > in.
> > 
> > Do you mean the case when src_devicep == dst_devicep ?  It's easy to support
> > this by adding new func into plugin, without any changes in liboffloadmic.
> > I thought about memcpy between different devices...
> 
> Well, even src_devicep == dst_devicep does not guarantee it is the same
> device, that is the case only if also src_devicep->target_id ==
> dst_devicep->target_id, right?

Why?  Devices of one type with different target_id's have different entries in
devices[].

> I wouldn't worry about that and just return EINVAL when copying in between
> different devices.

I'll prepare a patch, which will add an interface for copying within one device,
covered by GOMP_OFFLOAD_CAP_OPENMP_400.

  -- Ilya


[gomp4] Add additional test for declare directive

2015-07-13 Thread James Norris

Hi,

The attached patch adds a test for the copyout clause with
the declare directive. The testing of this clause was
overlooked.

Committed to gomp-4_0-branch.

Jim
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-1.c
index 584b921..8fbec4d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/declare-1.c
@@ -7,6 +7,23 @@
 #define N 8
 
 void
+subr2 (int *a)
+{
+  int i;
+  int f[N];
+#pragma acc declare copyout (f)
+
+#pragma acc parallel copy (a[0:N])
+  {
+for (i = 0; i < N; i++)
+  {
+	f[i] = a[i];
+	a[i] = f[i] + f[i] + f[i];
+  }
+  }
+}
+
+void
 subr1 (int *a)
 {
   int f[N];
@@ -93,5 +110,13 @@ main (int argc, char **argv)
 	abort ();
 }
 
+  subr2 (&a[0]);
+
+  for (i = 0; i < 1; i++)
+{
+  if (a[i] != 1234 * 6)
+	abort ();
+}
+
   return 0;
 }


Re: [PATCH, ARM] stop changing signedness in PROMOTE_MODE

2015-07-13 Thread H.J. Lu
On Mon, Jul 13, 2015 at 8:29 AM, Michael Matz  wrote:
> Hi,
>
> On Mon, 13 Jul 2015, Richard Biener wrote:
>
>> On Fri, Jul 10, 2015 at 5:46 PM, Jim Wilson  wrote:
>> > On Tue, Jul 7, 2015 at 2:35 PM, Richard Biener
>> >  wrote:
>> >> On July 7, 2015 6:29:21 PM GMT+02:00, Jim Wilson  
>> >> wrote:
>> >>>signed sub-word locals.  Thus to detect the need for a conversion, you
>> >>>have to have the decls, and we don't have them here.  There is also
>> >>
>> >> It probably is.  The decls for the parameter based SSA names are 
>> >> available, for the PHI destination there might be no decl.
>> >
>> > I tried looking again, and found the decls.  I'm able to get correct
>> > code for my testcase with the attached patch to force the conversion.
>> > It is rather inelegant, but I think I can cache the values I need to
>> > make this simpler and cleaner.  I still don't have decls from
>> > insert_part_to_rtx_on_edge and insert_rtx_to_part_on_edge, but it
>> > looks like those are for breaking cycles, and hence might not need
>> > conversions.
>>
>> Yes, that looks like a defect.  CCing Micha who wrote this code
>
> I think it's a backend bug that parameters and locals are extended
> differently.  The code in tree-outof-ssa was written with the assumption
> that the modes of RTL objects might be different (larger) than the tree
> types suggest, but that they be _consistent_, i.e. that the particular
> extension depends on only the types, not on the tree-type of the decl.
>
> I think the above assumption does make sense because it's also a
> fundamental assumption in the whole gimple pipeline, types matter, not the
> objects (or better, we slowly but surely work towards this).  Hence such
> mismatches should either not exist (changing the backend), or should be
> exposed explicitly during gimplification already.  The latter is a large
> change, though.
>
> I think dealing with this situation in outof-ssa is a hack and I don't
> like it.  It would extend the ugliness of different modes for same types
> even more, and that's something we should (gradually) move away from.
>

Different parts of GCC have their own ideas how parameters should
be promoted, which leads to subtle bugs like PR 64037.


-- 
H.J.


[Fortran, Patch] Passing function pointer to co_reduce

2015-07-13 Thread Alessandro Fanfarillo
Dear all,

during the implementation of co_reduce in OpenCoarrays I noticed that
GFortran passes a pointer to function instead of the function name to
co_reduce.

Currently the compiler produces the following call:

_gfortran_caf_co_reduce (&desc.0, &simple_reduction, 0, 0, 0B, 0B, 0, 0);

where simple_reduction is the pure function that has to be used by co_reduce.

The attached patch seems to fix the issue, any comments?


Regards

Alessandro

PS: I also attach the test case
commit a12b6ce6993df109c81a32d5684b4b9f41f69ea4
Author: Alessandro Fanfarillo 
Date:   Mon Jul 13 15:46:19 2015 +0200

Fix function pointer argument for co_reduce

diff --git a/gcc/fortran/trans-intrinsic.c b/gcc/fortran/trans-intrinsic.c
index 66bc72a..967a741 100644
--- a/gcc/fortran/trans-intrinsic.c
+++ b/gcc/fortran/trans-intrinsic.c
@@ -8804,7 +8804,7 @@ conv_co_collective (gfc_code *code)
}
   opr_flags = build_int_cst (integer_type_node, opr_flag_int);
   gfc_conv_expr (&argse, opr_expr);
-  opr = gfc_build_addr_expr (NULL_TREE, argse.expr);
+  opr = argse.expr;
   fndecl = build_call_expr_loc (input_location, fndecl, 8, array, opr, 
opr_flags,
image_index, stat, errmsg, strlen, 
errmsg_len);
 }
program simple_reduce
  implicit none

  integer :: me

  me = this_image()

  sync all

  call co_reduce(me,simple_reduction)

  write(*,*) this_image(),me

contains
  
  pure function simple_reduction(a,b)
integer,intent(in) :: a,b
integer :: simple_reduction

simple_reduction = a * b
  end function simple_reduction

end program simple_reduce


Re: [PATCH, ARM] stop changing signedness in PROMOTE_MODE

2015-07-13 Thread Michael Matz
Hi,

On Mon, 13 Jul 2015, Richard Biener wrote:

> On Fri, Jul 10, 2015 at 5:46 PM, Jim Wilson  wrote:
> > On Tue, Jul 7, 2015 at 2:35 PM, Richard Biener
> >  wrote:
> >> On July 7, 2015 6:29:21 PM GMT+02:00, Jim Wilson  
> >> wrote:
> >>>signed sub-word locals.  Thus to detect the need for a conversion, you
> >>>have to have the decls, and we don't have them here.  There is also
> >>
> >> It probably is.  The decls for the parameter based SSA names are 
> >> available, for the PHI destination there might be no decl.
> >
> > I tried looking again, and found the decls.  I'm able to get correct
> > code for my testcase with the attached patch to force the conversion.
> > It is rather inelegant, but I think I can cache the values I need to
> > make this simpler and cleaner.  I still don't have decls from
> > insert_part_to_rtx_on_edge and insert_rtx_to_part_on_edge, but it
> > looks like those are for breaking cycles, and hence might not need
> > conversions.
> 
> Yes, that looks like a defect.  CCing Micha who wrote this code

I think it's a backend bug that parameters and locals are extended 
differently.  The code in tree-outof-ssa was written with the assumption 
that the modes of RTL objects might be different (larger) than the tree 
types suggest, but that they be _consistent_, i.e. that the particular 
extension depends on only the types, not on the tree-type of the decl.

I think the above assumption does make sense because it's also a 
fundamental assumption in the whole gimple pipeline, types matter, not the 
objects (or better, we slowly but surely work towards this).  Hence such 
mismatches should either not exist (changing the backend), or should be 
exposed explicitly during gimplification already.  The latter is a large 
change, though.

I think dealing with this situation in outof-ssa is a hack and I don't 
like it.  It would extend the ugliness of different modes for same types 
even more, and that's something we should (gradually) move away from.


Ciao,
Michael.


Re: [gomp4.1 WIP] omp_target_* libgomp APIs

2015-07-13 Thread Jakub Jelinek
On Mon, Jul 13, 2015 at 06:15:45PM +0300, Ilya Verbin wrote:
> On Mon, Jul 13, 2015 at 16:03:06 +0200, Jakub Jelinek wrote:
> > On Mon, Jul 13, 2015 at 04:38:33PM +0300, Ilya Verbin wrote:
> > > On Mon, Jul 13, 2015 at 15:17:29 +0200, Jakub Jelinek wrote:
> > > > +  k->refcount = INT_MAX;
> > > 
> > > Shouldn't it be UINTPTR_MAX?
> > 
> > Dunno if we can count on it being in stdint.h on all targets.
> > Perhaps
> > #define REFCOUNT_INFINITY (~(uintptr_t) 0)
> > ?
> 
> Probably, I don't know.

Ok, I'll change this later.
> 
> > > > +  /* FIXME: Support device-to-device somehow?  */
> > > 
> > > Should libgomp copy data device-host-device if device-device is not 
> > > supported by
> > > target?  Current liboffloadmic doesn't support this.  I'll find out if 
> > > there are
> > > any plans.
> > 
> > There is also the option to spawn an offloaded function that will just call
> > memcpy, or have such a function next to the main () of the program that we 
> > link
> > in.
> 
> Do you mean the case when src_devicep == dst_devicep ?  It's easy to support
> this by adding new func into plugin, without any changes in liboffloadmic.
> I thought about memcpy between different devices...

Well, even src_devicep == dst_devicep does not guarantee it is the same
device, that is the case only if also src_devicep->target_id ==
dst_devicep->target_id, right?
I wouldn't worry about that and just return EINVAL when copying in between
different devices.

> > Also, could you see if the 2 and 3 dimension memcpy_rect couldn't be handled
> > more efficiently by liboffloadmic too?
> > From what I can see, on the cuda side there is some cudaMemcpy2D and
> > cudaMemcpy3D, though I admit I haven't studied in detail what exactly they
> > do.
> 
> I'll try to find out.

Thanks.  I haven't looked exactly how the copying is implemented, but if it
is done by sending some control info plus (for copying to device) the data
itself, and for copying from device reading the data back, then if
the control data could be extended to pass in the device 2D or 3D
slice/offset/volume info and you could readv or writev the data...

Jakub


Re: [gomp4.1 WIP] omp_target_* libgomp APIs

2015-07-13 Thread Ilya Verbin
On Mon, Jul 13, 2015 at 16:03:06 +0200, Jakub Jelinek wrote:
> On Mon, Jul 13, 2015 at 04:38:33PM +0300, Ilya Verbin wrote:
> > On Mon, Jul 13, 2015 at 15:17:29 +0200, Jakub Jelinek wrote:
> > > +  k->refcount = INT_MAX;
> > 
> > Shouldn't it be UINTPTR_MAX?
> 
> Dunno if we can count on it being in stdint.h on all targets.
> Perhaps
> #define REFCOUNT_INFINITY (~(uintptr_t) 0)
> ?

Probably, I don't know.

> > > +  /* FIXME: Support device-to-device somehow?  */
> > 
> > Should libgomp copy data device-host-device if device-device is not 
> > supported by
> > target?  Current liboffloadmic doesn't support this.  I'll find out if 
> > there are
> > any plans.
> 
> There is also the option to spawn an offloaded function that will just call
> memcpy, or have such a function next to the main () of the program that we 
> link
> in.

Do you mean the case when src_devicep == dst_devicep ?  It's easy to support
this by adding new func into plugin, without any changes in liboffloadmic.
I thought about memcpy between different devices...

> Also, could you see if the 2 and 3 dimension memcpy_rect couldn't be handled
> more efficiently by liboffloadmic too?
> From what I can see, on the cuda side there is some cudaMemcpy2D and
> cudaMemcpy3D, though I admit I haven't studied in detail what exactly they
> do.

I'll try to find out.

  -- Ilya


Re: [PATCH] Fix PR c++/65186 (bound template template parm as valid nontype parm)

2015-07-13 Thread Jason Merrill

OK.

Jason



[PATCH, PR66851] Handle double reduction in parloops

2015-07-13 Thread Tom de Vries

Hi,

this patch fixes PR66851.

In parloops, we manage to parallelize outer loops, but not if the inner 
loop contains a reduction. There is an xfail in autopar/outer-4.c for this:

...
/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 
"parloops" { xfail *-*-* } } } */

...

This patch allows outer loops with a reduction in the inner loop to be 
parallelized.


Bootstrapped and reg-tested on x86_64.

OK for trunk?

Thanks,
- Tom
Handle double reduction in parloops

2015-07-13  Tom de Vries  

	PR tree-optimization/66851
	* tree-parloops.c (reduc_stmt_res): New function.
	(initialize_reductions, add_field_for_reduction)
	(create_phi_for_local_result, create_loads_for_reductions)
	(create_stores_for_reduction, build_new_reduction): Handle case that
	reduc_stmt is a phi.
	(gather_scalar_reductions): Allow double_reduc reductions.

	* gcc.dg/autopar/outer-4.c (parloop): Remove superfluous noinline
	attribute.  Remove xfail on scan for parallelizing outer loop.
	(main): Remove.

	* testsuite/libgomp.c/outer-4.c: New test.
---
 gcc/testsuite/gcc.dg/autopar/outer-4.c | 17 
 gcc/tree-parloops.c| 37 +-
 libgomp/testsuite/libgomp.c/outer-4.c  | 36 +
 3 files changed, 68 insertions(+), 22 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.c/outer-4.c

diff --git a/gcc/testsuite/gcc.dg/autopar/outer-4.c b/gcc/testsuite/gcc.dg/autopar/outer-4.c
index 6fd37c5..f435080 100644
--- a/gcc/testsuite/gcc.dg/autopar/outer-4.c
+++ b/gcc/testsuite/gcc.dg/autopar/outer-4.c
@@ -6,15 +6,13 @@ void abort (void);
 int g_sum=0;
 int x[500][500];
 
-__attribute__((noinline))
-void parloop (int N)
+void
+parloop (int N)
 {
   int i, j;
   int sum;
 
-  /* Double reduction is currently not supported, outer loop is not 
- parallelized.  Inner reduction is detected, inner loop is 
- parallelized.  */
+  /* Double reduction is detected, outer loop is parallelized.  */
   sum = 0;
   for (i = 0; i < N; i++)
 for (j = 0; j < N; j++)
@@ -23,13 +21,6 @@ void parloop (int N)
   g_sum = sum;
 }
 
-int main(void)
-{
-  parloop(500);
-
-  return 0;
-}
-
 
-/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "parallelizing outer loop" 1 "parloops" } } */
 /* { dg-final { scan-tree-dump-times "loopfn" 4 "optimized" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 21ed17b..db7da62 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -560,6 +560,14 @@ take_address_of (tree obj, tree type, edge entry,
   return name;
 }
 
+static tree
+reduc_stmt_res (gimple stmt)
+{
+  return (gimple_code (stmt) == GIMPLE_PHI
+	  ? gimple_phi_result (stmt)
+	  : gimple_assign_lhs (stmt));
+}
+
 /* Callback for htab_traverse.  Create the initialization statement
for reduction described in SLOT, and place it at the preheader of
the loop described in DATA.  */
@@ -586,7 +594,7 @@ initialize_reductions (reduction_info **slot, struct loop *loop)
   c = build_omp_clause (gimple_location (reduc->reduc_stmt),
 			OMP_CLAUSE_REDUCTION);
   OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
-  OMP_CLAUSE_DECL (c) = SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt));
+  OMP_CLAUSE_DECL (c) = SSA_NAME_VAR (reduc_stmt_res (reduc->reduc_stmt));
 
   init = omp_reduction_init (c, TREE_TYPE (bvar));
   reduc->init = init;
@@ -993,7 +1001,7 @@ add_field_for_reduction (reduction_info **slot, tree type)
 {
 
   struct reduction_info *const red = *slot;
-  tree var = gimple_assign_lhs (red->reduc_stmt);
+  tree var = reduc_stmt_res (red->reduc_stmt);
   tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
 			   SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));
 
@@ -1053,12 +1061,12 @@ create_phi_for_local_result (reduction_info **slot, struct loop *loop)
 e = EDGE_PRED (store_bb, 1);
   else
 e = EDGE_PRED (store_bb, 0);
-  local_res = copy_ssa_name (gimple_assign_lhs (reduc->reduc_stmt));
+  tree lhs = reduc_stmt_res (reduc->reduc_stmt);
+  local_res = copy_ssa_name (lhs);
   locus = gimple_location (reduc->reduc_stmt);
   new_phi = create_phi_node (local_res, store_bb);
   add_phi_arg (new_phi, reduc->init, e, locus);
-  add_phi_arg (new_phi, gimple_assign_lhs (reduc->reduc_stmt),
-	   FALLTHRU_EDGE (loop->latch), locus);
+  add_phi_arg (new_phi, lhs, FALLTHRU_EDGE (loop->latch), locus);
   reduc->new_phi = new_phi;
 
   return 1;
@@ -1151,7 +1159,7 @@ create_loads_for_reductions (reduction_info **slot, struct clsn_data *clsn_data)
   struct reduction_info *const red = *slot;
   gimple stmt;
   gimple_stmt_iterator gsi;
-  tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
+  tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
   tree load_struct;
   tree name;
   tree x;
@@ -1212,7 +1220,7 @@ create_stores_for_reduction (reduction_info **slot, struct clsn_data *clsn_data)
   tree t

Re: [RFC] two-phase marking in gt_cleare_cache

2015-07-13 Thread Tom de Vries

On 13/07/15 16:21, Michael Matz wrote:

Hi,

On Mon, 13 Jul 2015, Tom de Vries wrote:


Implementing multi-step maps or making the hashmaps non-caching
doesn't solve any of the above problems


I'm not saying that making those hashmaps non-caching solves any of
these problems.


Ah, I didn't mean to imply this, I meant to imply that enforcing policy as
you do is a good thing because it finds bugs, and that the policy to be
enforced should be forbidding multi-step deps :)


I'm saying that it decouples fixing the policy (for which I have a
patch) from fixing the issues that allow us to use these 3 as caches
again (for which there are no patches yet). The advantage of having a
policy in place is that we won't regress for tables still marked as
cache (or new tables marked as cache). So blocking committing the policy
on those issues makes no sense IMHO.


That's right, I didn't argue for that either.


Great,  we're in agreement then :)


But there should then be at
least a PR with a patch that disables the work-arounds for policy breakers
(the three decl-debug hash-maps), that if applied breaks bootstrap, so
that the fact that there's still a real bug somewhere doesn't get lost.



Yep, there should be a PR to track these issues.

And I made the down-grades for each cache a single patch, to make it 
easy to revert once we fix all the issues for one table.


Now let's see if I can get approval for "Don't mark live recursively in 
gt_cleare_cache".


Thanks,
- Tom


Re: [PATCH] libstdc++ os_defines now required for DragonFly

2015-07-13 Thread Jonathan Wakely

On 06/07/15 15:57 +0200, John Marino wrote:

On the development branch of DragonFly BSD, it was discovered that
__LONG_LONG_SUPPORTED was accidentally unconditionally defined.  This had
a positive side effect of allowing GCC conftests to pass for C99 support
via wchar.h.  When the bug was fixed, the wchar C99 conftest now fails,
resulting in a c++ regression where software that previously compiled
now fails due to unknown functions such as wcstoll (since C99 support
changed from "true" to "false")

FreeBSD behaves exactly the same way, and that OS dealt with it with
system-specific defines.
The DragonFly regression is fixed by copying the relevant defines from
the FreeBSD config. (see attached patch).

This patch should be applied to trunk and also backported to GCC-5 branch.


Committed to trunk so far, we'll have to wait for the branch to
re-open next week.



Index: libstdc++-v3/config/os/bsd/dragonfly/os_defines.h
===
--- libstdc++-v3/config/os/bsd/dragonfly/os_defines.h   (revision 225453)
+++ libstdc++-v3/config/os/bsd/dragonfly/os_defines.h   (working copy)
@@ -29,4 +29,9 @@
// System-specific #define, typedefs, corrections, etc, go here.  This
// file will come before all others.

+#define _GLIBCXX_USE_C99_CHECK 1
+#define _GLIBCXX_USE_C99_DYNAMIC (!(__ISO_C_VISIBLE >= 1999))
+#define _GLIBCXX_USE_C99_LONG_LONG_CHECK 1
+#define _GLIBCXX_USE_C99_LONG_LONG_DYNAMIC (_GLIBCXX_USE_C99_DYNAMIC || !defined __LONG_LONG_SUPPORTED)
+
#endif




Re: [PATCH] [gomp] Recycle non-nested team if possible

2015-07-13 Thread Sebastian Huber



On 13/07/15 16:17, Jakub Jelinek wrote:

On Mon, Jul 13, 2015 at 01:15:44PM +0200, Sebastian Huber wrote:

diff --git a/libgomp/team.c b/libgomp/team.c
index b98b233..0bcbaf8 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -134,6 +134,25 @@ gomp_thread_start (void *xdata)
return NULL;
  }
  
+static struct gomp_team *

+get_recycable_team (unsigned nthreads)

That would be recyclable.
But I think get_last_team would be better.


Ok.


Also, please make it static inline.


Out of curiosity, does this make a difference for a static function in a 
module if it has the inline or not?





+  team = gomp_malloc (sizeof (*team) + nthreads * extra);
+
+#ifndef HAVE_SYNC_BUILTINS
+  gomp_mutex_init (&team->work_share_list_free_lock);
+#endif

Avoiding gomp_mutex_destroy/gomp_mutex_init is fine,
but I must say I'm far less sure about gomp_sem_init (can you
add there a temporary assert that it has the expected value)
and even less about gomp_barrier_init (I think e.g. on Linux
generation will be very unlikely 0 that it should be, and not
sure about awaited_final value either).

Jakub


I didn't observe any testsuite failures on x86_64-unknown-linux-gnu with 
this patch. I will add asserts and re-run the testsuite tomorrow.


--
Sebastian Huber, embedded brains GmbH

Address : Dornierstr. 4, D-82178 Puchheim, Germany
Phone   : +49 89 189 47 41-16
Fax : +49 89 189 47 41-09
E-Mail  : sebastian.hu...@embedded-brains.de
PGP : Public key available on request.

Diese Nachricht ist keine geschäftliche Mitteilung im Sinne des EHUG.



Re: [GOMP] a struct for offload target data

2015-07-13 Thread Bernd Schmidt

On 07/13/2015 03:57 PM, Nathan Sidwell wrote:

this patch changes the offload target data type from an array of void *,
to a struct, which is somewhat easier to deal with than remembering
numeric indices and type casts.

This is step  1 in reworking the launch API.


Looks fine.


Bernd




Re: [PATCH 2/4] Add liboffloadmic

2015-07-13 Thread Ilya Verbin
On Thu, Jul 09, 2015 at 12:00:29 +0200, Thomas Schwinge wrote:
> I noticed that -- at least with current versions of GCC -- there are
> several compiler diagnostics displayed during the build.  It would be
> nice to get these addressed -- as applicable, presumably in the Intel
> upstream version, and then a new import be done into GCC?  For example, I
> noticed the following changes in my build logs (not a complete list):
> 
> {+[...]/source-gcc/liboffloadmic/runtime/emulator/coi_device.cpp:112:28: 
> warning: invalid suffix on literal; C++11 requires a space between literal 
> and string macro [-Wliteral-suffix]+}
> {+   sprintf (pipe_host_path, "%s"PIPE_HOST_PATH, mic_dir);+}
> {+^+}
> {+[...]/source-gcc/liboffloadmic/runtime/emulator/coi_device.cpp:113:30: 
> warning: invalid suffix on literal; C++11 requires a space between literal 
> and string macro [-Wliteral-suffix]+}
> {+   sprintf (pipe_target_path, "%s"PIPE_TARGET_PATH, mic_dir);+}
> {+  ^+}
> 
> {+[...]/source-gcc/liboffloadmic/runtime/emulator/coi_host.cpp:892:24: 
> warning: invalid suffix on literal; C++11 requires a space between literal 
> and string macro [-Wliteral-suffix]+}
> {+   sprintf (pipes_path, "%s"PIPES_PATH, eng->dir);+}
> {+^+}
> {+[...]/source-gcc/liboffloadmic/runtime/emulator/coi_host.cpp:903:28: 
> warning: invalid suffix on literal; C++11 requires a space between literal 
> and string macro [-Wliteral-suffix]+}
> {+   sprintf (pipe_host_path, "%s"PIPE_HOST_PATH, eng->dir);+}
> {+^+}
> {+[...]/source-gcc/liboffloadmic/runtime/emulator/coi_host.cpp:904:30: 
> warning: invalid suffix on literal; C++11 requires a space between literal 
> and string macro [-Wliteral-suffix]+}
> {+   sprintf (pipe_target_path, "%s"PIPE_TARGET_PATH, eng->dir);+}
> {+  ^+}
> 
> [...]/source-gcc/liboffloadmic/runtime/offload_host.cpp:107:30: warning: 
> [-deprecated conversion from-]{+ISO C++ forbids converting a+} string 
> constant to 'char*' [-Wwrite-strings]
>  static char *timer_envname = "H_TIME";
>   ^
> 
> [...]/source-gcc/liboffloadmic/runtime/offload_myo_host.cpp: In function 
> 'void __intel_cilk_for_32_offload(int, void (*)(void*, void*), int, void*, 
> void*, unsigned int, unsigned int)':
> [...]/source-gcc/liboffloadmic/runtime/offload_myo_host.cpp:762:55: 
> warning: [-deprecated conversion from-]{+ISO C++ forbids converting a+} 
> string constant to 'char*' [-Wwrite-strings]
> args, target_number)
>^
> [...]/source-gcc/liboffloadmic/runtime/offload_myo_host.cpp: In function 
> 'void __intel_cilk_for_64_offload(int, void (*)(void*, void*), int, void*, 
> void*, uint64_t, uint64_t)':
> [...]/source-gcc/liboffloadmic/runtime/offload_myo_host.cpp:815:49: 
> warning: [-deprecated conversion from-]{+ISO C++ forbids converting a+} 
> string constant to 'char*' [-Wwrite-strings]
> target_number)
>  ^
> 
> [...]/source-gcc/liboffloadmic/runtime/offload_orsl.cpp:39:33: warning: 
> [-deprecated conversion from-]{+ISO C++ forbids converting a+} string 
> constant to 'ORSLTag {aka char*}' [-Wwrite-strings]
>  static const ORSLTag   my_tag = "Offload";

Yeah, they are already fixed in the upstream version.  I'll prepare an update
for GCC soon.

  -- Ilya


Re: [RFC] two-phase marking in gt_cleare_cache

2015-07-13 Thread Michael Matz
Hi,

On Mon, 13 Jul 2015, Tom de Vries wrote:

> > Implementing multi-step maps or making the hashmaps non-caching 
> > doesn't solve any of the above problems
> 
> I'm not saying that making those hashmaps non-caching solves any of 
> these problems.

Ah, I didn't mean to imply this, I meant to imply that enforcing policy as 
you do is a good thing because it finds bugs, and that the policy to be 
enforced should be forbidding multi-step deps :)

> I'm saying that it decouples fixing the policy (for which I have a 
> patch) from fixing the issues that allow us to use these 3 as caches 
> again (for which there are no patches yet). The advantage of having a 
> policy in place is that we won't regress for tables still marked as 
> cache (or new tables marked as cache). So blocking committing the policy 
> on those issues makes no sense IMHO.

That's right, I didn't argue for that either.  But there should then be at 
least a PR with a patch that disables the work-arounds for policy breakers 
(the three decl-debug hash-maps), that if applied breaks bootstrap, so 
that the fact that there's still a real bug somewhere doesn't get lost.


Ciao,
Michael.


Change genmatch (if ...) syntax

2015-07-13 Thread Richard Biener

The following patch fixes a subtle issue with how (if ...) is currently
operating.  Take

/* Transform comparisons of the form X * C1 CMP 0 to X CMP 0 in the
   signed arithmetic case.  That form is created by the compiler
   often enough for folding it to be of value.  One example is in
   computing loop trip counts after Operator Strength Reduction.  */
(for cmp (simple_comparison)
 scmp (swapped_simple_comparison)
 (simplify
  (cmp (mult @0 INTEGER_CST@1) integer_zerop@2)
  /* Handle unfolded multiplication by zero.  */
  (if (integer_zerop (@1))
   (cmp @1 @2))
  (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
   && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)))
   /* If @1 is negative we swap the sense of the comparison.  */
   (if (tree_int_cst_sgn (@1) < 0)
(scmp @0 @2))
   (cmp @0 @2

as an example.  Currently there is no "else" arm implemented
because of an implementation detail.  That implementation detail
is that currently the above is equivalent to three separate
simplify patterns, each collecting 'if's from their final result
expression.  Thus,

  (if (integer_zerop (@1))
   (cmp @1 @2))

  (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
   && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)))
   /* If @1 is negative we swap the sense of the comparison.  */
   (if (tree_int_cst_sgn (@1) < 0)
(scmp @0 @2))

and

  (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
   && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)))
   (cmp @0 @2

and the fact that the last one is "mostly" equivalent to

  (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
   && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)))
   (if (!(tree_int_cst_sgn (@1) < 0))
(cmp @0 @2

is just because the first one is guaranteed to match first - _if_ it
matches.  Matching can fail due to various reasons though and it
is not always obvious whether the fallthru is really equivalent to
an else.

Lack of an explicit else was also pointed out as a missing feature.
The following patch fixes the above implementation detail and keeps
a single 'simplify' for the whole if/with expression tree (thus
also not duplicating common ifs and withs).  It adds an explicit
else and removes the ability to "sequence" stuff via the fallthru.

Thus the above pattern now becomes

/* Transform comparisons of the form X * C1 CMP 0 to X CMP 0 in the
   signed arithmetic case.  That form is created by the compiler
   often enough for folding it to be of value.  One example is in
   computing loop trip counts after Operator Strength Reduction.  */
(for cmp (simple_comparison)
 scmp (swapped_simple_comparison)
 (simplify
  (cmp (mult @0 INTEGER_CST@1) integer_zerop@2)
  /* Handle unfolded multiplication by zero.  */
  (if (integer_zerop (@1))
   (cmp @1 @2)
   (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
&& TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)))
/* If @1 is negative we swap the sense of the comparison.  */
(if (tree_int_cst_sgn (@1) < 0)
 (scmp @0 @2)
 (cmp @0 @2))

as the new if syntax is

 (if (cond-expr)
   (then-expr)
  [(else-expr)])

with the else-expr being optional.  Note that writing

 (if (cond1)
  (bla))
 (if (cond2)
  (foo))
 (if (cond3)
  (baz))

is no longer supported (I might introduce that again as followup
with explicitly denoting this as 'switch/case' list).

This means touching quite a few patterns in match.pd.  Most cases
simply didn't continue parsing but the following one for example
has different meaning before and after:

 (if (cond1)
  (if (cond2)
   (expr1))
  (expr2))

now expr2 is guarded by !cond1 while formerly it was guarded by cond1.

Anticipating 'switch/case' support I have not re-indented the cases
of large (if ..) (else if ...) (else if ...) forms.

As this now keeps the if/with expression tree doing further
simplifications to make the generated code smaller should now
be easier.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2015-07-13  Richard Biener  

* genmatch.c (struct operand): Add OP_IF and OP_WITH op_types.
(struct if_expr): New.
(struct with_expr): Likewise.
(is_a_helper): Add helpers for if_expr and with_expr.
(struct simplify): Add simplify_kind enum and member.  Remove
ifexpr_vec member.
(simplify::simplify): Adjust.
(lower_commutative): Adjust.
(lower_opt_convert): Likewise.
(lower_cond): Likewise.
(replace_id): Handle with_expr and if_expr.
(lower_for): Adjust.
(dt_simplify::gen_1): New recursive worker, split out from ...
(dt_simplify::gen): ... here.  Deal with if and with expansion
recursively.
(capture_info::capture_info): Take context argument
(capture_info::walk_result): Only analyze specific result.
(parser::parse_result): New function.
(parser::parse_simplify): Adjust to parse ifs with then and
else case.
(parser::parse_if): Simplify.
(parser::parse_pattern): Pass down simplify kind.

Re: [PATCH] [gomp] Recycle non-nested team if possible

2015-07-13 Thread Jakub Jelinek
On Mon, Jul 13, 2015 at 01:15:44PM +0200, Sebastian Huber wrote:
> diff --git a/libgomp/team.c b/libgomp/team.c
> index b98b233..0bcbaf8 100644
> --- a/libgomp/team.c
> +++ b/libgomp/team.c
> @@ -134,6 +134,25 @@ gomp_thread_start (void *xdata)
>return NULL;
>  }
>  
> +static struct gomp_team *
> +get_recycable_team (unsigned nthreads)

That would be recyclable.
But I think get_last_team would be better.
Also, please make it static inline.

> +  team = gomp_malloc (sizeof (*team) + nthreads * extra);
> +
> +#ifndef HAVE_SYNC_BUILTINS
> +  gomp_mutex_init (&team->work_share_list_free_lock);
> +#endif

Avoiding gomp_mutex_destroy/gomp_mutex_init is fine,
but I must say I'm far less sure about gomp_sem_init (can you
add there a temporary assert that it has the expected value)
and even less about gomp_barrier_init (I think e.g. on Linux
generation will be very unlikely 0 that it should be, and not
sure about awaited_final value either).

Jakub


Re: [RFC] two-phase marking in gt_cleare_cache

2015-07-13 Thread Tom de Vries

On 13/07/15 15:43, Michael Matz wrote:

Hi,

On Sun, 12 Jul 2015, Tom de Vries wrote:


I'm trying to get to a defined policy for what is allowed for caches.
Either forbidding or allowing multi-step dependencies, I don't really
mind.


I think forbidding is the way to go, because ...


I managed to write a patch series that implements the forbidding of
multi-step dependencies. I'll post this soon.


https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00970.html


... aha, it finds bugs!  So you actually had to change some hashes to be
non-caching for this to work, and it's all some decl-to-debug-something
maps (well, of course, otherwise you wouldn't have run into the bug you're
trying to fix in the first place).  I think this hints at actual bugs that
need fixing in the gomp branch:

As you analyzed in PR 66714, eventually a decl A is replaced by decl B,
but its debug-expr is simply copied, and that one itself refers to decls
(let's call them D*) that meanwhile are removed.

Now, as the D* decls are not in any other data structure (otherwise they
would have been marked) the typical actions that needed to have been done
for them (like e.g. associating debug info with them, allocating them to
some stack or register place) i.e. anything that needed to be done for
normal decls won't have been done.  So the debug info generator in this
case, when it sees those D* decls can't do its work, e.g. debug info
generated for D* won't refer to the real place containing the value,
because also the generated code itself doesn't refer to D* anymore.

This also hints at other problems (which might not actually occur in the
case at hand, but still): the contents of DECL_VALUE_EXPR is the "real"
thing containing the value of a decl (i.e. a decl having a value-expr
doesn't itself occur in the code anymore), be it a decl itself, or some
expression (which might also refer to decls).  Now, in PR 66714 you
analyzed that one of those D* was removed from the function, which should
have happened only because no code referred to it anymore, i.e. D* was also
rewritten to some other D'* (if it weren't rewritten and D* was referred
to in code, you would have created a miscompilation).  At that point also
the DECL_VALUE_EXPRs need to be rewritten to refer to D'*, not to D*
anymore.



Thanks for looking into the PR. I suspected that these things were 
wrong, but I have no knowledge of this part of the compiler, so I was 
not sure.



Implementing multi-step maps or making the hashmaps non-caching doesn't
solve any of the above problems


I'm not saying that making those hashmaps non-caching solves any of 
these problems.


I'm saying that it decouples fixing the policy (for which I have a 
patch) from fixing the issues that allow us to use these 3 as caches 
again (for which there are no patches yet). The advantage of having a 
policy in place is that we won't regress for tables still marked as 
cache (or new tables marked as cache). So blocking committing the policy 
on those issues makes no sense IMHO.


Thanks,
- Tom


, it merely forces some DECLs in the
compiler to remain live but that actually have no meaning in their
context.

So, I think this makes it pretty clear that those hashmaps should remain
caching maps, and that multi-step deps in caches should be disallowed, and
that the underlying problem should rather be fixed (and the checking code
against multi-step-deps should be added to the compiler).


Ciao,
Michael.





Re: [PATCH][RTL-ifcvt] Make non-conditional execution if-conversion more aggressive

2015-07-13 Thread Kyrill Tkachov

Hi Bernhard,

On 13/07/15 10:45, Kyrill Tkachov wrote:

PS: no -mbranch-cost and, a tad more seriously, no --param branch-cost either ;)
PPS: attached meant to illustrate comments above. Untested.

Thanks a lot! This is all very helpful.
I'll respin the patch.


Here it is. I've expanded the comments in the functions you mentioned,
moved the tests to gcc.dg and enabled them for aarch64 and x86 and changed
the types of the costs used to unsigned int.


Bootstrapped on aarch64 and x86_64.
The go testsuite passes on x86_64-unknown-linux-gnu for me...

Thanks,
Kyrill

2015-07-13  Kyrylo Tkachov  

* ifcvt.c (struct noce_if_info): Add then_simple, else_simple,
then_cost, else_cost fields.  Change branch_cost field to unsigned int.
(end_ifcvt_sequence): Call set_used_flags on each insn in the
sequence.
(noce_simple_bbs): New function.
(noce_try_move): Bail if basic blocks are not simple.
(noce_try_store_flag): Likewise.
(noce_try_store_flag_constants): Likewise.
(noce_try_addcc): Likewise.
(noce_try_store_flag_mask): Likewise.
(noce_try_cmove): Likewise.
(noce_try_minmax): Likewise.
(noce_try_abs): Likewise.
(noce_try_sign_mask): Likewise.
(noce_try_bitop): Likewise.
(bbs_ok_for_cmove_arith): New function.
(noce_emit_all_but_last): Likewise.
(noce_emit_insn): Likewise.
(noce_emit_bb): Likewise.
(noce_try_cmove_arith): Handle non-simple basic blocks.
(insn_valid_noce_process_p): New function.
(bb_valid_for_noce_process_p): Likewise.
(noce_process_if_block): Allow non-simple basic blocks
where appropriate.


2015-07-13  Kyrylo Tkachov  

* gcc.dg/ifcvt-1.c: New test.
* gcc.dg/ifcvt-2.c: Likewise.
* gcc.dg/ifcvt-3.c: Likewise.






Thanks,
Kyrill



cheers,


commit bc62987a2fa3d9dc3de5a1ed8003a745340255bd
Author: Kyrylo Tkachov 
Date:   Wed Jul 8 15:45:04 2015 +0100

[PATCH][ifcvt] Make non-conditional execution if-conversion more aggressive

diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 31849ee..2f0a228 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -815,8 +815,17 @@ struct noce_if_info
  form as well.  */
   bool then_else_reversed;
 
+  /* True if the contents of then_bb and else_bb are a
+ simple single set instruction.  */
+  bool then_simple;
+  bool else_simple;
+
+  /* The total rtx cost of the instructions in then_bb and else_bb.  */
+  unsigned int then_cost;
+  unsigned int else_cost;
+
   /* Estimated cost of the particular branch instruction.  */
-  int branch_cost;
+  unsigned int branch_cost;
 };
 
 static rtx noce_emit_store_flag (struct noce_if_info *, rtx, int, int);
@@ -1036,6 +1045,10 @@ end_ifcvt_sequence (struct noce_if_info *if_info)
   set_used_flags (if_info->cond);
   set_used_flags (if_info->a);
   set_used_flags (if_info->b);
+
+  for (insn = seq; insn; insn = NEXT_INSN (insn))
+set_used_flags (insn);
+
   unshare_all_rtl_in_chain (seq);
   end_sequence ();
 
@@ -1053,6 +1066,21 @@ end_ifcvt_sequence (struct noce_if_info *if_info)
   return seq;
 }
 
+/* Return true iff the then and else basic block (if it exists)
+   consist of a single simple set instruction.  */
+
+static bool
+noce_simple_bbs (struct noce_if_info *if_info)
+{
+  if (!if_info->then_simple)
+return false;
+
+  if (if_info->else_bb)
+return if_info->else_simple;
+
+  return true;
+}
+
 /* Convert "if (a != b) x = a; else x = b" into "x = a" and
"if (a == b) x = a; else x = b" into "x = b".  */
 
@@ -1067,6 +1095,9 @@ noce_try_move (struct noce_if_info *if_info)
   if (code != NE && code != EQ)
 return FALSE;
 
+  if (!noce_simple_bbs (if_info))
+return FALSE;
+
   /* This optimization isn't valid if either A or B could be a NaN
  or a signed zero.  */
   if (HONOR_NANS (if_info->x)
@@ -1115,6 +1146,9 @@ noce_try_store_flag (struct noce_if_info *if_info)
   rtx target;
   rtx_insn *seq;
 
+  if (!noce_simple_bbs (if_info))
+return FALSE;
+
   if (CONST_INT_P (if_info->b)
   && INTVAL (if_info->b) == STORE_FLAG_VALUE
   && if_info->a == const0_rtx)
@@ -1163,6 +1197,9 @@ noce_try_store_flag_constants (struct noce_if_info *if_info)
   int normalize, can_reverse;
   machine_mode mode;
 
+  if (!noce_simple_bbs (if_info))
+return FALSE;
+
   if (CONST_INT_P (if_info->a)
   && CONST_INT_P (if_info->b))
 {
@@ -1291,6 +1328,9 @@ noce_try_addcc (struct noce_if_info *if_info)
   rtx_insn *seq;
   int subtract, normalize;
 
+  if (!noce_simple_bbs (if_info))
+return FALSE;
+
   if (GET_CODE (if_info->a) == PLUS
   && rtx_equal_p (XEXP (if_info->a, 0), if_info->b)
   && (reversed_comparison_code (if_info->cond, if_info->jump)
@@ -1382,6 +1422,9 @@ noce_try_store_flag_mask (struct noce_if_info *if_info)
   rtx_insn *seq;
   int reversep;
 
+  if (!noce_simple_bbs (if_info))
+return FALSE;
+
   reversep = 0;
   if ((if_info->branch_cost >= 2
 

Re: [gomp4.1 WIP] omp_target_* libgomp APIs

2015-07-13 Thread Jakub Jelinek
On Mon, Jul 13, 2015 at 04:38:33PM +0300, Ilya Verbin wrote:
> On Mon, Jul 13, 2015 at 15:17:29 +0200, Jakub Jelinek wrote:
> > Here is a new version that I've committed.  I've finished up
> > associate/disassociate, wrote a test and tested also with intelmicemul
> > offloading.
> 
> Great!
> 
> > +  k->refcount = INT_MAX;
> 
> Shouldn't it be UINTPTR_MAX?

Dunno if we can count on it being in stdint.h on all targets.
Perhaps
#define REFCOUNT_INFINITY (~(uintptr_t) 0)
?

> > +  /* FIXME: Support device-to-device somehow?  */
> 
> Should libgomp copy data device-host-device if device-device is not supported 
> by
> target?  Current liboffloadmic doesn't support this.  I'll find out if there 
> are
> any plans.

There is also the option to spawn an offloaded function that will just call
memcpy, or have such a function next to the main () of the program that we link
in.
Also, could you see if the 2 and 3 dimension memcpy_rect couldn't be handled
more efficiently by liboffloadmic too?
From what I can see, on the cuda side there is some cudaMemcpy2D and
cudaMemcpy3D, though I admit I haven't studied in detail what exactly they
do.

Jakub


Re: GOMP_offload_register

2015-07-13 Thread Nathan Sidwell

On 07/13/15 09:49, Ilya Verbin wrote:

On Mon, Jul 13, 2015 at 09:42:50 -0400, Nathan Sidwell wrote:

GOMP_offload_register's target data argument is 'void *'.  Is there
any reason it shouldn't be 'const void *'?  It would seem to me that
that would be better?

(a cursory look at i386/intelmic-mkoffload.c suggests a lack of
consts in the variable decls there.  ptx suffers the same problem)


I can't remember any reason, so I agree that const is better (if this works :)


Ok, I'll work in that direction.

(my thought was that for targets where there is mutable data in there, they 
should insert the appropriate const-removing casts)


nathan



[GOMP] a struct for offload target data

2015-07-13 Thread Nathan Sidwell

Bernd,
this patch changes the offload target data type from an array of void *, to a 
struct, which is somewhat easier to deal with than remembering numeric indices 
and type casts.


This is step  1 in reworking the launch API.

ok?

nathan
2015-07-13  Nathan Sidwell  

	gcc/
	* config/nvptx/mkoffload.c (process): Constify mapping variables.
	Define target data struct and initialize it.

	libgomp/
	* plugin/plugin-nvptx.c (link_ptx): Constify string argument.
	Workaround driver library const error.
	(struct nvptx_tdata, nvptx_tdata_t): New.
	(GOMP_OFFLOAD_load_image): Use struct for target_data's real
	type.

Index: gcc/config/nvptx/mkoffload.c
===
--- gcc/config/nvptx/mkoffload.c	(revision 225703)
+++ gcc/config/nvptx/mkoffload.c	(working copy)
@@ -267,22 +267,30 @@ process (FILE *in, FILE *out)
 }
   fprintf (out, "\";\n\n");
 
-  unsigned int nvars = 0, nfuncs = 0;
-
-  fprintf (out, "static const char *var_mappings[] = {\n");
-  for (id_map *id = var_ids; id; id = id->next, nvars++)
+  fprintf (out, "static const char *const var_mappings[] = {\n");
+  for (id_map *id = var_ids; id; id = id->next)
 fprintf (out, "\t\"%s\"%s\n", id->ptx_name, id->next ? "," : "");
   fprintf (out, "};\n\n");
-  fprintf (out, "static const char *func_mappings[] = {\n");
-  for (id_map *id = func_ids; id; id = id->next, nfuncs++)
+  fprintf (out, "static const char *const func_mappings[] = {\n");
+  for (id_map *id = func_ids; id; id = id->next)
 fprintf (out, "\t\"%s\"%s\n", id->ptx_name, id->next ? "," : "");
   fprintf (out, "};\n\n");
 
-  fprintf (out, "static const void *target_data[] = {\n");
-  fprintf (out, "  ptx_code, (void *)(__UINTPTR_TYPE__)sizeof (ptx_code),\n");
-  fprintf (out, "  (void *) %u, var_mappings, (void *) %u, func_mappings\n",
-	   nvars, nfuncs);
-  fprintf (out, "};\n\n");
+  fprintf (out,
+	   "static struct nvptx_tdata {\n"
+	   "  const char *ptx_src;\n"
+	   "  __SIZE_TYPE__ ptx_len;\n"
+	   "  const char *const *var_names;\n"
+	   "  __SIZE_TYPE__ var_num;\n"
+	   "  const char *const *fn_names;\n"
+	   "  __SIZE_TYPE__ fn_num;\n"
+	   "} target_data = {\n"
+	   "  ptx_code, sizeof (ptx_code),\n"
+	   "  var_mappings,"
+	   "  sizeof (var_mappings) / sizeof (var_mappings[0]),\n"
+	   "  func_mappings,"
+	   "  sizeof (func_mappings) / sizeof (func_mappings[0])\n"
+	   "};\n\n");
 
   fprintf (out, "#ifdef __cplusplus\n");
   fprintf (out, "extern \"C\" {\n");
Index: libgomp/plugin/plugin-nvptx.c
===
--- libgomp/plugin/plugin-nvptx.c	(revision 225703)
+++ libgomp/plugin/plugin-nvptx.c	(working copy)
@@ -798,7 +798,7 @@ nvptx_get_num_devices (void)
 
 
 static void
-link_ptx (CUmodule *module, char *ptx_code, size_t length)
+link_ptx (CUmodule *module, char const *ptx_code, size_t length)
 {
   CUjit_option opts[7];
   void *optvals[7];
@@ -843,7 +843,9 @@ link_ptx (CUmodule *module, char *ptx_co
   while (off < length)
 {
   int l = strlen (ptx_code + off);
-  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code + off, l + 1,
+  /* cuLinkAddData's 'data' argument erroneously omits the const
+	 qualifier.  */
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_code + off, l + 1,
 			 0, 0, 0, 0);
   if (r != CUDA_SUCCESS)
 	{
@@ -1580,23 +1582,35 @@ GOMP_OFFLOAD_fini_device (int n)
   pthread_mutex_unlock (&ptx_dev_lock);
 }
 
+typedef struct nvptx_tdata
+{
+  const char *ptx_src;
+  size_t ptx_len;
+
+  const char *const *var_names;
+  size_t var_num;
+
+  const char *const *fn_names;
+  size_t fn_num;
+} nvptx_tdata_t;
+
 int
 GOMP_OFFLOAD_load_image (int ord, void *target_data,
 			 struct addr_pair **target_table)
 {
   CUmodule module;
-  char **fn_names, **var_names;
+  const char *const *fn_names, *const *var_names;
   unsigned int fn_entries, var_entries, i, j;
   CUresult r;
   struct targ_fn_descriptor *targ_fns;
-  void **img_header = (void **) target_data;
+  nvptx_tdata_t const *img_header = (nvptx_tdata_t const *) target_data;
   struct ptx_image_data *new_image;
 
   GOMP_OFFLOAD_init_device (ord);
 
   nvptx_attach_host_thread_to_device (ord);
 
-  link_ptx (&module, img_header[0], (size_t) img_header[1]);
+  link_ptx (&module, img_header->ptx_src, img_header->ptx_len);
 
   pthread_mutex_lock (&ptx_image_lock);
   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
@@ -1606,23 +1620,15 @@ GOMP_OFFLOAD_load_image (int ord, void *
   ptx_images = new_image;
   pthread_mutex_unlock (&ptx_image_lock);
 
-  /* The mkoffload utility emits a table of pointers/integers at the start of
- each offload image:
-
- img_header[0] -> ptx code
- img_header[1] -> size of ptx code
- img_header[2] -> number of variables
- img_header[3] -> array of variable names (pointers to strings)
- img_header[4] -> number of kernels
- img_header[5] -> array of kernel names (pointers to st

Re: [gomp4.1] depend(sink) and depend(source) parsing for C

2015-07-13 Thread Jakub Jelinek
On Sat, Jul 11, 2015 at 11:35:36AM -0700, Aldy Hernandez wrote:
> It looks like the C++ bits are quite similar to the C ones.  AFAICT, only
> numbers are allowed for the sink offsets, so no C++ iterators, which would
> likely complicate matters.  If they are eventually allowed, we can implement
> them as a follow up.
> 
> The attached patch addresses all your concerns plus includes the C++
> implementation.  The included test passes for both languages.
> 
> I can work on Fortran next if you'd like.

Please leave Fortran unresolved for now, we'll see in Autumn if we have time
for Fortran OpenMP 4.1 support, or not, there is also the possibility to
handle it like in 4.9 - 4.9.0 came with just C/C++ OpenMP 4.0 support
(and Fortran only OpenMP 3.1 support) and 4.9.1 added Fortran OpenMP 4.0 
support.

Please write ChangeLog entries and commit them into */ChangeLog.gomp files.

> +   if (c_parser_next_token_is_not (parser, CPP_NUMBER))
> + {
> +   c_parser_error (parser, "expected %<integer%>");

I think %< and %> here

> +   return list;
> + }
> +
> +   addend = c_parser_peek_token (parser)->value;
> +   if (TREE_CODE (addend) != INTEGER_CST)
> + {
> +   c_parser_error (parser, "expected %<integer%>");

and here aren't appropriate here, you don't expect integer as a keyword,
but some integer...

On the C++ FE side, please also try a testcase in g++.dg/gomp/ where
the ordered(n) loop with #pragma omp ordered depend({source,sink}) will be
in a template, to make sure pt.c does the right thing with it.

> +   if (cp_lexer_next_token_is_not (parser->lexer, CPP_NUMBER))
> + {
> +   cp_parser_error (parser, "expected %<integer%>");
> +   return list;
> + }
> +
> +   addend = cp_lexer_peek_token (parser->lexer)->u.value;
> +   if (TREE_CODE (addend) != INTEGER_CST)
> + {
> +   cp_parser_error (parser, "expected %<integer%>");

See above.

> @@ -365,6 +367,8 @@ new_omp_context (enum omp_region_type region_type)
>  
>c = XCNEW (struct gimplify_omp_ctx);
>c->outer_context = gimplify_omp_ctxp;
> +  c->iter_vars.safe_push(0);
> +  c->iter_vars.pop();

As mentioned, please leave this out.

> @@ -8982,7 +8997,36 @@ gimplify_expr (tree *expr_p, gimple_seq *pre_p, 
> gimple_seq *post_p,
>   }
>   break;
> case OMP_ORDERED:
> - g = gimple_build_omp_ordered (body);
> + if (gimplify_omp_ctxp)
> +   for (tree c = OMP_ORDERED_CLAUSES (*expr_p);
> +c; c = OMP_CLAUSE_CHAIN (c))
> + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_DEPEND
> + && OMP_CLAUSE_DEPEND_KIND (c) == OMP_CLAUSE_DEPEND_SINK)
> +   {
> + unsigned int n = 0;
> + bool fail = false;
> + for (tree decls = OMP_CLAUSE_DECL (c);
> +  decls && TREE_CODE (decls) == TREE_LIST;
> +  decls = TREE_CHAIN (decls), ++n)
> +   if (n < gimplify_omp_ctxp->iter_vars.length ()
> +   && TREE_VALUE (decls)
> +   != gimplify_omp_ctxp->iter_vars[n])
> + {
> +   error_at (OMP_CLAUSE_LOCATION (c),
> + "variable %qE is not an iteration "
> + "variable", TREE_VALUE (decls));

I think this error message will be confusing to users, if they write
#pragma omp for ordered(3)
for (int i = 0; i < 10; i++)
for (int j = 0; j < 10; j++)
for (int k = 0; k < 10; k++)
{
#pragma omp ordered depend(sink:k-1, j+2, i-3)
#pragma omp ordered depend(source)
}
because then it will complain that k and i are not iteration
variables, when they in fact are, just in wrong order.

I believe our diagnostics don't have support for ngettext style
of diagnostic messages (1st vs. 2nd, 3rd, 4th ...); I wonder if
saying "variable %qE is not an iteration variable of outermost loop %d, expected 
%qE",
TREE_VALUE (decls), n + 1, gimplify_omp_ctxp->iter_vars[n]
wouldn't be better or something similar.

> +   fail = true;
> + }
> + /* Avoid being too redundant.  */
> + if (!fail
> + && n != gimplify_omp_ctxp->iter_vars.length ())
> +   error_at (OMP_CLAUSE_LOCATION (c),
> +  "number of variables in depend(sink) clause "
> +  "does not match number of iteration variables");
> +   }
> +
> + g = gimple_build_omp_ordered (body,
> +   OMP_ORDERED_CLAUSES (*expr_p));
>   break;
> case OMP_CRITICAL:
>   gimplify_scan_omp_clauses (&OMP_CRITICAL_CLAUSES (*expr_p),
> diff --git a/gcc/omp-low.c b/gcc/omp-low.c
> index 83677ea..3dec095 100644
> --- a/gcc/

Re: GOMP_offload_register

2015-07-13 Thread Ilya Verbin
On Mon, Jul 13, 2015 at 09:42:50 -0400, Nathan Sidwell wrote:
> GOMP_offload_register's target data argument is 'void *'.  Is there
> any reason it shouldn't be 'const void *'?  It would seem to me that
> that would be better?
> 
> (a cursory look at i386/intelmic-mkoffload.c suggests a lack of
> consts in the variable decls there.  ptx suffers the same problem)

I can't remember any reason, so I agree that const is better (if this works :)

  -- Ilya


Re: [PATCH] PR/66760, ipa-inline-analysis.c compile-time hog

2015-07-13 Thread Richard Biener
On Mon, Jul 13, 2015 at 3:46 PM, Paolo Bonzini  wrote:
>
>
> On 13/07/2015 15:45, Richard Biener wrote:
>> It would be nice to have a patch that can be backported to the GCC 5 branch
>> as well.  We can improve this on trunk as followup, no?
>
> The patch I've already posted can be backported. O:-)

So unless Martin objects consider the patch approved for trunk and for
backporting
after 5.2 is released and trunk shows no issues.

Martin - can you take care of committing if you are fine with it?

Thanks,
Richard.

> Paolo


Re: [PATCH] PR/66760, ipa-inline-analysis.c compile-time hog

2015-07-13 Thread Paolo Bonzini


On 13/07/2015 15:45, Richard Biener wrote:
> It would be nice to have a patch that can be backported to the GCC 5 branch
> as well.  We can improve this on trunk as followup, no?

The patch I've already posted can be backported. O:-)

Paolo


Re: [PATCH] PR/66760, ipa-inline-analysis.c compile-time hog

2015-07-13 Thread Richard Biener
On Mon, Jul 13, 2015 at 3:30 PM, Martin Jambor  wrote:
> On Mon, Jul 13, 2015 at 02:47:23PM +0200, Paolo Bonzini wrote:
>>
>>
>> On 13/07/2015 14:34, Martin Jambor wrote:
>> > You might want to use Martin's shiny new
>> > function_summary class in symbol-summary.c.  That is a mechanism
>> > specifically designed to append to a cgraph_node information specific
>> > to an optimization pass (or two, as ipa-cp and ipa-inline already both
>> > use a few of them).  Unfortunately, the class is not very well
>> > documented but you should be able to figure out how to use it from
>> > other code using them.
>> >
>> > If you then always deallocate everything there at the end of
>> > ipa-inline analysis, you'll get exactly the right life-time for the
>> > data.
>>
>> Good.  I might as well merge func_body_info and ipa_node_params then, so
>> I already have ipa_node_params_sum.  WDYT?
>
> Well, perhaps but I am not so sure.  I have made a conscious decision
> to make func_body_info a separate structure and not a part of
> ipa_node_params exactly because of their different life-times.
>
> func_body_info is something only used during intra-procedural stage of
> IPA analysis and should be thrown away as soon as it is over.
>
> ipa_node_params is a structure which contains results of that
> analysis, which are streamed to disk during LTO and then read back for
> the actual intEr-procedural propagation of information.  Yes, it also
> contains quite a few fields that are used only during the IPA stage
> and so perhaps a few more bits used only during the intra-stage might
> be OK too.  But Honza recently told me the ipa-structures are
> beginning to show in memory footprint of LTOing Firefox, so allocating
> more unused memory for each and every function in Firefox and all its
> clones is not really such a good idea.
>
> I also tend to think that coding the deallocation is going to be
> easier for you if you just use another summary.  For an
> analysis-stage-only summary, you do not need to implement any of the
> hooks (i.e. insert, remove, duplicate), for example.  Those should
> never happen during intraprocedural phase, or so I believe :-), so
> just put gcc_unreachable into them and that should be it.
>
> I'm sorry for making this so complicated :-)

It would be nice to have a patch that can be backported to the GCC 5 branch
as well.  We can improve this on trunk as followup, no?

Richard.

> Martin


Re: [PATCH] remove some usage of expr_list from read_rtx

2015-07-13 Thread Richard Sandiford
Nice clean-up :-)

tbsaunde+...@tbsaunde.org writes:
> @@ -2248,11 +2249,14 @@ process_define_subst (void)
>  static void
>  rtx_handle_directive (int lineno, const char *rtx_name)
>  {
> -  rtx queue, x;
> +  auto_vec subrtxs;
> +  if (!read_rtx (rtx_name, &subrtxs))
> +return;

Very minor, but many iterators are over powers of 2, so it seems unlikely
that 10 would be a good stack/heap cut-off point.  How about 16 or 32
instead?

OK with that change, thanks.

Richard



Re: [RFC] two-phase marking in gt_cleare_cache

2015-07-13 Thread Michael Matz
Hi,

On Sun, 12 Jul 2015, Tom de Vries wrote:

> > I'm trying to get to a defined policy for what is allowed for caches. 
> > Either forbidding or allowing multi-step dependencies, I don't really 
> > mind.

I think forbidding is the way to go, because ...

> > I managed to write a patch series that implements the forbidding of 
> > multi-step dependencies. I'll post this soon.
> 
> https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00970.html

... aha, it finds bugs!  So you actually had to change some hashes to be 
non-caching for this to work, and it's all some decl-to-debug-something 
maps (well, of course, otherwise you wouldn't have run into the bug you're 
trying to fix in the first place).  I think this hints at actual bugs that 
need fixing in the gomp branch:

As you analyzed in PR 66714, eventually a decl A is replaced by decl B, 
but its debug-expr is simply copied, and that one itself refers to decls 
(let's call them D*) that meanwhile are removed.

Now, as the D* decls are not in any other data structure (otherwise they 
would have been marked) the typical actions that needed to have been done 
for them (like e.g. associating debug info with them, allocating them to 
some stack or register place) i.e. anything that needed to be done for 
normal decls won't have been done.  So the debug info generator in this 
case, when it sees those D* decls can't do its work, e.g. debug info 
generated for D* won't refer to the real place containing the value, 
because also the generated code itself doesn't refer to D* anymore.

This also hints at other problems (which might not actually occur in the 
case at hand, but still): the contents of DECL_VALUE_EXPR is the "real" 
thing containing the value of a decl (i.e. a decl having a value-expr 
doesn't itself occur in the code anymore), be it a decl itself, or some 
expression (which might also refer to decls).  Now, in PR 66714 you 
analyzed that one of those D* was removed from the function, which should 
have happened only because no code referred to it anymore, i.e. D* was also 
rewritten to some other D'* (if it weren't rewritten and D* was referred 
to in code, you would have created a miscompilation).  At that point also 
the DECL_VALUE_EXPRs need to be rewritten to refer to D'*, not to D* 
anymore.

Implementing multi-step maps or making the hashmaps non-caching doesn't 
solve any of the above problems, it merely forces some DECLs in the 
compiler to remain live but that actually have no meaning in their 
context.

So, I think this makes it pretty clear that those hashmaps should remain 
caching maps, and that multi-step deps in caches should be disallowed, and 
that the underlying problem should rather be fixed (and the checking code 
against multi-step-deps should be added to the compiler).


Ciao,
Michael.


GOMP_offload_register

2015-07-13 Thread Nathan Sidwell

Ilya,
GOMP_offload_register's target data argument is 'void *'.  Is there any reason 
it shouldn't be 'const void *'?  It would seem to me that that would be better?


(a cursory look at i386/intelmic-mkoffload.c suggests a lack of consts in the 
variable decls there.  ptx suffers the same problem)


nathan


[C++ Patch] Prefer error + inform to two errors in check_template_shadow

2015-07-13 Thread Paolo Carlini

Hi,

while going thru the pending template template issues, I noticed that 
check_template_shadow still emits two errors. I believe we also want to 
simply use D instead of #D for template parameters, that avoids the nit 
about class T vs template T which could be slightly confusing to 
novices, I suppose.


Tested x86_64-linux.

Thanks,
Paolo.

//
/cp
2015-07-13  Paolo Carlini  

* pt.c (check_template_shadow): Emit error + inform instead of
two errors.

/testsuite
2015-07-13  Paolo Carlini  

* g++.dg/template/crash81.C: Adjust for error + inform change.
* g++.dg/template/pr58878.C: Likewise.
* g++.dg/template/shadow1.C: Likewise.
* g++.dg/template/shadow2.C: Likewise.
* g++.old-deja/g++.benjamin/tem03.C: Likewise.
* g++.old-deja/g++.benjamin/tem04.C: Likewise.
* g++.old-deja/g++.brendan/crash7.C: Likewise.
* g++.old-deja/g++.pt/shadow2.C: Likewise.
Index: cp/pt.c
===
--- cp/pt.c (revision 225730)
+++ cp/pt.c (working copy)
@@ -3661,8 +3661,13 @@ check_template_shadow (tree decl)
   if (DECL_SELF_REFERENCE_P (decl))
 return false;
 
-  error ("declaration of %q+#D", decl);
-  error (" shadows template parm %q+#D", olddecl);
+  if (DECL_TEMPLATE_PARM_P (decl))
+error ("declaration of template parameter %q+D shadows "
+  "template parameter", decl);
+  else
+error ("declaration of %q+#D shadows template parameter", decl);
+  inform (DECL_SOURCE_LOCATION (olddecl),
+ "template parameter %qD declared here", olddecl);
   return false;
 }
 
Index: testsuite/g++.dg/template/crash81.C
===
--- testsuite/g++.dg/template/crash81.C (revision 225730)
+++ testsuite/g++.dg/template/crash81.C (working copy)
@@ -3,6 +3,6 @@
 struct A
 {
   template struct X; // { dg-error "'T' has not been declared" "T" }
-  // { dg-error "declaration of 'template struct A::X'" "A::X" { target 
*-*-* } 5 }
-  // { dg-error "shadows template parm 'int X'" "shadow" { target *-*-* } 5 }
+  // { dg-error "declaration of 'template struct A::X' shadows" "A::X" 
{ target *-*-* } 5 }
+  // { dg-message "template parameter 'X'" "" { target *-*-* } 5 }
 };
Index: testsuite/g++.dg/template/pr58878.C
===
--- testsuite/g++.dg/template/pr58878.C (revision 225730)
+++ testsuite/g++.dg/template/pr58878.C (working copy)
@@ -3,10 +3,10 @@
 // Template-members of non-template class
 struct A
 {
-template // { dg-error "shadows" }
+template // { dg-message "template parameter" }
 void f()
 {
-int t = 1;   // { dg-error "declaration" }
+int t = 1;   // { dg-error "shadows" }
 }
 
 template 
@@ -13,39 +13,39 @@ struct A
 void g();
 };
 
-template // { dg-error "shadows" }
+template // { dg-message "template parameter" }
 void A::g()
 {
-int t = 2;   // { dg-error "declaration" }
+int t = 2;   // { dg-error "shadows" }
 }
 
 // (Non-template) Members of template class
-template // { dg-error "shadows" }
+template // { dg-message "template parameter" }
 struct B
 {
 void f()
 {
-int t = 3;   // { dg-error "declaration" }
+int t = 3;   // { dg-error "shadows" }
 }
 
 void g();
 };
 
-template // { dg-error "shadows" }
+template // { dg-message "template parameter" }
 void B::g()
 {
-int t = 4;   // { dg-error "declaration" }
+int t = 4;   // { dg-error "shadows" }
 }
 
 // Template members of template class
-template // { dg-error "shadows" }
+template // { dg-message "template parameter" }
 struct C
 {
-template // { dg-error "shadows" }
+template // { dg-message "template parameter" }
 void f()
 {
-int t = 5;   // { dg-error "declaration" }
-int s = 6;   // { dg-error "declaration" }
+int t = 5;   // { dg-error "shadows" }
+int s = 6;   // { dg-error "shadows" }
 }
 
 template 
@@ -52,10 +52,10 @@ struct C
 void g();
 };
 
-template // { dg-error "shadows" }
-template // { dg-error "shadows" }
+template // { dg-message "template parameter" }
+template // { dg-message "template parameter" }
 void C::g()
 {
-int t = 7;   // { dg-error "declaration" }
-int s = 8;   // { dg-error "declaration" }
+int t = 7;   // { dg-error "shadows" }
+int s = 8;   // { dg-error "shadows" }
 }
Index: testsuite/g++.dg/template/shadow1.C
===
--- testsuite/g++.dg/template/shadow1.C (revision 225730)
+++ testsuite/g++.dg/template/shadow1.C (wor

Re: [gomp4.1 WIP] omp_target_* libgomp APIs

2015-07-13 Thread Ilya Verbin
On Mon, Jul 13, 2015 at 15:17:29 +0200, Jakub Jelinek wrote:
> Here is a new version that I've committed.  I've finished up
> associate/disassociate, wrote a test and tested also with intelmicemul
> offloading.

Great!

> +  k->refcount = INT_MAX;

Shouldn't it be UINTPTR_MAX?

> +  /* FIXME: Support device-to-device somehow?  */

Should libgomp copy data device-host-device if device-device is not supported by
target?  Current liboffloadmic doesn't support this.  I'll find out if there are
any plans.

  -- Ilya


Re: [PATCH] PR/66760, ipa-inline-analysis.c compile-time hog

2015-07-13 Thread Martin Jambor
On Mon, Jul 13, 2015 at 02:47:23PM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/07/2015 14:34, Martin Jambor wrote:
> > You might want to use Martin's shiny new
> > function_summary class in symbol-summary.c.  That is a mechanism
> > specifically designed to append to a cgraph_node information specific
> > to an optimization pass (or two, as ipa-cp and ipa-inline already both
> > use a few of them).  Unfortunately, the class is not very well
> > documented but you should be able to figure out how to use it from
> > other code using them.
> > 
> > If you then always deallocate everything there at the end of
> > ipa-inline analysis, you'll get exactly the right life-time for the
> > data.
> 
> Good.  I might as well merge func_body_info and ipa_node_params then, so
> I already have ipa_node_params_sum.  WDYT?

Well, perhaps but I am not so sure.  I have made a conscious decision
to make func_body_info a separate structure and not a part of
ipa_node_params exactly because of their different life-times.

func_body_info is something only used during intra-procedural stage of
IPA analysis and should be thrown away as soon as it is over.

ipa_node_params is a structure which contains results of that
analysis, which are streamed to disk during LTO and then read back for
the actual intEr-procedural propagation of information.  Yes, it also
contains quite a few fields that are used only during the IPA stage
and so perhaps a few more bits used only during the intra-stage might
be OK too.  But Honza recently told me the ipa-structures are
beginning to show in memory footprint of LTOing Firefox, so allocating
more unused memory for each and every function in Firefox and all its
clones is not really such a good idea.

I also tend to think that coding the deallocation is going to be
easier for you if you just use another summary.  For an
analysis-stage-only summary, you do not need to implement any of the
hooks (i.e. insert, remove, duplicate), for example.  Those should
never happen during intraprocedural phase, or so I believe :-), so
just put gcc_unreachable into them and that should be it.

I'm sorry for making this so complicated :-)

Martin


Re: [gomp] Move openacc vector& worker single handling to RTL

2015-07-13 Thread Nathan Sidwell

On 07/13/15 07:26, Thomas Schwinge wrote:

Hi!

On Fri, 10 Jul 2015 11:04:14 +0200, I wrote:

On Thu, 09 Jul 2015 20:25:22 -0400, Nathan Sidwell  wrote:

This is the patch I committed.



2. Don't be shy to remove a bunch of XFAILs, in fact all :-) of those
remaining from the test cases that Julian had added in
.

Unfortunately, there's also one regression, but I'm seeing it only on
Nvidia K20 hardware, not on my laptop (but it may well be
hardware-dependent: according to a web search, CUDA error 716 translates
to CUDA_ERROR_MISALIGNED_ADDRESS).  Are you reproducing that one, and/or
do you have an idea where it's coming from?


Are you looking into this, or should somebody else?


I'm not looking at any regressions because I  wasn't aware of any.

nathan


[PING] Re: [PATCH] New configure option to default enable Smart Stack Protection

2015-07-13 Thread Magnus Granberg
söndag 05 juli 2015 23.59.32 skrev  Magnus Granberg:
> Changlogs
> /gcc
> 2015-07-05  Magnus Granberg  
> 
> * common.opt (fstack-protector): Initialize to -1.
> (fstack-protector-all): Likewise.
> (fstack-protector-strong): Likewise.
> (fstack-protector-explicit): Likewise.
> * configure.ac: Add --enable-default-ssp.
> * defaults.h (DEFAULT_FLAG_SSP): New.  Default SSP to strong.
> * opts.c (finish_options): Update opts->x_flag_stack_protect if it
> is -1. * doc/install.texi: Document --enable-default-ssp.
> * config.in: Regenerated.
> * configure: Likewise.
> 
> /testsuite
> 2015-07-05  Magnus Granberg  
> 
> * lib/target-supports.exp
> (check_effective_target_fstack_protector_enabled): New test.
> * gcc.target/i386/ssp-default.c: New test.
Patch updated and tested on x86_64-unknown-linux-gnu (Gentoo)

ChangeLogs
/gcc
2015-07-05  Magnus Granberg  

* common.opt (fstack-protector): Initialize to -1.
(fstack-protector-all): Likewise.
(fstack-protector-strong): Likewise.
(fstack-protector-explicit): Likewise.
* configure.ac: Add --enable-default-ssp.
* defaults.h (DEFAULT_FLAG_SSP): New.  Default SSP to strong.
* opts.c (finish_options): Update opts->x_flag_stack_protect if it is 
-1.
* doc/install.texi: Document --enable-default-ssp.
* config.in: Regenerated.
* configure: Likewise.

/testsuite
2015-07-13  Magnus Granberg  

* lib/target-supports.exp
(check_effective_target_fstack_protector_enabled): New test.
* gcc.target/i386/ssp-default.c: New test.
-- a/gcc/configure.ac	2014-12-05 00:53:24.0 +0100
+++ b/gcc/configure.ac	2015-06-08 23:27:11.744348211 +0200
@@ -5221,6 +5119,25 @@ if test x$gcc_cv_libc_provides_ssp = xye
 	[Define if your target C library provides stack protector support])
 fi
 
+# Check whether --enable-default-ssp was given.
+AC_ARG_ENABLE(default-ssp,
+[AS_HELP_STRING([--enable-default-ssp],
+  [enable Smart Stack Protection as default])],[
+if test x$gcc_cv_libc_provides_ssp = xyes; then
+  case "$target" in
+ia64*-*-*) enable_default_ssp=no ;;
+*) enable_default_ssp=$enableval ;;
+  esac
+else
+  enable_default_ssp=no
+fi],
+enable_default_ssp=no)
+if test x$enable_default_ssp == xyes ; then
+  AC_DEFINE(ENABLE_DEFAULT_SSP, 1,
+  [Define if your target supports default stack protector and it is enabled.])
+fi
+AC_SUBST([enable_default_ssp])
+
 # Test for  on the target.
 GCC_TARGET_TEMPLATE([HAVE_SYS_SDT_H])
 AC_MSG_CHECKING(sys/sdt.h in the target C library)
--- a/gcc/defaults.h	2014-11-01 09:13:09.0 +0100
+++ b/gcc/defaults.h	2015-06-08 22:43:18.764269749 +0200
@@ -1263,6 +1263,18 @@ see the files COPYING3 and COPYING.RUNTI
 #define STACK_SIZE_MODE word_mode
 #endif
 
+/* Default value for flag_stack_protect when flag_stack_protect is initialized to -1:
+   --enable-default-ssp: Default flag_stack_protect to -fstack-protector-strong.
+   --disable-default-ssp: Default flag_stack_protect to 0.
+ */
+#ifdef ENABLE_DEFAULT_SSP
+# ifndef DEFAULT_FLAG_SSP
+#  define DEFAULT_FLAG_SSP 3
+# endif
+#else
+# define DEFAULT_FLAG_SSP 0
+#endif
+
 /* Provide default values for the macros controlling stack checking.  */
 
 /* The default is neither full builtin stack checking...  */
--- a/gcc/common.opt	2014-10-28 11:33:04.0 +0100
+++ b/gcc/common.opt	2015-06-08 22:41:30.114266512 +0200
@@ -2054,15 +2054,15 @@ Common RejectNegative Joined Var(common_
 -fstack-limit-symbol=	Trap if the stack goes past symbol 
 
 fstack-protector
-Common Report Var(flag_stack_protect, 1)
+Common Report Var(flag_stack_protect, 1) Init(-1)
 Use propolice as a stack protection method
 
 fstack-protector-all
-Common Report RejectNegative Var(flag_stack_protect, 2)
+Common Report RejectNegative Var(flag_stack_protect, 2) Init(-1)
 Use a stack protection method for every function
 
 fstack-protector-strong
-Common Report RejectNegative Var(flag_stack_protect, 3)
+Common Report RejectNegative Var(flag_stack_protect, 3) Init(-1)
 Use a smart stack protection method for certain functions
 
 fstack-protector-explicit
-Common Report RejectNegative Var(flag_stack_protect, 4)
+Common Report RejectNegative Var(flag_stack_protect, 4) Init(-1)
 Use stack protection method only for functions with the stack_protect attribute
 
 fstack-usage
--- a/gcc/opts.c	2015-06-10 02:37:39.0 +0200
+++ b/gcc/opts.c	2015-07-03 23:47:50.868752099 +0200
@@ -757,6 +757,11 @@ finish_options (struct gcc_options *opts
   opts->x_flag_opts_finished = true;
 }
 
+  /* We initialize opts->x_flag_stack_protect to -1 so that targets
+ can set a default value.  */
+  if (opts->x_flag_stack_protect == -1)
+opts->x_flag_stack_protect = DEFAULT_FLAG_SSP;
+
   if (opts->x_optimize == 0)
 {
   /* Inlining does not work if not optimizing,
--- a/gcc/doc/install.texi	2015-06-03 18:38:10.0 

Re: Merge trunk r225562 (2015-07-08) into gomp-4_0-branch

2015-07-13 Thread Nathan Sidwell

On 07/12/15 05:39, Thomas Schwinge wrote:

Hi!

On Fri, 10 Jul 2015 18:50:20 -0400, Nathan Sidwell  
wrote:

it looks like the most recent merge from trunk to gomp4 was early May.  I think
it is time for another one -- can you handle that?


Indeed :-) -- and, as it happens, resolving the "merge artifacts" is one
of the things I've been working on last week.  I hope I got that all
right, in particular gcc/tree-parloops.c (Tom), gcc/tree-ssa-loop-ch.c
(Tom), gcc/config/nvptx/nvptx.c (Nathan), and thereabouts.  You may want
to diff the current gomp-4_0-branch files against trunk r225562 (merge
base) as well as against gomp-4_0-branch r225715 (before the merge) to
verify.  Anyway, in the quiet of the weekend now committed to
gomp-4_0-branch in r225719:


thanks!


--
Nathan Sidwell - Director, Sourcery Services - Mentor Embedded


Re: [gomp4.1 WIP] omp_target_* libgomp APIs

2015-07-13 Thread Jakub Jelinek
On Thu, Jul 09, 2015 at 04:06:57PM +0200, Jakub Jelinek wrote:
> The latest spec adds a bunch of new functions, this patch attempts to
> implement them, except I gave up partly in omp_target_associate_ptr
> and completely in omp_target_disassociate_ptr for now.
> 
> As for the plugins, I think we'll want some plugin callback to support
> offloading device <-> offloading device memcpy (at least for the same
> devicep and target_id), and perhaps as optimization also some
> callbacks through which 2 or 3 dimensional omp_target_memcpy_rect
> in between host and device, or device and host, or device to same device
> can be optimized to avoid too many separate operations.
> 
> For the associate/disassociate, I'm waiting for some clarifications (well,
> for omp_target_is_present too) and then supposedly it should wait until
> you are done with your enter/exit data changes.
> 
> Shall I commit this now, or wait until it is clarified etc.?

Here is a new version that I've committed.  I've finished up
associate/disassociate, wrote a test and tested also with intelmicemul
offloading.

2015-07-13  Jakub Jelinek  

* omp.h.in (omp_get_initial_device,
omp_target_alloc, omp_target_free, omp_target_is_present,
omp_target_memcpy, omp_target_memcpy_rect, omp_target_associate_ptr,
omp_target_disassociate_ptr): New prototypes.
* omp_lib.f90.in (omp_get_initial_device): New interface.
* omp_lib.h.in (omp_get_initial_device): New extern.
* libgomp.map (OMP_4.1): Add omp_get_initial_device,
omp_get_initial_device_, omp_target_alloc, omp_target_free,
omp_target_is_present, omp_target_memcpy, omp_target_memcpy_rect,
omp_target_associate_ptr and omp_target_disassociate_ptr symbols.
* env.c (omp_get_initial_device): New function.  Add ialias.
* fortran.c (omp_get_initial_device): Add iredirect.
(omp_get_initial_device_): New function.
* target.c (gomp_map_vars_existing, gomp_map_vars, gomp_unmap_vars):
Handle refcount of INT_MAX as infinite.
(gomp_offload_image_to_device): Set refcount to INT_MAX.
(omp_target_alloc, omp_target_free, omp_target_is_present,
omp_target_memcpy, omp_target_memcpy_rect_worker,
omp_target_memcpy_rect, omp_target_associate_ptr,
omp_target_disassociate_ptr): New functions.
* testsuite/libgomp.c/target-12.c: New test.

--- libgomp/omp.h.in.jj 2015-07-10 14:42:57.968695046 +0200
+++ libgomp/omp.h.in2015-07-10 18:49:17.503845297 +0200
@@ -139,8 +139,25 @@ extern int omp_get_num_teams (void) __GO
 extern int omp_get_team_num (void) __GOMP_NOTHROW;
 
 extern int omp_is_initial_device (void) __GOMP_NOTHROW;
+extern int omp_get_initial_device (void) __GOMP_NOTHROW;
 extern int omp_get_max_task_priority (void) __GOMP_NOTHROW;
 
+extern void *omp_target_alloc (__SIZE_TYPE__, int) __GOMP_NOTHROW;
+extern void omp_target_free (void *, int) __GOMP_NOTHROW;
+extern int omp_target_is_present (void *, __SIZE_TYPE__, int) __GOMP_NOTHROW;
+extern int omp_target_memcpy (void *, void *, __SIZE_TYPE__, __SIZE_TYPE__,
+ __SIZE_TYPE__, int, int) __GOMP_NOTHROW;
+extern int omp_target_memcpy_rect (void *, void *, __SIZE_TYPE__, int,
+  const __SIZE_TYPE__ *,
+  const __SIZE_TYPE__ *,
+  const __SIZE_TYPE__ *,
+  const __SIZE_TYPE__ *,
+  const __SIZE_TYPE__ *, int, int)
+  __GOMP_NOTHROW;
+extern int omp_target_associate_ptr (void *, void *, __SIZE_TYPE__,
+__SIZE_TYPE__, int) __GOMP_NOTHROW;
+extern int omp_target_disassociate_ptr (void *, int) __GOMP_NOTHROW;
+
 #ifdef __cplusplus
 }
 #endif
--- libgomp/target.c.jj 2015-07-10 14:42:58.034694042 +0200
+++ libgomp/target.c2015-07-13 14:48:11.681954879 +0200
@@ -38,6 +38,7 @@
 #endif
 #include 
 #include 
+#include 
 
 #ifdef PLUGIN_SUPPORT
 #include 
@@ -171,7 +172,8 @@ gomp_map_vars_existing (struct gomp_devi
(void *) (oldn->tgt->tgt_start + oldn->tgt_offset),
(void *) newn->host_start,
newn->host_end - newn->host_start);
-  oldn->refcount++;
+  if (oldn->refcount != INT_MAX)
+oldn->refcount++;
 }
 
 static int
@@ -436,7 +438,8 @@ gomp_map_vars (struct gomp_device_descr
  tgt->list[j].key = k;
  tgt->list[j].copy_from = false;
  tgt->list[j].always_copy_from = false;
- k->refcount++;
+ if (k->refcount != INT_MAX)
+   k->refcount++;
  gomp_map_pointer (tgt,
(uintptr_t) *(void **) hostaddrs[j],
k->tgt_offset
@@ -576,7 +579,10 @@ gomp_unmap_vars (struct

Re: [PATCH][RTL-ifcvt] Make non-conditional execution if-conversion more aggressive

2015-07-13 Thread Kyrill Tkachov


On 13/07/15 11:48, Bernhard Reutner-Fischer wrote:

On July 13, 2015 11:45:55 AM GMT+02:00, Kyrill Tkachov  
wrote:

Hi Bernhard,


Did you include go in your testing?
I see:
Unexpected results in this build (new failures)
FAIL: encoding/json
FAIL: go/printer
FAIL: go/scanner
FAIL: html/template
FAIL: log
FAIL: net/http
FAIL: net/http/cgi
FAIL: net/http/cookiejar
FAIL: os
FAIL: text/template

Hmmm. I don't see these failures. I double checked right now and they
appear as PASS in my configuration.

I tested make check-go on x86_64-unknown-linux-gnu configured with
--without-isl --disable-multilib --enable-languages=c,c++,fortran,go.

Are you sure this is not some other issue in your tree?

I have ISL enabled. I do have a couple of local stuff but that tested fine 
before your patch and should not really have impact on the parts your patch 
touches. So maybe it's ISL.


I've rebuilt with ISL from scratch and the tests still pass for me.
I'm testing Ubuntu on a Haswell machine, don't know if that's relevant.

Kyrill



Thanks,





Re: [PATCH] PR/66760, ipa-inline-analysis.c compile-time hog

2015-07-13 Thread Paolo Bonzini


On 13/07/2015 14:34, Martin Jambor wrote:
> You might want to use Martin's shiny new
> function_summary class in symbol-summary.c.  That is a mechanism
> specifically designed to append to a cgraph_node information specific
> to an optimization pass (or two, as ipa-cp and ipa-inline already both
> use a few of them).  Unfortunately, the class is not very well
> documented but you should be able to figure out how to use it from
> other code using them.
> 
> If you then always deallocate everything there at the end of
> ipa-inline analysis, you'll get exactly the right life-time for the
> data.

Good.  I might as well merge func_body_info and ipa_node_params then, so
I already have ipa_node_params_sum.  WDYT?

Paolo


Re: [PATCH] PR/66760, ipa-inline-analysis.c compile-time hog

2015-07-13 Thread Martin Jambor
Hi,

On Mon, Jul 13, 2015 at 02:13:59PM +0200, Paolo Bonzini wrote:
> 
> 
> On 13/07/2015 13:55, Martin Jambor wrote:
> > I can't approve it, but FWIW, I'm generally fine with the patch.
> > Although the original idea was to share one func_body_info in between
> > ipa-cp and ipa-inline analyses, this is certainly better than what we
> > have now and perhaps even good enough generally.
> 
> Ah, so you'd add a pointer to cgraph_node?

No, that does not seem right, given that this data is only needed
during very limited time.  You might want to use Martin's shiny new
function_summary class in symbol-summary.c.  That is a mechanism
specifically designed to append to a cgraph_node information specific
to an optimization pass (or two, as ipa-cp and ipa-inline already both
use a few of them).  Unfortunately, the class is not very well
documented but you should be able to figure out how to use it from
other code using them.

If you then always deallocate everything there at the end of
ipa-inline analysis, you'll get exactly the right life-time for the
data.

> I only have a question
> then---does cgraph_node have a "destructor" where I can free the
> func_body_info?  Or is there no such thing?

No, not really, but a summary would give you that.

> 
> > The only semi-issue I have is the name of func_body_info.  If it is
> > going to be exposed in a header file, perhaps it should get an ipa_
> > prefix.
> 
> That's a patch as large as this one.  I can do the rename later, and
> maybe have it preapproved to convince me to get the new public key in
> place. :)

Yeah, I think that is good enough (but remember I can't approve or
preapprove anything).

> 
> > I also think that its initialization should be put into a
> > common function, but that is something I can do as a followup, if need
> > be.
> 
> Tried doing that now...  it seems better to do it together with adding a
> func_body_info* to cgraph_node*, so that the initialization is done
> lazily in cgraph_node::get_func_body_info.
> 

Well, see above, we're actually trying to  pull information like this
out of cgraph_node.

Martin


Re: [PATCH] PR/66760, ipa-inline-analysis.c compile-time hog

2015-07-13 Thread Paolo Bonzini


On 13/07/2015 13:55, Martin Jambor wrote:
> I can't approve it, but FWIW, I'm generally fine with the patch.
> Although the original idea was to share one func_body_info in between
> ipa-cp and ipa-inline analyses, this is certainly better than what we
> have now and perhaps even good enough generally.

Ah, so you'd add a pointer to cgraph_node?  I only have a question
then---does cgraph_node have a "destructor" where I can free the
func_body_info?  Or is there no such thing?

> The only semi-issue I have is the name of func_body_info.  If it is
> going to be exposed in a header file, perhaps it should get an ipa_
> prefix.

That's a patch as large as this one.  I can do the rename later, and
maybe have it preapproved to convince me to get the new public key in
place. :)

> I also think that its initialization should be put into a
> common function, but that is something I can do as a followup, if need
> be.

Tried doing that now...  it seems better to do it together with adding a
func_body_info* to cgraph_node*, so that the initialization is done
lazily in cgraph_node::get_func_body_info.

Paolo


Re: [gomp4] declare directive [5/5]

2015-07-13 Thread Thomas Schwinge
Hi Jim!

On Mon, 8 Jun 2015 10:06:21 -0500, James Norris  
wrote:
> --- /dev/null
> +++ b/libgomp/testsuite/libgomp.oacc-c++/declare-1.C
> @@ -0,0 +1,24 @@
> +
> +template
> +T foo()
> +{
> +  T a;
> +  #pragma acc declare create (a)
> +
> +  #pragma acc parallel
> +  {
> +a = 5;
> +  }
> +
> +  return a;
> +}
> +
> +int
> +main (void)
> +{
> +  int rc;
> +
> +  rc = foo();
> +
> +  return rc;
> +}

I wonder, in a shared-memory setting (say, host-fallback because of the
OpenACC if clause, or acc_device_host, or acc_device_host_nonshm),
shouldn't the original and "declare"d objects of variable a be the same
(just like with the other data clauses), and thus the function foo return
the value 5 instead of 0?


Anyway, as-is, this test case FAILed in 32-bit x86 GNU/Linux testing
(acc_device_host, acc_device_host_nonshm), which I fixed in r225734 as
follows:

commit 016e15e94b8511f2041646c43d4344e1ea424e62
Author: tschwinge 
Date:   Mon Jul 13 11:48:33 2015 +

libgomp testsuite: Don't read from uninitialized variables

libgomp/
* testsuite/libgomp.oacc-c++/declare-1.C (foo): Initialize a.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225734 
138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog.gomp | 2 ++
 libgomp/testsuite/libgomp.oacc-c++/declare-1.C | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index fd7887a..7d1e9ad 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,5 +1,7 @@
 2015-07-13  Thomas Schwinge  
 
+   * testsuite/libgomp.oacc-c++/declare-1.C (foo): Initialize a.
+
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c:
Add XFAIL.
 
diff --git libgomp/testsuite/libgomp.oacc-c++/declare-1.C 
libgomp/testsuite/libgomp.oacc-c++/declare-1.C
index 268809b..6618b10 100644
--- libgomp/testsuite/libgomp.oacc-c++/declare-1.C
+++ libgomp/testsuite/libgomp.oacc-c++/declare-1.C
@@ -2,7 +2,7 @@
 template
 T foo()
 {
-  T a;
+  T a = 0;
   #pragma acc declare create (a)
 
   #pragma acc parallel


Grüße,
 Thomas


signature.asc
Description: PGP signature


Re: [PATCH] PR/66760, ipa-inline-analysis.c compile-time hog

2015-07-13 Thread Martin Jambor
Hi,

On Sun, Jul 12, 2015 at 11:39:55PM +0200, Paolo Bonzini wrote:
> From: bonz...@gnu.org
> 
> In this PR, a lot of time is spent doing the same ipa_load_from_parm_agg
> query over and over.  Luckily a memoization scheme is already there, it's
> just not used by ipa-inline-analysis.c.  The patch moves the cache struct
> (struct func_body_info) to ipa-prop.h and modifies ipa-inline-analysis.c.
> On some testcases from PR26854 the "alias stmt walking" timevar goes
> off the profile while it used to be 30-70%.
> 
> Bootstrapped (regtest in progress) on x86_64-pc-linux-gnu.
> 
> Please commit the patch for me if approved, as I don't have anymore
> the key I used to use for gcc.gnu.org.  One of these days I'll send
> my new SSH public key to the overseers.

I can't approve it, but FWIW, I'm generally fine with the patch.
Although the original idea was to share one func_body_info in between
ipa-cp and ipa-inline analyses, this is certainly better than what we
have now and perhaps even good enough generally.

The only semi-issue I have is the name of func_body_info.  If it is
going to be exposed in a header file, perhaps it should get an ipa_
prefix.  I also think that its initialization should be put into a
common function, but that is something I can do as a followup, if need
be.

In any event, thanks for working on this,

Martin


Re: [ARM] Optimize compare against smin/umin

2015-07-13 Thread Ramana Radhakrishnan
On Thu, Jun 25, 2015 at 6:08 PM, Michael Collison
 wrote:
>
> This patch is designed to optimize constructs such as:
>
> #define min(x, y) ((x) <= (y)) ? (x) : (y)
>
> unsigned int foo (unsigned int i, unsigned int x, unsigned int y)
> {
>   return  i < (min (x, y));
> }
>
> int  bar (int  i,int  x,int  y)
> {
>   return  i < (min (x, y));
> }
>
> Patch was tested on arm-linux-gnueabi, arm-linux-gnueabihf,
> armeb-linux-gnueabihf. Okay for trunk?

Sorry about the slow review and I wanted someone else to look at it
given I had a hand in writing this patch up.

Please add a testcase.

>
>
> 2015-06-24  Michael Collison  
> 2012-05-01  Ramana Radhakrishnan 

Please fix the Changelog formatting here.

>
> * gcc/config/arm/arm.md (*arm_smin_cmp): New pattern.
> (*arm_umin_cmp): Likewise.
>
> diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
> index 1ac8af0..994c95f 100644
> --- a/gcc/config/arm/arm.md
> +++ b/gcc/config/arm/arm.md
> @@ -3455,6 +3455,28 @@
> (set_attr "type" "multiple,multiple")]
>  )
>
> +;; t = (s/u)min (x, y)
> +;; cc = cmp (t, z)
> +;; is the same as
> +;; cmp x, z
> +;; cmpge(u) y, z
> +
> +(define_insn_and_split "*arm_smin_cmp"
> +  [(set (reg:CC CC_REGNUM)
> +(compare:CC
> + (smin:SI (match_operand:SI 0 "s_register_operand" "r")
> +  (match_operand:SI 1 "s_register_operand" "r"))
> + (match_operand:SI 2 "s_register_operand" "r")))]
> +  "TARGET_32BIT"
> +  "#"
> +  ""
> +  [(set (reg:CC CC_REGNUM)
> +(compare:CC (match_dup 0) (match_dup 2)))
> +   (cond_exec (ge:CC (reg:CC CC_REGNUM) (const_int 0))
> +  (set (reg:CC CC_REGNUM)
> +   (compare:CC (match_dup 1) (match_dup 2))))]
> +)


IIUC it's not entirely safe to have cond_execs in the instruction
stream prior to reload - I think the consensus was that spilling and
filling with cond-exec style instructions could end up with
non-cond-exec style spills thus destroying registers in the non
cond-exec cases.  So let's just add a reload_completed to be safe here.

See https://patches.linaro.org/6469/ for more on this topic.

> +
>  (define_expand "umaxsi3"
>[(parallel [
>  (set (match_operand:SI 0 "s_register_operand" "")
> @@ -3521,6 +3543,22 @@
> (set_attr "type" "store1")]
>  )
>
> +(define_insn_and_split "*arm_umin_cmp"
> +  [(set (reg:CC CC_REGNUM)
> +(compare:CC
> + (umin:SI (match_operand:SI 0 "s_register_operand" "r")
> +  (match_operand:SI 1 "s_register_operand" "r"))
> + (match_operand:SI 2 "s_register_operand" "r")))]
> +  "TARGET_32BIT"
> +  "#"
> +  ""
> +  [(set (reg:CC CC_REGNUM)
> +(compare:CC (match_dup 0) (match_dup 2)))
> +   (cond_exec (geu:CC (reg:CC CC_REGNUM) (const_int 0))
> +  (set (reg:CC CC_REGNUM)
> +   (compare:CC (match_dup 1) (match_dup 2))))]
> +)
> +

Please move this below the other pattern.

>  (define_insn "*store_minmaxsi"
>[(set (match_operand:SI 0 "memory_operand" "=m")
>  (match_operator:SI 3 "minmax_operator"
>
> --
> Michael Collison
> Linaro Toolchain Working Group
> michael.colli...@linaro.org
>


Please repost after testing those changes and then I think this is OK to go in.

regards
Ramana


Re: [gomp] Move openacc vector& worker single handling to RTL

2015-07-13 Thread Thomas Schwinge
Hi!

On Fri, 10 Jul 2015 11:04:14 +0200, I wrote:
> On Thu, 09 Jul 2015 20:25:22 -0400, Nathan Sidwell  wrote:
> > This is the patch I committed.

> 2. Don't be shy to remove a bunch of XFAILs, in fact all :-) of those
> remaining from the test cases that Julian had added in
> .
> 
> Unfortunately, there's also one regression, but I'm seeing it only on
> Nvidia K20 hardware, not on my laptop (but it may well be
> hardware-dependent: according to a web search, CUDA error 716 translates
> to CUDA_ERROR_MISALIGNED_ADDRESS).  Are you reproducing that one, and/or
> do you have an idea where it's coming from?

Are you looking into this, or should somebody else?


Also, this one:

> --- libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c
> @@ -1,5 +1,3 @@
> -/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } 
> } */
> -
>  #include 
>  
>  /* Test of gang-private array variable declared on loop directive, with

... in fact still FAILs for acc_device_nvidia (maybe I've just been lucky
when I first tested your patch/commit?), so that's another thing to look
into; committed in r225733:

commit 79234191653398a5897ca9be0f28af417e1ad212
Author: tschwinge 
Date:   Mon Jul 13 11:23:13 2015 +

libgomp: XFAIL libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c for 
acc_device_nvidia

private-vars-loop-gang-5.exe: 
[...]/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c:29: main: Assertion 
`arr[i] == i + (i % 8) * 2' failed.

libgomp/
* testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c:
Add XFAIL.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225733 
138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog.gomp   | 5 +
 .../testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c   | 3 +++
 2 files changed, 8 insertions(+)

diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index 6ee00be..fd7887a 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,3 +1,8 @@
+2015-07-13  Thomas Schwinge  
+
+   * testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c:
+   Add XFAIL.
+
 2015-07-12  Tom de Vries  
 
* testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c: New test.
diff --git 
libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c
index b070773..a710849 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/private-vars-loop-gang-5.c
@@ -1,3 +1,6 @@
+/* main: Assertion `arr[i] == i + (i % 8) * 2' failed.
+   { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } 
*/
+
 #include 
 
 /* Test of gang-private array variable declared on loop directive, with


Grüße,
 Thomas


signature.asc
Description: PGP signature


[PATCH] [gomp] Recycle non-nested team if possible

2015-07-13 Thread Sebastian Huber
Try to recycle the last non-nested team to avoid the use of malloc() and
free() in the normal case where the number of threads is the same.
Avoid superfluous destruction and initialization of team synchronization
objects.

Using the microbenchmark posted here

https://gcc.gnu.org/ml/gcc-patches/2008-03/msg00930.html

shows an improvement in the parallel bench test case (target
x86_64-unknown-linux-gnu, median out of 9 test runs, iteration count
increased to 20).

Before the patch:

parallel bench 11.2284 seconds

After the patch:

parallel bench 10.5912 seconds

libgomp/ChangeLog
2015-07-13  Sebastian Huber  

* team.c (get_recycable_team): New.
(gomp_new_team): Recycle last non-nested team if possible.
(free_team): Destroy more team synchronization objects.
(gomp_team_end): Move some team synchronization object
destructions to free_team().
---
 libgomp/team.c | 54 +++---
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/libgomp/team.c b/libgomp/team.c
index b98b233..0bcbaf8 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -134,6 +134,25 @@ gomp_thread_start (void *xdata)
   return NULL;
 }
 
+static struct gomp_team *
+get_recycable_team (unsigned nthreads)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  if (thr->ts.team == NULL)
+{
+  struct gomp_thread_pool *pool = thr->thread_pool;
+  if (pool != NULL)
+   {
+ struct gomp_team *last_team = pool->last_team;
+ if (last_team != NULL && last_team->nthreads == nthreads)
+   {
+ pool->last_team = NULL;
+ return last_team;
+   }
+   }
+}
+  return NULL;
+}
 
 /* Create a new team data structure.  */
 
@@ -141,18 +160,28 @@ struct gomp_team *
 gomp_new_team (unsigned nthreads)
 {
   struct gomp_team *team;
-  size_t size;
   int i;
 
-  size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
- + sizeof (team->implicit_task[0]));
-  team = gomp_malloc (size);
+  team = get_recycable_team (nthreads);
+  if (team == NULL)
+{
+  size_t extra = sizeof (team->ordered_release[0])
+ + sizeof (team->implicit_task[0]);
+  team = gomp_malloc (sizeof (*team) + nthreads * extra);
+
+#ifndef HAVE_SYNC_BUILTINS
+  gomp_mutex_init (&team->work_share_list_free_lock);
+#endif
+  gomp_barrier_init (&team->barrier, nthreads);
+  gomp_sem_init (&team->master_release, 0);
+  gomp_mutex_init (&team->task_lock);
+
+  team->nthreads = nthreads;
+}
 
   team->work_share_chunk = 8;
 #ifdef HAVE_SYNC_BUILTINS
   team->single_count = 0;
-#else
-  gomp_mutex_init (&team->work_share_list_free_lock);
 #endif
   team->work_shares_to_free = &team->work_shares[0];
   gomp_init_work_share (&team->work_shares[0], false, nthreads);
@@ -163,14 +192,9 @@ gomp_new_team (unsigned nthreads)
 team->work_shares[i].next_free = &team->work_shares[i + 1];
   team->work_shares[i].next_free = NULL;
 
-  team->nthreads = nthreads;
-  gomp_barrier_init (&team->barrier, nthreads);
-
-  gomp_sem_init (&team->master_release, 0);
   team->ordered_release = (void *) &team->implicit_task[nthreads];
   team->ordered_release[0] = &team->master_release;
 
-  gomp_mutex_init (&team->task_lock);
   team->task_queue = NULL;
   team->task_count = 0;
   team->task_queued_count = 0;
@@ -187,6 +211,10 @@ gomp_new_team (unsigned nthreads)
 static void
 free_team (struct gomp_team *team)
 {
+  gomp_sem_destroy (&team->master_release);
+#ifndef HAVE_SYNC_BUILTINS
+  gomp_mutex_destroy (&team->work_share_list_free_lock);
+#endif
   gomp_barrier_destroy (&team->barrier);
   gomp_mutex_destroy (&team->task_lock);
   free (team);
@@ -894,10 +922,6 @@ gomp_team_end (void)
}
   while (ws != NULL);
 }
-  gomp_sem_destroy (&team->master_release);
-#ifndef HAVE_SYNC_BUILTINS
-  gomp_mutex_destroy (&team->work_share_list_free_lock);
-#endif
 
   if (__builtin_expect (thr->ts.team != NULL, 0)
   || __builtin_expect (team->nthreads == 1, 0))
-- 
1.8.4.5



Re: Merge trunk r225562 (2015-07-08) into gomp-4_0-branch

2015-07-13 Thread Thomas Schwinge
Hi Tom!

On Mon, 13 Jul 2015 12:32:20 +0200, Tom de Vries  wrote:
> On 13/07/15 10:31, Thomas Schwinge wrote:
> > On Mon, 13 Jul 2015 09:20:16 +0200, Tom de Vries  
> > wrote:
> >> >On 12/07/15 11:39, Thomas Schwinge wrote:
> >> >I've looked at the merge commit, gcc/tree-parloops.c was not modified.

> > (Well, it was, but not "substantially".)

> Hmm, the reason why I said tree-parloops.c was not modified, was that 
> the git show of your merge commit (which invokes git diff-tree --cc) 
> does not show any differences for tree-parloops.c:
> ...
> $ git show 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 | grep tree-parloops.c
> $
> ...
> 
> OTOH, if I use --numstat as diff-tree argument, I see:
> ...
> $ git diff-tree --numstat --cc 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 
> | grep tree-parloops.c
> 7 34  gcc/tree-parloops.c
> ...
> 
> I'm not sure if this is expected behaviour.

Yes, I think so, because »--cc [...] compresses the patch output by
omitting uninteresting hunks whose the contents in the parents have only
two variants and the merge result picks one of them without
modification«, and, as I said, for »merge conflicts, I just retained the
code that was present on gomp-4_0-branch already«.

In contrast, see the -c and -m options (which get passed from git show to
git diff-tree):

$ git show -c 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- 
gcc/tree-parloops.c
commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1
Merge: f9d00ca cacef50
Author: tschwinge 
Date:   Sun Jul 12 09:30:39 2015 +

svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk


git-svn-id: 
svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719 
138bc75d-0d04-0410-961f-82ee72b054a4

diff --combined gcc/tree-parloops.c
index 04708c0,846077a..80a215d
--- gcc/tree-parloops.c
+++ gcc/tree-parloops.c
@@@ -22,43 -22,22 +22,22 @@@ along with GCC; see the file COPYING3
  #include "config.h"
  #include "system.h"
  #include "coretypes.h"
- #include "hash-set.h"
- #include "machmode.h"
- #include "vec.h"
- #include "double-int.h"
- #include "input.h"
  #include "alias.h"
- #include "symtab.h"
- #include "options.h"
- #include "wide-int.h"
- #include "inchash.h"
+ #include "backend.h"
  #include "tree.h"
- #include "fold-const.h"
- #include "predict.h"
- #include "tm.h"
+ #include "gimple.h"
  #include "hard-reg-set.h"
- #include "input.h"
- #include "function.h"
- #include "dominance.h"
- #include "cfg.h"
- #include "basic-block.h"
- #include "tree-ssa-alias.h"
+ #include "ssa.h"
+ #include "options.h"
+ #include "fold-const.h"
  #include "internal-fn.h"
- #include "gimple-expr.h"
- #include "is-a.h"
- #include "gimple.h"
  #include "gimplify.h"
  #include "gimple-iterator.h"
  #include "gimplify-me.h"
  #include "gimple-walk.h"
  #include "stor-layout.h"
  #include "tree-nested.h"
- #include "gimple-ssa.h"
  #include "tree-cfg.h"
- #include "tree-phinodes.h"
- #include "ssa-iterators.h"
- #include "stringpool.h"
- #include "tree-ssanames.h"
  #include "tree-ssa-loop-ivopts.h"
  #include "tree-ssa-loop-manip.h"
  #include "tree-ssa-loop-niter.h"
@@@ -75,8 -54,6 +54,6 @@@
  #include "tree-parloops.h"
  #include "omp-low.h"
  #include "tree-nested.h"
- #include "plugin-api.h"
- #include "ipa-ref.h"
  #include "cgraph.h"
  #include "tree-ssa.h"
  
@@@ -218,8 -195,6 +195,8 @@@ struct reduction_inf
   of the reduction variable when existing the 
loop. */
tree initial_value; /* The initial value of the reduction 
var before entering the loop.  */
tree field; /*  the name of the field in the 
parloop data structure intended for reduction.  */
 +  tree reduc_addr;/* The address of the reduction 
variable for
 + openacc reductions.  */
tree init;  /* reduction initialization value.  */
gphi *new_phi;  /* (helper field) Newly created phi node whose 
result
   will be passed to the atomic operation.  
Represents
@@@ -229,10 -204,8 +206,8 @@@
  
  /* Reduction info hashtable helpers.  */
  
- struct reduction_hasher : typed_free_remove 
+ struct reduction_hasher : free_ptr_hash 
  {
-   typedef reduction_info *value_type;
-   typedef reduction_info *compare_type;
static inline hashval_t hash (const reduction_info *);
static inline bool equal (const reduction_info *, const reduction_info 
*);
  };
@@@ -281,10 -254,8 +256,8 @@@ struct name_to_copy_el
  
  /* Name copies hashtable helpers.  */
  
- struct name_to_copy_hasher : typed_free_remove 
+ stru

[PATCH, PR46193] Handle min/max pointer reductions in parloops

2015-07-13 Thread Tom de Vries

Hi,

this patch fixes PR46193.

It handles min and max reductions of pointer type in parloops.

Bootstrapped and reg-tested on x86_64.

OK for trunk?

Thanks,
- Tom
Handle min/max pointer reductions in parloops

2015-07-13  Tom de Vries  

	PR tree-optimization/46193
	* omp-low.c (omp_reduction_init): Handle pointer type for min or max
	clause.

	* gcc.dg/autopar/pr46193.c: New test.

	* testsuite/libgomp.c/pr46193.c: New test.
---
 gcc/omp-low.c  |  4 ++
 gcc/testsuite/gcc.dg/autopar/pr46193.c | 38 +++
 libgomp/testsuite/libgomp.c/pr46193.c  | 67 ++
 3 files changed, 109 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/autopar/pr46193.c
 create mode 100644 libgomp/testsuite/libgomp.c/pr46193.c

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 2e2070a..20d0010 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -3423,6 +3423,8 @@ omp_reduction_init (tree clause, tree type)
 	real_maxval (&min, 1, TYPE_MODE (type));
 	  return build_real (type, min);
 	}
+  else if (POINTER_TYPE_P (type))
+	return lower_bound_in_type (type, type);
   else
 	{
 	  gcc_assert (INTEGRAL_TYPE_P (type));
@@ -3439,6 +3441,8 @@ omp_reduction_init (tree clause, tree type)
 	real_maxval (&max, 0, TYPE_MODE (type));
 	  return build_real (type, max);
 	}
+  else if (POINTER_TYPE_P (type))
+	return upper_bound_in_type (type, type);
   else
 	{
 	  gcc_assert (INTEGRAL_TYPE_P (type));
diff --git a/gcc/testsuite/gcc.dg/autopar/pr46193.c b/gcc/testsuite/gcc.dg/autopar/pr46193.c
new file mode 100644
index 000..544a5da
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/autopar/pr46193.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-parallelize-loops=2 -fdump-tree-parloops-details" } */
+
+extern void abort (void);
+
+char *
+foo (int count, char **list)
+{
+  char *minaddr = list[0];
+  int i;
+
+  for (i = 0; i < count; i++)
+{
+  char *addr = list[i];
+  if (addr < minaddr)
+	minaddr = addr;
+}
+
+  return minaddr;
+}
+
+char *
+foo2 (int count, char **list)
+{
+  char *maxaddr = list[0];
+  int i;
+
+  for (i = 0; i < count; i++)
+{
+  char *addr = list[i];
+  if (addr > maxaddr)
+	maxaddr = addr;
+}
+
+  return maxaddr;
+}
+
+/* { dg-final { scan-tree-dump-times "parallelizing inner loop" 2 "parloops" } } */
diff --git a/libgomp/testsuite/libgomp.c/pr46193.c b/libgomp/testsuite/libgomp.c/pr46193.c
new file mode 100644
index 000..1e27faf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/pr46193.c
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-additional-options "-ftree-parallelize-loops=2" } */
+
+extern void abort (void);
+
+char *
+foo (int count, char **list)
+{
+  char *minaddr = list[0];
+  int i;
+
+  for (i = 0; i < count; i++)
+{
+  char *addr = list[i];
+  if (addr < minaddr)
+	minaddr = addr;
+}
+
+  return minaddr;
+}
+
+char *
+foo2 (int count, char **list)
+{
+  char *maxaddr = list[0];
+  int i;
+
+  for (i = 0; i < count; i++)
+{
+  char *addr = list[i];
+  if (addr > maxaddr)
+	maxaddr = addr;
+}
+
+  return maxaddr;
+}
+
+#define N 5
+
+static void
+init (char **list)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+list[i] = (char *)&list[i];
+}
+
+int
+main (void)
+{
+  char *list[N];
+  char * res;
+
+  init (list);
+
+  res = foo (N, list);
+
+  if (res != (char *)&list[0])
+abort ();
+
+  res = foo2 (N, list);
+
+  if (res != (char *)&list[N-1])
+abort ();
+
+  return 0;
+}
-- 
1.9.1



Re: [patch, driver] Ignore -ftree-parallelize-loops={0,1}

2015-07-13 Thread Tom de Vries

On 07/07/15 09:53, Tom de Vries wrote:

Hi,

currently, we have these spec strings in gcc/gcc.c involving
ftree-parallelize-loops:
...
%{fopenacc|fopenmp|ftree-parallelize-loops=*:%:include(libgomp.spec)%(link_gomp)}

%{fopenacc|fopenmp|ftree-parallelize-loops=*:-pthread}"
...

Actually, ftree-parallelize-loops={0,1} means that no parallelization is
done, but these spec strings still get activated for these values.


Attached patch fixes that, by introducing a spec function gt (short for
greater than), and using it in the spec lines.



Attached (untested) patch manages the same, without introducing the spec 
function 'gt'. But the solution is a bit convoluted, so I prefer the one 
with the gt function.


Thanks,
- Tom

Ignore -ftree-parallelize-loops={0,1}

2015-07-13  Tom de Vries  

	* gcc.c (LINK_COMMAND_SPEC_GOMP_STRING, GOMP_SELF_SPEC_STRING): Define.
	(LINK_COMMAND_SPEC_GOMP_STRING, GOMP_SELF_SPECS): Ignore
	ftree-parallelize-loops={0,1}.
---
 gcc/gcc.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/gcc/gcc.c b/gcc/gcc.c
index 858ff37..c5694c7 100644
--- a/gcc/gcc.c
+++ b/gcc/gcc.c
@@ -860,6 +860,8 @@ proper position among the other output files.  */
 #define CHKP_SPEC ""
 #endif
 
+#define LINK_COMMAND_SPEC_GOMP_STRING "%:include(libgomp.spec)%(link_gomp)"
+
 /* -u* was put back because both BSD and SysV seem to support it.  */
 /* %{static:} simply prevents an error message if the target machine
doesn't handle -static.  */
@@ -881,7 +883,12 @@ proper position among the other output files.  */
 %{s} %{t} %{u*} %{z} %{Z} %{!nostdlib:%{!nostartfiles:%S}} " VTABLE_VERIFICATION_SPEC " \
 %{static:} %{L*} %(mfwrap) %(link_libgcc) " SANITIZER_EARLY_SPEC " %o\
 " CHKP_SPEC " \
-%{fopenacc|fopenmp|ftree-parallelize-loops=*:%:include(libgomp.spec)%(link_gomp)}\
+%{!ftree-parallelize-loops=0:%{!ftree-parallelize-loops=1:\
+  %{fopenacc|fopenmp|ftree-parallelize-loops=*:" \
+	LINK_COMMAND_SPEC_GOMP_STRING "}}}\
+%{ftree-parallelize-loops=0|ftree-parallelize-loops=1: \
+  %{fopenacc|fopenmp:" \
+	LINK_COMMAND_SPEC_GOMP_STRING "}} \
 %{fcilkplus:%:include(libcilkrts.spec)%(link_cilkrts)}\
 %{fgnu-tm:%:include(libitm.spec)%(link_itm)}\
 %(mflib) " STACK_SPLIT_SPEC "\
@@ -1039,11 +1046,18 @@ static const char *const multilib_defaults_raw[] = MULTILIB_DEFAULTS;
 #define DRIVER_SELF_SPECS ""
 #endif
 
+#define GOMP_SELF_SPEC_STRING "-pthread"
+
 /* Linking to libgomp implies pthreads.  This is particularly important
for targets that use different start files and suchlike.  */
 #ifndef GOMP_SELF_SPECS
-#define GOMP_SELF_SPECS "%{fopenacc|fopenmp|ftree-parallelize-loops=*: " \
-  "-pthread}"
+#define GOMP_SELF_SPECS \
+  "%{!ftree-parallelize-loops=0:%{!ftree-parallelize-loops=1:\
+ %{fopenacc|fopenmp|ftree-parallelize-loops=*: " \
+   GOMP_SELF_SPEC_STRING "}}}\
+   %{ftree-parallelize-loops=0|ftree-parallelize-loops=1:\
+ %{fopenacc|fopenmp: " \
+   GOMP_SELF_SPEC_STRING "}}"
 #endif
 
 /* Likewise for -fgnu-tm.  */
-- 
1.9.1



Re: [v3 PATCH] Implement make_array and to_array from the Fundamentals v2 TS draft

2015-07-13 Thread Ville Voutilainen
On 13 July 2015 at 01:25, Ville Voutilainen  wrote:
> On 12 July 2015 at 21:45, Ville Voutilainen  
> wrote:
>> Tested on Linux-PPC64.
>>
>> 2015-07-12  Ville Voutilainen  
>> Implement std::experimental::fundamentals_v2::make_array and
>> std::experimental::fundamentals_v2::to_array.
>> * include/Makefile.am: Add array.
>> * include/Makefile.in: Add array.
>> * include/experimental/array: New.
>> * testsuite/experimental/array/make_array.cc: Likewise.
>> * testsuite/experimental/array/neg.cc: Likewise.
>
> Very minor cleanup in a new patch, use is_void<_D> instead of is_same<_D, 
> void>,
> indent the static assert a bit more clearly.

Oops, the implementation failed to forward() in make_array, new patch attached,
with a test for make_array with a move-only type.
diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am
index 05be8ad..41fc4af 100644
--- a/libstdc++-v3/include/Makefile.am
+++ b/libstdc++-v3/include/Makefile.am
@@ -646,6 +646,7 @@ experimental_builddir = ./experimental
 experimental_headers = \
${experimental_srcdir}/algorithm \
${experimental_srcdir}/any \
+   ${experimental_srcdir}/array \
${experimental_srcdir}/chrono \
${experimental_srcdir}/deque \
${experimental_srcdir}/erase_if.h \
@@ -657,6 +658,7 @@ experimental_headers = \
${experimental_srcdir}/memory \
${experimental_srcdir}/numeric \
${experimental_srcdir}/optional \
+   ${experimental_srcdir}/propagate_const \
${experimental_srcdir}/ratio \
${experimental_srcdir}/set \
${experimental_srcdir}/string \
diff --git a/libstdc++-v3/include/Makefile.in b/libstdc++-v3/include/Makefile.in
index bab83b4..b2a140c 100644
--- a/libstdc++-v3/include/Makefile.in
+++ b/libstdc++-v3/include/Makefile.in
@@ -935,6 +935,7 @@ experimental_builddir = ./experimental
 experimental_headers = \
${experimental_srcdir}/algorithm \
${experimental_srcdir}/any \
+   ${experimental_srcdir}/array \
${experimental_srcdir}/chrono \
${experimental_srcdir}/deque \
${experimental_srcdir}/erase_if.h \
@@ -946,6 +947,7 @@ experimental_headers = \
${experimental_srcdir}/memory \
${experimental_srcdir}/numeric \
${experimental_srcdir}/optional \
+   ${experimental_srcdir}/propagate_const \
${experimental_srcdir}/ratio \
${experimental_srcdir}/set \
${experimental_srcdir}/string \
diff --git a/libstdc++-v3/include/experimental/array 
b/libstdc++-v3/include/experimental/array
new file mode 100644
index 000..b72895c
--- /dev/null
+++ b/libstdc++-v3/include/experimental/array
@@ -0,0 +1,107 @@
+//  -*- C++ -*-
+
+// Copyright (C) 2015 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// Under Section 7 of GPL version 3, you are granted additional
+// permissions described in the GCC Runtime Library Exception, version
+// 3.1, as published by the Free Software Foundation.
+
+// You should have received a copy of the GNU General Public License and
+// a copy of the GCC Runtime Library Exception along with this program;
+// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+/** @file experimental/array
+ *  This is a TS C++ Library header.
+ */
+
+#ifndef _GLIBCXX_EXPERIMENTAL_ARRAY
+#define _GLIBCXX_EXPERIMENTAL_ARRAY 1
+
+#pragma GCC system_header
+
+#if __cplusplus <= 201103L
+# include 
+#else
+
+#include 
+#include 
+#include 
+
+namespace std _GLIBCXX_VISIBILITY(default)
+{
+namespace experimental
+{
+inline namespace fundamentals_v2
+{
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+
+  /**
+   * @defgroup make_array Array creation functions
+   * @ingroup experimental
+   *
+   * Array creation functions as described in N4529,
+   * Working Draft, C++ Extensions for Library Fundamentals, Version 2
+   *
+   * @{
+   */
+
+template 
+struct __is_reference_wrapper : false_type
+{ };
+
+template 
+struct __is_reference_wrapper> : true_type
+{ };
+
+template 
+constexpr auto make_array(_Types&&... __t)
+  -> array,
+ common_type_t<_Types...>,
+ _D>,
+   sizeof...(_Types)>
+{
+  static_assert(__or_<
+  __not_>,
+  __and_<__not_<__is_reference_wrapper>>...>>
+::value,
+"make_array cannot be used without an explicit target type "
+"

Re: [PATCH][RTL-ifcvt] Make non-conditional execution if-conversion more aggressive

2015-07-13 Thread Bernhard Reutner-Fischer
On July 13, 2015 11:45:55 AM GMT+02:00, Kyrill Tkachov  
wrote:
>Hi Bernhard,
>

>> Did you include go in your testing?
>> I see:
>> Unexpected results in this build (new failures)
>> FAIL: encoding/json
>> FAIL: go/printer
>> FAIL: go/scanner
>> FAIL: html/template
>> FAIL: log
>> FAIL: net/http
>> FAIL: net/http/cgi
>> FAIL: net/http/cookiejar
>> FAIL: os
>> FAIL: text/template
>
>Hmmm. I don't see these failures. I double checked right now and they
>appear as PASS in my configuration.
>
>I tested make check-go on x86_64-unknown-linux-gnu configured with
>--without-isl --disable-multilib --enable-languages=c,c++,fortran,go.
>
>Are you sure this is not some other issue in your tree?

I have ISL enabled. I do have a couple of local changes, but those tested fine 
before your patch and should not really have an impact on the parts your patch 
touches. So maybe it's ISL.

Thanks,



Re: Merge trunk r225562 (2015-07-08) into gomp-4_0-branch

2015-07-13 Thread Tom de Vries

On 13/07/15 10:31, Thomas Schwinge wrote:

Hi Tom!

On Mon, 13 Jul 2015 09:20:16 +0200, Tom de Vries  wrote:

>On 12/07/15 11:39, Thomas Schwinge wrote:

> >On Fri, 10 Jul 2015 18:50:20 -0400, Nathan 
Sidwell  wrote:

> >>it looks like the most recent merge from trunk to gomp4 was early May.  I 
think
> >>it is time for another one -- can you handle that?

> >
> >Indeed:-)  -- and, as it happens, resolving the "merge artifacts" is one
> >of the things I've been working on last week.  I hope I got that all
> >right, in particular gcc/tree-parloops.c (Tom),

>
>I've looked at the merge commit, gcc/tree-parloops.c was not modified.

(Well, it was, but not "substantially".)  You'd ported all your trunk
commits to gomp-4_0-branch already (thanks!), and in the functions where
I got merge conflicts, I just retained the code that was present on
gomp-4_0-branch already, which apparently was the right thing to do.;-)




Hmm, the reason why I said tree-parloops.c was not modified, was that 
the git show of your merge commit (which invokes git diff-tree --cc) 
does not show any differences for tree-parloops.c:

...
$ git show 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 | grep tree-parloops.c
$
...

OTOH, if I use --numstat as diff-tree argument, I see:
...
$ git diff-tree --numstat --cc 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 
| grep tree-parloops.c

7   34  gcc/tree-parloops.c
...

I'm not sure if this is expected behaviour.

Thanks,
- Tom



Re: [PATCH][AArch64][testsuite] Adjust some arith+compare tests for potentially more aggressive if-conversion

2015-07-13 Thread Kyrill Tkachov


On 10/07/15 21:21, James Greenhalgh wrote:

On Fri, Jul 10, 2015 at 01:21:05PM +0100, Kyrill Tkachov wrote:

Hi all,

Some of the testcases in aarch64.exp can fail their scan-assembler patterns
if if-conversion becomes more aggressive.

This patch adjusts the testcases in case the branches are eliminated and
further optimisations occur that may remove the scan-assembler patterns.

With this patch the patterns are always generated and the expected execute
values are updated.

Tests still pass on aarch64.
Ok for trunk?

This is OK. Please address my one comment below (looks like you left some
#if 0 kicking around in adds1.c) and fix the ChangeLog to include the
gcc.target/aarch64/adds1.c changes.


Thanks for spotting that, I'm attaching the patch I committed with r225732
and the below ChangeLog entry.

2015-07-13  Kyrylo Tkachov  

* gcc.target/aarch64/adds3.c: Adjust for more aggressive
if-conversion.
* gcc.target/aarch64/adds1.c: Likewise.
* gcc.target/aarch64/ands_1.c: Likewise.
* gcc.target/aarch64/bics_1.c: Likewise.
* gcc.target/aarch64/subs1.c: Likewise.
* gcc.target/aarch64/subs3.c: Likewise.


Thanks,
Kyrill



Thanks,
James




Index: gcc/testsuite/gcc.target/aarch64/ands_1.c
===
--- gcc/testsuite/gcc.target/aarch64/ands_1.c	(revision 225731)
+++ gcc/testsuite/gcc.target/aarch64/ands_1.c	(working copy)
@@ -12,7 +12,7 @@
   if (d == 0)
 return a + c;
   else
-return b + d + c;
+return d;
 }
 
 int
@@ -24,7 +24,7 @@
   if (d == 0)
 return a + c;
   else
-return b + d + c;
+return d;
 }
 
 int
@@ -36,7 +36,7 @@
   if (d == 0)
 return a + c;
   else
-return b + d + c;
+return d;
 }
 
 typedef long long s64;
@@ -50,7 +50,7 @@
   if (d == 0)
 return a + c;
   else
-return b + d + c;
+return d;
 }
 
 s64
@@ -62,7 +62,7 @@
   if (d == 0)
 return a + c;
   else
-return b + d + c;
+return d;
 }
 
 s64
@@ -74,7 +74,7 @@
   if (d == 0)
 return a + c;
   else
-return b + d + c;
+return d;
 }
 
 int
@@ -84,7 +84,7 @@
   s64 y;
 
   x = ands_si_test1 (29, 4, 5);
-  if (x != 13)
+  if (x != (29 & 4))
 abort ();
 
   x = ands_si_test1 (5, 2, 20);
@@ -92,7 +92,7 @@
 abort ();
 
   x = ands_si_test2 (29, 4, 5);
-  if (x != 38)
+  if (x != (29 & 0xff))
 abort ();
 
   x = ands_si_test2 (1024, 2, 20);
@@ -100,7 +100,7 @@
 abort ();
 
   x = ands_si_test3 (35, 4, 5);
-  if (x != 41)
+  if (x != (35 & (4 << 3)))
 abort ();
 
   x = ands_si_test3 (5, 2, 20);
@@ -111,7 +111,7 @@
  0x32004ll,
  0x505050505ll);
 
-  if (y != ((0x13029ll & 0x32004ll) + 0x32004ll + 0x505050505ll))
+  if (y != ((0x13029ll & 0x32004ll)))
 abort ();
 
   y = ands_di_test1 (0x5000500050005ll,
@@ -123,7 +123,7 @@
   y = ands_di_test2 (0x13029ll,
  0x32004ll,
  0x505050505ll);
-  if (y != ((0x13029ll & 0xff) + 0x32004ll + 0x505050505ll))
+  if (y != ((0x13029ll & 0xff)))
 abort ();
 
   y = ands_di_test2 (0x130002900ll,
@@ -135,8 +135,7 @@
   y = ands_di_test3 (0x13029ll,
  0x06408ll,
  0x505050505ll);
-  if (y != ((0x13029ll & (0x06408ll << 3))
-	+ 0x06408ll + 0x505050505ll))
+  if (y != ((0x13029ll & (0x06408ll << 3))))
 abort ();
 
   y = ands_di_test3 (0x130002900ll,
Index: gcc/testsuite/gcc.target/aarch64/subs3.c
===
--- gcc/testsuite/gcc.target/aarch64/subs3.c	(revision 225731)
+++ gcc/testsuite/gcc.target/aarch64/subs3.c	(working copy)
@@ -12,7 +12,7 @@
   if (d == 0)
 return a + c;
   else
-return b + d + c;
+return d;
 }
 
 int
@@ -23,7 +23,7 @@
   if (d == 0)
 return a + c;
   else
-return b + d + c;
+return d;
 }
 
 int main ()
@@ -32,27 +32,27 @@
   s64 y;
 
   x = subs_ext (0x1302ll, 41, 15);
-  if (x != 318767121)
+  if (x != (int)(0x1302ll - 41))
 abort ();
 
-  x = subs_ext (0x50505050ll, 29, 4);
-  if (x != 1347440724)
+  x = subs_ext (0x50505050ll, 0x50505050ll, 4);
+  if (x != (int)(0x50505050ll + 4))
 abort ();
 
   x = subs_ext (0x12121212121ll, 2, 14);
-  if (x != 555819311)
+  if (x != (int)(0x12121212121ll - 2))
 abort ();
 
   x = subs_shift_ext (0x123456789ll, 4, 12);
-  if (x != 591751033)
+  if (x != (int)(0x123456789ll - (4 << 3)))
 abort ();
 
-  x = subs_shift_ext (0x02020202ll, 9, 8);
-  if (x != 33685963)
+  x = subs_shift_ext (0x02020202ll << 3, 0x02020202ll, 8);
+  if (x != (int)(8 + (0x02020202ll << 3)))
 abort ();
 
   x = subs_shift_ext (0x987987987987ll, 23, 41);
-  if (x != -2020050673)
+  if (x != (int)(0x987987987987ll - (23 << 3)))
 abort ();
 
   return 0;
Index: gcc/testsuite/gcc.target/aarch64/adds1.c
===
--- gcc/testsuite/gcc.target/a

Re: [PATCH][RTL-ifcvt] Make non-conditional execution if-conversion more aggressive

2015-07-13 Thread Kyrill Tkachov


On 13/07/15 10:45, Kyrill Tkachov wrote:




+/* Return iff the registers that the insns in BB_A set do not
+   get used in BB_B.  */

Return true iff

I tried to be too formal here ;) https://en.wikipedia.org/wiki/If_and_only_if
I'll use a normal if here.


Err, of course you were talking about the missing 'true'.
Sorry, early morning.

Kyrill




[Committed] Mark *.omp_data_i as non-trapping

2015-07-13 Thread Tom de Vries

[ was: Re: [gomp4, committed] Handle nested loops in kernels regions ]

On 13/07/15 10:36, Jakub Jelinek wrote:

On Mon, Jul 13, 2015 at 10:19:56AM +0200, Thomas Schwinge wrote:

We rely on pass_lim to move the *.omp_data_i loads out of the loop nest.
For the test-case, pass_lim was managing to move the load out of the
inner loop, but not the outer loop, because the load was classified as
'MOVE_PRESERVE_EXECUTION'. By marking the *.omp_data_i load
non-trapping, it's now classified as 'MOVE_POSSIBLE', and moved out of
the loop nest.


Should this go into trunk already?  (Jakub?)


I think so.



I already bootstrapped and regtested (on x86_64) the patch on top of trunk.

Committed to trunk.

Thanks,
- Tom

Mark *.omp_data_i as non-trapping

2015-07-12  Tom de Vries  

	* omp-low.c (build_receiver_ref): Mark *.omp_data_i as non-trapping.
---
 gcc/omp-low.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 22848a0..3135606 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -1127,6 +1127,7 @@ build_receiver_ref (tree var, bool by_ref, omp_context *ctx)
 field = x;
 
   x = build_simple_mem_ref (ctx->receiver_decl);
+  TREE_THIS_NOTRAP (x) = 1;
   x = omp_build_component_ref (x, field);
   if (by_ref)
 x = build_simple_mem_ref (x);
-- 
1.9.1



[PATCH][AArch64] Handle -|x| case using a single csneg

2015-07-13 Thread Kyrill Tkachov

Hi all,

For the testcase in the patch we were generating an extra neg instruction:
cmp w0, wzr
csneg   w0, w0, w0, ge
neg w0, w0
ret

instead of the optimal:
cmp w0, wzr
csneg   w0, w0, w0, lt
ret

The reason is that combine tries to merge the operation into a negation of an 
abs.
I considered teaching combine not to do that but it would require telling it 
that it shouldn't
do it if there is a conditional negate instruction. There's no optab for that 
though :(
Also, we already advertise that we have an abs optab, even though we expand to 
a compare and
a csneg anyway. This patch was the cleanest way I could do this. We just match 
the neg of an abs
and generate the same csneg sequence as for normal abs, just with the 
comparison condition inverted.

Bootstrapped and tested on aarch64.

Ok for trunk?
Thanks,
Kyrill

2015-07-13  Kyrylo Tkachov  

* config/aarch64/aarch64.md (*absneg2_insn): New
define_and_split.

2015-07-13  Kyrylo Tkachov  

* gcc.target/aarch64/neg-abs_1.c: New test.
commit 7527a76d25067ce4a5426e563e162487604ac6c1
Author: Kyrylo Tkachov 
Date:   Thu Jul 9 16:54:23 2015 +0100

[AArch64] Handle -|x| case using a single csneg

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index e6d0764..6664d1a 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -2333,6 +2333,29 @@ (define_expand "abs2"
   }
 )
 
+;; Combine will try merging (c > 0 ? -x : x) into (-|x|).  This isn't a good
+;; idea if the target has a conditional negate instruction and no integer
+;; abs instruction, but the midend doesn't have an optab for conditional neg
+;; and we advertise an optab for abs, so match that case here and emit the
+;; optimal CSNEG variant.
+(define_insn_and_split "*absneg2_insn"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(neg:GPI
+	  (abs:GPI (match_operand:GPI 1 "register_operand" "r"))))]
+  ""
+  "#"
+  ""
+  [(const_int 0)]
+  {
+rtx ccreg = aarch64_gen_compare_reg (LT, operands[1], const0_rtx);
+rtx x = gen_rtx_GE (VOIDmode, ccreg, const0_rtx);
+emit_insn (gen_csneg3_insn (operands[0], x, operands[1],
+   operands[1]));
+DONE;
+  }
+  [(set_attr "type" "csel")]
+)
+
 (define_insn "neg2"
   [(set (match_operand:GPI 0 "register_operand" "=r,w")
 	(neg:GPI (match_operand:GPI 1 "register_operand" "r,w")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/neg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/neg-abs_1.c
new file mode 100644
index 000..cb2a387
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/neg-abs_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2" } */
+
+int
+f1 (int x)
+{
+  return x < 0 ? x : -x;
+}
+
+long long
+f2 (long long x)
+{
+  return x < 0 ? x : -x;
+}
+
+/* { dg-final { scan-assembler-not "\tneg\tw\[0-9\]*.*" } } */
+/* { dg-final { scan-assembler-not "\tneg\tx\[0-9\]*.*" } } */


Re: [PATCH][RTL-ifcvt] Make non-conditional execution if-conversion more aggressive

2015-07-13 Thread Kyrill Tkachov

Hi Bernhard,

On 11/07/15 00:00, Bernhard Reutner-Fischer wrote:

On 10 July 2015 at 14:31, Kyrill Tkachov  wrote:

Hi all,

This patch makes if-conversion more aggressive when handling code of the
form:
if (test)
   x := a  //THEN
else
   x := b  //ELSE
The current code adds the costs of both the THEN and ELSE blocks and proceeds 
if they don't
exceed the branch cost. I don't think that's quite the right calculation.
We're going to be executing at least one of the basic blocks anyway.
With this patch we instead check the *maximum* of the two blocks against the branch 
cost.
This should still catch cases where a high latency instruction appears in one 
or both of
the paths.

Shouldn't this maximum also take probability into account? Or maybe
not, would have to think about it tomorrow.


The branch cost that we test against (recorded in if_info earlier in the ifcvt.c
call chain) is either the predictable branch cost or the unpredictable branch
cost, depending on what the predictable_edge_p machinery returned.
I think checking against that should be enough, but it's an easy thing to 
experiment
with, so I'm open to arguments in any direction.



$ contrib/check_GNU_style.sh rtl-ifcvt.00.patch

Blocks of 8 spaces should be replaced with tabs.
783:+return FALSE;


Generally ifcvt.c (resp. the whole tree) could use a
sed -i -e "s/\([[:space:]]\)FALSE/\1false/g" gcc/ifcvt.c
Maybe some of the int predicates could then become bools.


Ok, will go over the style in the patch.




+/* Return iff the registers that the insns in BB_A set do not
+   get used in BB_B.  */

Return true iff


I tried to be too formal here ;) https://en.wikipedia.org/wiki/If_and_only_if
I'll use a normal if here.




Did you include go in your testing?
I see:
Unexpected results in this build (new failures)
FAIL: encoding/json
FAIL: go/printer
FAIL: go/scanner
FAIL: html/template
FAIL: log
FAIL: net/http
FAIL: net/http/cgi
FAIL: net/http/cookiejar
FAIL: os
FAIL: text/template


Hmmm. I don't see these failures. I double checked right now and they
appear as PASS in my configuration.

I tested make check-go on x86_64-unknown-linux-gnu configured with
--without-isl --disable-multilib --enable-languages=c,c++,fortran,go.

Are you sure this is not some other issue in your tree?



bbs_ok_for_cmove_arith() looks costly but i guess you looked if
there's some pre-existing cleverness you could have used instead?


I did have a look, but couldn't find any.
The bbs_ok_for_cmove_arith is done after the costs check
so I'd hope that the costs check would already discard
really long basic-blocks.



noce_emit_bb() could use a better comment. Likewise insn_valid_noce_process_p().

insn_rtx_cost() should return an unsigned int, then_cost, else_cost
should thus be unsigned int too.

copy_of_a versus copy_of_insn_b; I'd shorten the latter.


Thanks, good suggestions.



bb_valid_for_noce_process_p() suggests that there is a JOIN_BB param
but there is none?
Also should document the return value (and should not clobber the OUT
params upon failure, no?).


bah, I forgot to update the comment once I modified the function
during development of the patch. I'll fix those.



As for the testcases, it would be nice to have at least a tiny bit for
x86_64, too.


I can put the testcases in gcc.dg and enable them for x86 as well,
but I think a couple of them already pass as is because x86 doesn't
need to do an extra zero_extend inside the basic-block.



PS: no -mbranch-cost and, a tad more seriously, no --param branch-cost either ;)
PPS: attached meant to illustrate comments above. Untested.


Thanks a lot! This is all very helpful.
I'll respin the patch.

Thanks,
Kyrill




cheers,




Re: [PATCH][4/n] Remove GENERIC stmt combining from SCCVN

2015-07-13 Thread Richard Biener
On Mon, 13 Jul 2015, Richard Biener wrote:

> On Sun, 12 Jul 2015, Jeff Law wrote:
> 
> > On 06/29/2015 01:58 AM, Richard Biener wrote:
> > > 
> > > In principle the following works for the testcase (even w/o fixing
> > > the VRP part).
> > > 
> > > Index: gcc/tree-ssa-dom.c
> > > ===
> > > --- gcc/tree-ssa-dom.c  (revision 225007)
> > > +++ gcc/tree-ssa-dom.c  (working copy)
> > > @@ -1409,6 +1409,14 @@ simplify_stmt_for_jump_threading (gimple
> > > return lookup_avail_expr (stmt, false);
> > >   }
> > > 
> > > +static tree
> > > +dom_valueize (tree t)
> > > +{
> > > +  if (TREE_CODE (t) == SSA_NAME)
> > > +return SSA_NAME_VALUE (t);
> > > +  return t;
> > > +}
> > > +
> > >   /* Record into the equivalence tables any equivalences implied by
> > >  traversing edge E (which are cached in E->aux).
> > > 
> > > @@ -1429,7 +1437,33 @@ record_temporary_equivalences (edge e)
> > > 
> > > /* If we have a simple NAME = VALUE equivalence, record it.  */
> > > if (lhs && TREE_CODE (lhs) == SSA_NAME)
> > > -   const_and_copies->record_const_or_copy (lhs, rhs);
> > > +   {
> > > + gimple use_stmt;
> > > + imm_use_iterator iter;
> > > + const_and_copies->record_const_or_copy (lhs, rhs);
> > > + FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
> > > +   {
> > > + /* Only bother to record more equivalences for lhs that
> > > +can be directly used by e->dest.
> > > +???  If the code gets re-organized to a worklist to
> > > +catch more indirect opportunities and it is made to
> > > +handle PHIs then this should only consider use_stmts
> > > +in basic-blocks we have already visited.  */
> > > + if (!dominated_by_p (CDI_DOMINATORS,
> > > +  e->dest, gimple_bb (use_stmt)))
> > > +   continue;
> > > + tree lhs = gimple_get_lhs (use_stmt);
> > > + if (lhs && TREE_CODE (lhs) == SSA_NAME)
> > > +   {
> > > + tree res = gimple_fold_stmt_to_constant_1 (use_stmt,
> > > +dom_valueize,
> > > +
> > > no_follow_ssa_edges);
> > > + if (TREE_CODE (res) == SSA_NAME
> > > + || is_gimple_min_invariant (res))
> > > +   const_and_copies->record_const_or_copy (lhs, res);
> > > +   }
> > > +   }
> > > +   }
> > > 
> > > /* If we have 0 = COND or 1 = COND equivalences, record them
> > >   into our expression hash tables.  */
> > > 
> > > 
> > > it's not using DOMs own stmt visiting machinery as that always modifies
> > > stmts in-place.  As stated in the comment it doesn't catch secondary
> > > opportunities.  That would be possible by using a work-list seeded
> > > by LHS we recorded new const/copies for and re-visiting their uses.
> > > You can get extra fancy here by properly handling PHIs and
> > > conditionals.  But it's a question of cost here, of course.
> > Right, the code you're modifying is only used by jump threading to record
> > temporary equivalences, particularly equivalences that are specific to a 
> > path.
> > 
> > 
> > > 
> > > Note that I think this isn't really "backward propagation" but
> > > just context sensitive value-numbering.
> > I think that's because we're looking at the problem differently.  It's
> > certainly not backward propagation in the traditional dataflow sense, so I'm
> > probably being too loose with terminology here.
> > 
> > When we discover something about X by means other than the definition of X, 
> > we
> > can look at how X was set and possibly discover a value for source operands 
> > of
> > that statement.  Similarly we can look at uses of X and possibly discover a
> > value for the destination of those statement(s).  In both cases we're going
> > backwards from an order-of-execution point of view and recording additional
> > equivalences.
> > 
> > The existing code did the former (look at X's defining statement and try to
> > discover an equivalence for a source operand in that statement). What we 
> > need
> > to optimize this case is the latter.
> > 
> > I *think* these are closely enough related that some code can be factored 
> > out
> > a bit and reused in both r_e_f_i_e and r_t_e to discover both types of
> > equivalences for DOM and for jump threading.
> 
> Indeed - the odd thing here is that one function uses
> const_and_copies->record_const_or_copy directly while the other one
> record_equality (this function is _solely_ used by 
> record_equivalences_from_incoming_edge).  I didn't want to introduce
> a callback to commonize the code (though in principle we could use
> a template function with a function template parameter...)
> 
> That said, I don't see that record_equality does sth not suitable
> if called f

Re: [PATCH] Improve in_array_bounds_p

2015-07-13 Thread Richard Biener
On Fri, 10 Jul 2015, Richard Biener wrote:

> On Fri, 10 Jul 2015, Richard Biener wrote:
> 
> > 
> > I was just testing the patch below which runs into latent issues when
> > building libjava (at least)...
> > 
> > /space/rguenther/src/svn/trunk/libjava/java/lang/natClassLoader.cc: In 
> > function ‘java::lang::Class* _Jv_FindClassInCache(_Jv_Utf8Const*)’:
> > /space/rguenther/src/svn/trunk/libjava/java/lang/natClassLoader.cc:97:1: 
> > error:BB 3 last statement has incorrectly set lp
> >  _Jv_FindClassInCache (_Jv_Utf8Const *name)
> >  ^
> > /space/rguenther/src/svn/trunk/libjava/java/lang/natClassLoader.cc:97:1: 
> > internal compiler error: verify_flow_info failed
> > 0x8e2132 verify_flow_info()
> > /space/rguenther/src/svn/trunk/gcc/cfghooks.c:261
> > 
> > so I have to debug that first.
> 
> It's stmts no longer throwing after VRP setting a value-range on
> an array index for example.  I've addressed this in the revised
> patch below which teaches CFG cleanup to deal with this (it
> already removes dead EH edges and makes similar adjustments for
> noreturn calls).
> 
> >  Still IMHO the patch makes sense apart
> > from the ugly need to go through a INTEGER_CST tree when converting
> > a wide_int to a widest_int (ugh).  Any wide-int folks around that
> > can suggest something better here (reason: the two integers we compare
> > do not have to have the same type/precision - see tree_int_cst_lt
> > which also uses widest_ints).
> 
> This issue still remains.

Fixed with widest_int::from (idx_min, TYPE_SIGN (TREE_TYPE (idx))).

But with the patch we run into the general issue that changing
a context insensitive predicate to use context sensitive information
leads to wrong-code.  (Only) gcc.c-torture/execute/pr51933.c
fails because if-conversion sees that v2[_18] cannot trap in

  if (u_14 <= 255)
goto ;
  else
goto ;

  :
  # RANGE [0, 255] NONZERO 255
  _18 = (int) u_14;
  _19 = v2[_18];

but of course it uses it to move v2[_18] out of its controlling
condition.  As the patch was supposed to improve if-conversion
in the first place (and other passes might be similarly confused)
I retract it.

We'd need a more specialized predicate that also gets
context information.

Richard.

> Bootstrap and regtest running on x86_64-unknown-linux-gnu.
> 
> Richard.
> 
> 2015-07-10  Richard Biener  
> 
>   * tree-eh.c (in_array_bounds_p): Use value-range information
>   when available.
>   * tree-cfgcleanup.c (cleanup_control_flow_bb): Clean stmts
>   from stale EH info.
> 
> Index: gcc/tree-eh.c
> ===
> --- gcc/tree-eh.c (revision 225655)
> +++ gcc/tree-eh.c (working copy)
> @@ -2532,8 +2532,11 @@ in_array_bounds_p (tree ref)
>  {
>tree idx = TREE_OPERAND (ref, 1);
>tree min, max;
> +  wide_int idx_min, idx_max;
>  
> -  if (TREE_CODE (idx) != INTEGER_CST)
> +  if (TREE_CODE (idx) != INTEGER_CST
> +  && (TREE_CODE (idx) != SSA_NAME
> +   || get_range_info (idx, &idx_min, &idx_max) != VR_RANGE))
>  return false;
>  
>min = array_ref_low_bound (ref);
> @@ -2544,11 +2547,26 @@ in_array_bounds_p (tree ref)
>|| TREE_CODE (max) != INTEGER_CST)
>  return false;
>  
> -  if (tree_int_cst_lt (idx, min)
> -  || tree_int_cst_lt (max, idx))
> -return false;
> +  if (TREE_CODE (idx) == INTEGER_CST)
> +{
> +  if (tree_int_cst_lt (idx, min)
> +   || tree_int_cst_lt (max, idx))
> + return false;
> +
> +  return true;
> +}
> +  else
> +{
> +  if (wi::lts_p (wi::to_widest (wide_int_to_tree (TREE_TYPE (idx),
> +   idx_min)),
> +  wi::to_widest (min))
> +   || wi::lts_p (wi::to_widest (max),
> + wi::to_widest (wide_int_to_tree (TREE_TYPE (idx),
> +  idx_max))))
> + return false;
>  
> -  return true;
> +  return true;
> +}
>  }
>  
>  /* Returns true if it is possible to prove that the range of
> Index: gcc/tree-cfgcleanup.c
> ===
> --- gcc/tree-cfgcleanup.c (revision 225662)
> +++ gcc/tree-cfgcleanup.c (working copy)
> @@ -256,6 +256,14 @@ cleanup_control_flow_bb (basic_block bb)
> && remove_fallthru_edge (bb->succs))
>  retval = true;
>  
> +  /* If a stmt may no longer throw, remove it from the EH tables
> + and cleanup dead EH edges.  */
> +  else if (maybe_clean_eh_stmt (stmt))
> +{
> +  gimple_purge_dead_eh_edges (bb);
> +  retval = true;
> +}
> +
>return retval;
>  }
>  

-- 
Richard Biener 
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Dilip Upmanyu, Graham 
Norton, HRB 21284 (AG Nuernberg)

Re: [Fortran, patch, pr64589, v1] [OOP] Linking error due to undefined integer symbol with unlimited polymorphism

2015-07-13 Thread Andre Vehreschild
Hi Mikael, hi everyone,

thanks for the review, Mikael. Committed as r225730.

Regards,
Andre


On Fri, 10 Jul 2015 18:12:53 +0200
Mikael Morin  wrote:

> Le 10/07/2015 16:51, Andre Vehreschild a écrit :
> > Hi everyone,
> > 
> > attached is a rather trivial patch to fix a linker issue when unlimited
> > polymorphism is used and the vtabs of intrinsic types are referenced from
> > two different locations (e.g. module and main program). Gfortran finds the
> > vtab defined in the scope of a module's subroutine and tries to link it to a
> > reference in a subroutine of the main program. Then name mangling takes
> > place (the module's name is prefixed to the vtab's identifier) and the
> > linker later on can not link the reference in the subroutine of the main
> > program to the module's entity. By putting the vtabs of all intrinsic types
> > into the top-level scope this is easily fixed. The linker now is able to
> > find the name (although it is mangled) and linking is fine. 
> > 
> > I rather don't understand why the decision to put intrinsic type's vtabs
> > into the local scope was choosen. There are not so many intrinsic types
> > that they can effectively clutter the top-level scope. Instead putting the
> > intrinsic types into local scope bloats the executable, because the same
> > entity is created over and over again. So this time removing two lines of
> > code did the trick. 
> > 
> > Bootstraps and regtests fine on x86_64-linux-gnu/f21.
> > 
> > Ok for trunk?
> > 
> OK. Thanks.
> 
> Mikael


-- 
Andre Vehreschild * Email: vehre ad gmx dot de 
Index: gcc/fortran/ChangeLog
===
--- gcc/fortran/ChangeLog	(Revision 225729)
+++ gcc/fortran/ChangeLog	(Arbeitskopie)
@@ -1,3 +1,9 @@
+2015-07-13  Andre Vehreschild  
+
+	PR fortran/64589
+	* class.c (find_intrinsic_vtab): Put/Search vtabs for intrinsic
+	types in the top-level namespace.
+
 2015-07-12  Aldy Hernandez  
 
 	* trans-stmt.c: Fix double word typos.
Index: gcc/fortran/class.c
===
--- gcc/fortran/class.c	(Revision 225729)
+++ gcc/fortran/class.c	(Arbeitskopie)
@@ -2511,10 +2511,8 @@
 
   sprintf (name, "__vtab_%s", tname);
 
-  /* Look for the vtab symbol in various namespaces.  */
-  gfc_find_symbol (name, gfc_current_ns, 0, &vtab);
-  if (vtab == NULL)
-	gfc_find_symbol (name, ns, 0, &vtab);
+  /* Look for the vtab symbol in the top-level namespace only.  */
+  gfc_find_symbol (name, ns, 0, &vtab);
 
   if (vtab == NULL)
 	{
Index: gcc/testsuite/ChangeLog
===
--- gcc/testsuite/ChangeLog	(Revision 225729)
+++ gcc/testsuite/ChangeLog	(Arbeitskopie)
@@ -1,3 +1,8 @@
+2015-07-13  Andre Vehreschild  
+
+	PR fortran/64589
+	* gfortran.dg/pr64589.f90: New test.
+
 2015-07-13  Renlin Li  
 
 	PR rtl/66556
Index: gcc/testsuite/gfortran.dg/pr64589.f90
===
--- gcc/testsuite/gfortran.dg/pr64589.f90	(Revision 0)
+++ gcc/testsuite/gfortran.dg/pr64589.f90	(Arbeitskopie)
@@ -0,0 +1,30 @@
+! { dg-do compile }
+! Just need to check if compiling and linking is possible.
+!
+! Check that the _vtab linking issue is resolved.
+! Contributed by Damian Rouson  
+
+module m
+contains
+  subroutine fmt()
+class(*), pointer :: arg
+select type (arg)
+type is (integer)
+end select
+  end subroutine
+end module
+
+program p
+  call getSuffix()
+contains
+  subroutine makeString(arg1)
+class(*) :: arg1
+select type (arg1)
+type is (integer)
+end select
+  end subroutine
+  subroutine getSuffix()
+call makeString(1)
+  end subroutine
+end
+


Re: [PATCH, PR ipa/66566] Fix ICE in early_inliner: internal compiler error: in operator[]

2015-07-13 Thread Ilya Enkovich
Ping

2015-06-18 12:54 GMT+03:00 Ilya Enkovich :
> Hi,
>
> In early_inliner we recompute the inline summaries for edges after 
> optimize_inline_calls, checking that each summary exists in case new edges 
> have appeared.  But it then calls inline_update_overall_summary, which also 
> walks the edges' inline summaries, this time without the check, causing a segfault.  
> This patch fixes it.  Bootstrapped and regtested for 
> x86_64-unknown-linux-gnu.  Is it OK for trunk and gcc-5-branch?
>
> Thanks,
> Ilya
> --
> gcc/
>
> 2015-06-18  Ilya Enkovich  
>
> PR ipa/66566
> * ipa-inline-analysis.c (estimate_calls_size_and_time): Check
> edge summary is available.
>
> gcc/testsuite/
>
> 2015-06-18  Ilya Enkovich  
>
> PR ipa/66566
> * gcc.target/i386/mpx/pr66566.c: New test.
>
>
> diff --git a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c
> index bbde855..e910ac5 100644
> --- a/gcc/ipa-inline-analysis.c
> +++ b/gcc/ipa-inline-analysis.c
> @@ -3122,6 +3122,9 @@ estimate_calls_size_and_time (struct cgraph_node *node, 
> int *size,
>struct cgraph_edge *e;
>for (e = node->callees; e; e = e->next_callee)
>  {
> +  if (inline_edge_summary_vec.length () <= (unsigned) e->uid)
> +   continue;
> +
>struct inline_edge_summary *es = inline_edge_summary (e);
>
>/* Do not care about zero sized builtins.  */
> @@ -3153,6 +3156,9 @@ estimate_calls_size_and_time (struct cgraph_node *node, 
> int *size,
>  }
>for (e = node->indirect_calls; e; e = e->next_callee)
>  {
> +  if (inline_edge_summary_vec.length () <= (unsigned) e->uid)
> +   continue;
> +
>struct inline_edge_summary *es = inline_edge_summary (e);
>if (!es->predicate
>   || evaluate_predicate (es->predicate, possible_truths))
> diff --git a/gcc/testsuite/gcc.target/i386/mpx/pr66566.c 
> b/gcc/testsuite/gcc.target/i386/mpx/pr66566.c
> new file mode 100644
> index 000..a405c20
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/mpx/pr66566.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fcheck-pointer-bounds -mmpx" } */
> +
> +union jsval_layout
> +{
> +  void *asPtr;
> +};
> +union jsval_layout a;
> +union jsval_layout b;
> +union jsval_layout __inline__ fn1() { return b; }
> +
> +void fn2() { a = fn1(); }


Re: [gomp4, committed] Handle nested loops in kernels regions

2015-07-13 Thread Jakub Jelinek
On Mon, Jul 13, 2015 at 10:19:56AM +0200, Thomas Schwinge wrote:
> > We rely on pass_lim to move the *.omp_data_i loads out of the loop nest. 
> > For the test-case, pass_lim was managing to move the load out of the 
> > inner loop, but not the outer loop, because the load was classified as 
> > 'MOVE_PRESERVE_EXECUTION'. By marking the *.omp_data_i load 
> > non-trapping, it's now classified as 'MOVE_POSSIBLE', and moved out of 
> > the loop nest.
> 
> Should this go into trunk already?  (Jakub?)

I think so.

> Do we need to audit the
> code for constructs that need similar treatment?

That might be helpful.

Jakub


Re: Merge trunk r225562 (2015-07-08) into gomp-4_0-branch

2015-07-13 Thread Thomas Schwinge
Hi Tom!

On Mon, 13 Jul 2015 09:20:16 +0200, Tom de Vries  wrote:
> On 12/07/15 11:39, Thomas Schwinge wrote:
> > On Fri, 10 Jul 2015 18:50:20 -0400, Nathan Sidwell 
> >  wrote:
> >> it looks like the most recent merge from trunk to gomp4 was early May.  I 
> >> think
> >> it is time for another one -- can you handle that?
> >
> > Indeed :-) -- and, as it happens, resolving the "merge artifacts" is one
> > of the things I've been working on last week.  I hope I got that all
> > right, in particular gcc/tree-parloops.c (Tom),
> 
> I've looked at the merge commit, gcc/tree-parloops.c was not modified.

(Well, it was, but not "substantially".)  You'd ported all your trunk
commits to gomp-4_0-branch already (thanks!), and in the functions where
I got merge conflicts, I just retained the code that was present on
gomp-4_0-branch already, which apparently was the right thing to do.  ;-)


> > gcc/tree-ssa-loop-ch.c (Tom),
> 
> That looks ok. I just wonder whether we could have derived 
> pass_ch_oacc_kernels from pass_ch instead of from ch_base, avoiding 
> duplicating the execute function, and have 
> pass_ch_oacc_kernels::process_loop_p call pass_ch::process_loop_p rather 
> than inline it.

Your call, depending on what makes the most sense regarding the semantics
of pass_ch_oacc_kernels.

I was just (pleasantly) surprised to find myself (capable of) doing a
little C++ programming, with classes, inheritance, and so on.  ;-)


Grüße,
 Thomas


pgpKJoS7CjriA.pgp
Description: PGP signature


Re: [gomp4, committed] Handle nested loops in kernels regions

2015-07-13 Thread Thomas Schwinge
Hi!

On Sun, 12 Jul 2015 14:46:02 +0200, Tom de Vries  wrote:
> This patch allows parallelization of an outer loop in an openacc kernels 
> region.

\o/


> We rely on pass_lim to move the *.omp_data_i loads out of the loop nest. 
> For the test-case, pass_lim was managing to move the load out of the 
> inner loop, but not the outer loop, because the load was classified as 
> 'MOVE_PRESERVE_EXECUTION'. By marking the *.omp_data_i load 
> non-trapping, it's now classified as 'MOVE_POSSIBLE', and moved out of 
> the loop nest.

Should this go into trunk already?  (Jakub?)  Do we need to audit the
code for constructs that need similar treatment?

* omp-low.c (build_receiver_ref): Mark *.omp_data_i as non-trapping.

--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -1147,6 +1147,7 @@ build_receiver_ref (tree var, bool by_ref, omp_context 
*ctx)
 field = x;
 
   x = build_simple_mem_ref (ctx->receiver_decl);
+  TREE_THIS_NOTRAP (x) = 1;
   x = omp_build_component_ref (x, field);
   if (by_ref)
 x = build_simple_mem_ref (x);


> I've committed this patch to gomp-4_0-branch.

Are you planning to also add a Fortran test case?


> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
> @@ -0,0 +1,42 @@
> +/* { dg-additional-options "-O2" } */
> +/* { dg-additional-options "-ftree-parallelize-loops=32" } */
> +/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
> +/* { dg-additional-options "-fdump-tree-optimized" } */
> +[...]
> +/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */

Committed in r225728:

commit c2da6458c51cc71dccec2e49481560b91d57aa1c
Author: tschwinge 
Date:   Mon Jul 13 08:14:05 2015 +

cleanup-tree-dump is no more

ERROR: (DejaGnu) proc "cleanup-tree-dump parloops_oacc_kernels" does 
not exist.

gcc/testsuite/
* c-c++-common/goacc/kernels-loop-nest.c: Remove cleanup-tree-dump
directives.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225728 
138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/testsuite/ChangeLog.gomp |5 +
 gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c |3 ---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git gcc/testsuite/ChangeLog.gomp gcc/testsuite/ChangeLog.gomp
index 636a087..4694a60 100644
--- gcc/testsuite/ChangeLog.gomp
+++ gcc/testsuite/ChangeLog.gomp
@@ -1,3 +1,8 @@
+2015-07-13  Thomas Schwinge  
+
+   * c-c++-common/goacc/kernels-loop-nest.c: Remove cleanup-tree-dump
+   directives.
+
 2015-07-12  Tom de Vries  
 
* c-c++-common/goacc/kernels-loop-nest.c: New test.
diff --git gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c 
gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
index 3e06c9f..e8eebaa 100644
--- gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
+++ gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
@@ -37,6 +37,3 @@ main (void)
 /* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 
"optimized" } } */
 
 /* { dg-final { scan-tree-dump-times "(?n)pragma omp target 
oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */
-
-/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
-/* { dg-final { cleanup-tree-dump "optimized" } } */


Grüße,
 Thomas


signature.asc
Description: PGP signature


Re: [PATCH, ARM] stop changing signedness in PROMOTE_MODE

2015-07-13 Thread Richard Biener
On Fri, Jul 10, 2015 at 5:46 PM, Jim Wilson  wrote:
> On Tue, Jul 7, 2015 at 2:35 PM, Richard Biener
>  wrote:
>> On July 7, 2015 6:29:21 PM GMT+02:00, Jim Wilson  
>> wrote:
>>>signed sub-word locals.  Thus to detect the need for a conversion, you
>>>have to have the decls, and we don't have them here.  There is also
>>
>> It probably is.  The decls for the parameter based SSA names are available, 
>> for the PHI destination there might be no decl.
>
> I tried looking again, and found the decls.  I'm able to get correct
> code for my testcase with the attached patch to force the conversion.
> It is rather inelegant, but I think I can cache the values I need to
> make this simpler and cleaner.  I still don't have decls from
> insert_part_to_rtx_on_edge and insert_rtx_to_part_on_edge, but it
> looks like those are for breaking cycles, and hence might not need
> conversions.

Yes, that looks like a defect.  CCing Micha, who wrote this code.

Richard.

> Jim


Re: [PATCH][4/n] Remove GENERIC stmt combining from SCCVN

2015-07-13 Thread Richard Biener
On Sun, 12 Jul 2015, Jeff Law wrote:

> On 06/29/2015 01:58 AM, Richard Biener wrote:
> > 
> > In principle the following works for the testcase (even w/o fixing
> > the VRP part).
> > 
> > Index: gcc/tree-ssa-dom.c
> > ===
> > --- gcc/tree-ssa-dom.c  (revision 225007)
> > +++ gcc/tree-ssa-dom.c  (working copy)
> > @@ -1409,6 +1409,14 @@ simplify_stmt_for_jump_threading (gimple
> > return lookup_avail_expr (stmt, false);
> >   }
> > 
> > +static tree
> > +dom_valueize (tree t)
> > +{
> > +  if (TREE_CODE (t) == SSA_NAME)
> > +return SSA_NAME_VALUE (t);
> > +  return t;
> > +}
> > +
> >   /* Record into the equivalence tables any equivalences implied by
> >  traversing edge E (which are cached in E->aux).
> > 
> > @@ -1429,7 +1437,33 @@ record_temporary_equivalences (edge e)
> > 
> > /* If we have a simple NAME = VALUE equivalence, record it.  */
> > if (lhs && TREE_CODE (lhs) == SSA_NAME)
> > -   const_and_copies->record_const_or_copy (lhs, rhs);
> > +   {
> > + gimple use_stmt;
> > + imm_use_iterator iter;
> > + const_and_copies->record_const_or_copy (lhs, rhs);
> > + FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
> > +   {
> > + /* Only bother to record more equivalences for lhs that
> > +can be directly used by e->dest.
> > +???  If the code gets re-organized to a worklist to
> > +catch more indirect opportunities and it is made to
> > +handle PHIs then this should only consider use_stmts
> > +in basic-blocks we have already visited.  */
> > + if (!dominated_by_p (CDI_DOMINATORS,
> > +  e->dest, gimple_bb (use_stmt)))
> > +   continue;
> > + tree lhs = gimple_get_lhs (use_stmt);
> > + if (lhs && TREE_CODE (lhs) == SSA_NAME)
> > +   {
> > + tree res = gimple_fold_stmt_to_constant_1 (use_stmt,
> > +dom_valueize,
> > +
> > no_follow_ssa_edges);
> > + if (TREE_CODE (res) == SSA_NAME
> > + || is_gimple_min_invariant (res))
> > +   const_and_copies->record_const_or_copy (lhs, res);
> > +   }
> > +   }
> > +   }
> > 
> > /* If we have 0 = COND or 1 = COND equivalences, record them
> >   into our expression hash tables.  */
> > 
> > 
> > it's not using DOMs own stmt visiting machinery as that always modifies
> > stmts in-place.  As stated in the comment it doesn't catch secondary
> > opportunities.  That would be possible by using a work-list seeded
> > by LHS we recorded new const/copies for and re-visiting their uses.
> > You can get extra fancy here by properly handling PHIs and
> > conditionals.  But it's a question of cost here, of course.
> Right, the code you're modifying is only used by jump threading to record
> temporary equivalences, particularly equivalences that are specific to a path.
> 
> 
> > 
> > Note that I think this isn't really "backward propagation" but
> > just context sensitive value-numbering.
> I think that's because we're looking at the problem differently.  It's
> certainly not backward propagation in the traditional dataflow sense, so I'm
> probably being too loose with terminology here.
> 
> When we discover something about X by means other than the definition of X, we
> can look at how X was set and possibly discover a value for source operands of
> that statement.  Similarly we can look at uses of X and possibly discover a
> value for the destination of those statement(s).  In both cases we're going
> backwards from an order-of-execution point of view and recording additional
> equivalences.
> 
> The existing code did the former (look at X's defining statement and try to
> discover an equivalence for a source operand in that statement). What we need
> to optimize this case is the latter.
> 
> I *think* these are closely enough related that some code can be factored out
> a bit and reused in both r_e_f_i_e and r_t_e to discover both types of
> equivalences for DOM and for jump threading.

Indeed - the odd thing here is that one function uses
const_and_copies->record_const_or_copy directly while the other one
uses record_equality (this function is _solely_ used by 
record_equivalences_from_incoming_edge).  I didn't want to introduce
a callback to commonize the code (though in principle we could use
a template function with a function template parameter...)

That said, I don't see that record_equality does anything unsuitable
if called from record_temporary_equivalences.  So if we make
use of that function we could simply call record_temporary_equivalences
from record_equivalences_from_incoming_edge.

Richard.
 
> Jeff
> 
> 

-- 
Richard Biener 
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Dilip Upma

  1   2   >