[Ada] Adjust Esize processing in gigi

2012-06-11 Thread Eric Botcazou
This makes it so that the size of types (Esize) is retrieved early and properly 
capped for elementary types, in order to avoid obscure failures later on for 
huge sizes.  Nothing is changed for composite types.

Tested on i586-suse-linux, applied on the mainline.


2012-06-11  Eric Botcazou  ebotca...@adacore.com

* gcc-interface/decl.c (gnat_to_gnu_entity): Translate the Esize on
entry only for elementary types and abort if it is too large.
E_Record_Type: Make sure the Esize is known before using it.


-- 
Eric Botcazou
Index: gcc-interface/decl.c
===
--- gcc-interface/decl.c	(revision 188377)
+++ gcc-interface/decl.c	(working copy)
@@ -377,11 +377,12 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 	prepend_attributes (First_Subtype (Base_Type (gnat_entity)),
 			attr_list);
 
-  /* Compute a default value for the size of the type.  */
-  if (Known_Esize (gnat_entity)
-	  && UI_Is_In_Int_Range (Esize (gnat_entity)))
+  /* Compute a default value for the size of an elementary type.  */
+  if (Known_Esize (gnat_entity) && Is_Elementary_Type (gnat_entity))
 	{
 	  unsigned int max_esize;
+
+	  gcc_assert (UI_Is_In_Int_Range (Esize (gnat_entity)));
 	  esize = UI_To_Int (Esize (gnat_entity));
 
 	  if (IN (kind, Float_Kind))
@@ -2948,14 +2949,16 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 	if (Known_Alignment (gnat_entity))
 	  TYPE_ALIGN (gnu_type)
 	= validate_alignment (Alignment (gnat_entity), gnat_entity, 0);
-	else if (Is_Atomic (gnat_entity))
-	  TYPE_ALIGN (gnu_type)
-	= esize >= BITS_PER_WORD ? BITS_PER_WORD : ceil_pow2 (esize);
+	else if (Is_Atomic (gnat_entity) && Known_Esize (gnat_entity))
+	  {
+	unsigned int size = UI_To_Int (Esize (gnat_entity));
+	TYPE_ALIGN (gnu_type)
+	  = size >= BITS_PER_WORD ? BITS_PER_WORD : ceil_pow2 (size);
+	  }
 	/* If a type needs strict alignment, the minimum size will be the
 	   type size instead of the RM size (see validate_size).  Cap the
 	   alignment, lest it causes this type size to become too large.  */
-	else if (Strict_Alignment (gnat_entity)
-		 && Known_RM_Size (gnat_entity))
+	else if (Strict_Alignment (gnat_entity) && Known_RM_Size (gnat_entity))
 	  {
 	unsigned int raw_size = UI_To_Int (RM_Size (gnat_entity));
 	unsigned int raw_align = raw_size & -raw_size;


[Ada] Fix link error on conditional expression

2012-06-11 Thread Eric Botcazou
The executable cannot be linked because of a duplicated symbol in the object 
files, which comes from a conditional expression used to initialize a constant 
that is the upper bound of an index type in an array.

Tested on i586-suse-linux, applied on the mainline.


2012-06-11  Eric Botcazou  ebotca...@adacore.com

* gcc-interface/decl.c (gnat_to_gnu_entity): Do not specifically deal
with external constants wrt force_global here...
E_Constant: ...but here instead.
* gcc-interface/utils.c (gnat_pushdecl): Do not put external DECLs onto
the list of global DECLs.


2012-06-11  Eric Botcazou  ebotca...@adacore.com

* gnat.dg/constant4.adb: New test.
* gnat.dg/constant4_pkg.ads: New helper.


-- 
Eric Botcazou
Index: gcc-interface/utils.c
===
--- gcc-interface/utils.c	(revision 188377)
+++ gcc-interface/utils.c	(working copy)
@@ -572,14 +572,14 @@ gnat_pushdecl (tree decl, Node_Id gnat_n
   if (!(TREE_CODE (decl) == TYPE_DECL
	 && TREE_CODE (TREE_TYPE (decl)) == UNCONSTRAINED_ARRAY_TYPE))
 {
-  if (global_bindings_p ())
+  if (DECL_EXTERNAL (decl))
 	{
-	  VEC_safe_push (tree, gc, global_decls, decl);
-
 	  if (TREE_CODE (decl) == FUNCTION_DECL && DECL_BUILT_IN (decl))
 	VEC_safe_push (tree, gc, builtin_decls, decl);
 	}
-  else if (!DECL_EXTERNAL (decl))
+  else if (global_bindings_p ())
+	VEC_safe_push (tree, gc, global_decls, decl);
+  else
 	{
 	  DECL_CHAIN (decl) = BLOCK_VARS (current_binding_level->block);
 	  BLOCK_VARS (current_binding_level->block) = decl;
Index: gcc-interface/decl.c
===
--- gcc-interface/decl.c	(revision 188378)
+++ gcc-interface/decl.c	(working copy)
@@ -348,12 +348,10 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
  another compilation unit) public entities, show we are at global level
  for the purpose of computing scopes.  Don't do this for components or
  discriminants since the relevant test is whether or not the record is
- being defined.  Don't do this for constants either as we'll look into
- their defining expression in the local context.  */
+ being defined.  */
   if (!definition
       && kind != E_Component
       && kind != E_Discriminant
-      && kind != E_Constant
       && Is_Public (gnat_entity)
       && !Is_Statically_Allocated (gnat_entity))
 force_global++, this_global = true;
@@ -424,6 +422,7 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 	 != N_Allocator)
 	{
 	  bool went_into_elab_proc = false;
+	  int save_force_global = force_global;
 
 	  /* The expression may contain N_Expression_With_Actions nodes and
 	 thus object declarations from other units.  In this case, even
@@ -436,11 +435,13 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 	  current_function_decl = get_elaboration_procedure ();
 	  went_into_elab_proc = true;
 	}
+	  force_global = 0;
 	  gnat_pushlevel ();
 
 	  gnu_expr = gnat_to_gnu (Expression (Declaration_Node (gnat_entity)));
 
 	  gnat_zaplevel ();
+	  force_global = save_force_global;
 	  if (went_into_elab_proc)
 	current_function_decl = NULL_TREE;
 	}
-- { dg-do link }
-- { dg-options -gnat12 }

with Constant4_Pkg; use Constant4_Pkg;

procedure Constant4 is
   Sum : Counter := 0;
begin
   for Count of Steals loop
  Sum := Sum + Count;
   end loop;
end;
with Ada.Command_Line;   use Ada.Command_Line;
with System.Multiprocessors; use System.Multiprocessors;

package Constant4_Pkg is

   Max_CPUs : constant CPU := (if Argument_Count < 2 then Number_Of_CPUs
                               else CPU'Value (Argument (2)));

   subtype Worker_Id is CPU range 1 .. Max_CPUs;

   type Counter is range 0 .. 10**18;

   Steals : array (Worker_Id) of Counter := (others => 0);

end Constant4_Pkg;


Re: [PATCH 1/2] SH epilogue unwind, dwarf2 pass changes

2012-06-11 Thread Chung-Lin Tang
Ping?

On 2012/6/1 06:24 PM, Chung-Lin Tang wrote:
 On 12/5/23 1:46 AM, Richard Henderson wrote:
 On 05/18/12 03:48, Chung-Lin Tang wrote:
 @@ -2401,6 +2401,7 @@ scan_trace (dw_trace_info *trace)
 {
   /* Propagate across fallthru edges.  */
   dwarf2out_flush_queued_reg_saves ();
 + def_cfa_1 (this_cfa);
   maybe_record_trace_start (insn, NULL);
   break;
 }
 @@ -2455,6 +2456,18 @@ scan_trace (dw_trace_info *trace)
   cur_cfa = this_cfa;
   continue;
 }
 + else
 +   {
 + /* If ELT is a annulled branch-taken instruction (i.e. 
 executed
 +only when branch is not taken), the args_size and CFA 
 should
 +not change through the jump.  */
 + create_trace_edges (control);
 +
 + /* Update and continue with the trace.  */
 + add_cfi_insn = insn;
 + scan_insn_after (elt);
 + continue;
 +   }

 I think the def_cfa_1 is misplaced.  It should be immediately before
 that last continue.  That mirrors the sort of flow you get via the
 other paths through the loop.
 
 Or possibly moved before the dwarf2out_flush_queued_reg_saves () call in
 the patch? (in the save_point_p () break case)  Note I'm only saying
 this based on overall ordering of those two routine calls in the loop.
 
 Attached is a testcase (adapted from libgomp) that, with the SH epilogue
 unwind patch applied alone, produces the ICE I'm seeing (-O2
 -funwind-tables).
 
 This dwarf2 patch, with any of the three new def_cfa_1() call sites
 seems to solve it, though you might want to comment on which call site
 seems correct
 
 Thanks,
 Chung-Lin



Re: [patch] Do not include rtl.h in tree-phinodes.c

2012-06-11 Thread Richard Guenther
On Wed, Jun 6, 2012 at 5:16 PM, Steven Bosscher stevenb@gmail.com wrote:
 Hello,

 tree-phinodes.c includes rtl.h for ... ceil_log2 (?!).

 Moving ceil_log2 to hwint.c/hwint.h to join its sibling floor_log2
 breaks this dependency.

 Bootstrapped on x86_64-unknown-linux-gnu. OK for trunk?

Ok.

Thanks,
Richard.

 Ciao!
 Steven


Re: [patch] Do not include rtl.h in cfgloop.h

2012-06-11 Thread Richard Guenther
On Wed, Jun 6, 2012 at 5:16 PM, Steven Bosscher stevenb@gmail.com wrote:
 Hello,

 cfgloop.h only depends on rtl.h for rtx_code. The patch introduces a
 new enum and a simple function to convert from one enum to another in
 a few non-performance critical places.

 The upside is that this makes most of the tree-* files independent of rtl.h.

 Bootstrapped on x86_64-unknown-linux-gnu. OK for trunk?

Ok with updating the comment here:

   /* The type of extend applied to it (SIGN_EXTEND, ZERO_EXTEND or
UNKNOWN).  */
-  enum rtx_code extend;
+  enum iv_extend_code extend;

thus, IV_SIGN_EXTEND, etc.

Thanks,
Richard.


 Ciao!
 Steven


[Ada] Plug hole with constants used by reference

2012-06-11 Thread Eric Botcazou
This aligns the processing of constants used by reference in Identifier_to_gnu 
with that of regular constants; in particular we stop returning the underlying 
constant if this is the address of a constant and an lvalue is required.

Tested on i586-suse-linux, applied on the mainline.


2012-06-11  Eric Botcazou  ebotca...@adacore.com

* gcc-interface/trans.c (Identifier_to_gnu): Test Is_Elementary_Type
instead of Is_Scalar_Type for a constant with an address clause.
Do not return the underlying constant for a constant used by reference
if it holds the address of a constant and an lvalue is required.


-- 
Eric Botcazou
Index: gcc-interface/trans.c
===
--- gcc-interface/trans.c	(revision 188377)
+++ gcc-interface/trans.c	(working copy)
@@ -1019,7 +1019,7 @@ Identifier_to_gnu (Node_Id gnat_node, tr
  order-of-elaboration issue here.  */
   gnu_result_type = get_unpadded_type (gnat_temp_type);
 
-  /* If this is a non-imported scalar constant with an address clause,
+  /* If this is a non-imported elementary constant with an address clause,
  retrieve the value instead of a pointer to be dereferenced unless
  an lvalue is required.  This is generally more efficient and actually
  required if this is a static expression because it might be used
@@ -1028,7 +1028,7 @@ Identifier_to_gnu (Node_Id gnat_node, tr
  volatile-ness short-circuit here since Volatile constants must be
  imported per C.6.  */
   if (Ekind (gnat_temp) == E_Constant
-      && Is_Scalar_Type (gnat_temp_type)
+      && Is_Elementary_Type (gnat_temp_type)
       && !Is_Imported (gnat_temp)
       && Present (Address_Clause (gnat_temp)))
 {
@@ -1080,7 +1080,10 @@ Identifier_to_gnu (Node_Id gnat_node, tr
 	  = convert (build_pointer_type (gnu_result_type), gnu_result);
 
   /* If it's a CONST_DECL, return the underlying constant like below.  */
-  else if (TREE_CODE (gnu_result) == CONST_DECL)
+  else if (TREE_CODE (gnu_result) == CONST_DECL
+	   && !(DECL_CONST_ADDRESS_P (gnu_result)
+		&& lvalue_required_p (gnat_node, gnu_result_type, true,
+				      true, false)))
 	gnu_result = DECL_INITIAL (gnu_result);
 
   /* If it's a renaming pointer and we are at the right binding level,


[PATCH, GCC DOC][AArch64] Fix layout in AArch64 target-specific documentation

2012-06-11 Thread Sofiane Naci
Hi,

This patch fixes several layout, formatting and wording issues in the
AArch64 target-specific documentation.

Thanks
Sofiane

-

2012-06-11  Sofiane Naci sofiane.n...@arm.com

[AArch64] Fix documentation layout.

* doc/invoke.texi: Fix white spaces after dots.
Change aarch64*be-*-* to aarch64_be-*-*.
Add documentation for -mcmodel=tiny.
(-march): Fix formatting.
(-mcpu): Likewise.
(-mtune): Rephrase.
(-march and -mcpu feature modifiers): New subsubsection.


aarch64-fix-doc.patch
Description: Binary data


Re: Go patch RFA: Fix taking the address of a field of a local variable

2012-06-11 Thread Richard Guenther
On Thu, Jun 7, 2012 at 10:08 AM, Ian Lance Taylor i...@google.com wrote:
 This patch to the Go compiler fixes the case of taking the address of a
 field of a local variable.  Previously taking the address of the field
 was not treated as taking the address of the variable.  But, of course,
 it is.  This patch fixes the problem.  Bootstrapped and ran Go testsuite
 on x86_64-unknown-linux-gnu.  Committed to mainline.

 OK for 4.7 branch?  This is a real bug, bad runtime code generated for
 valid Go code.

Ok for 4.7.2, thus after 4.7.1 has been released.

Thanks,
Richard.

 Ian



Re: [patch] Remove cfglayout.h, and distribute cfglayout.c code to users

2012-06-11 Thread Richard Guenther
On Thu, Jun 7, 2012 at 10:18 PM, Steven Bosscher stevenb@gmail.com wrote:
 Hello,

 The attached patch removes cfglayout.h. It was included in a lot of
 places that didn't need it, and it didn't define much anyway.

 While checking the users of cfglayout.h, I noticed that a lot of code
 in cfglayout.c can be made local to users (reemit_insn_block_notes -
 final.c) or doesn't have to do much with cfglayout mode per se (all
 insn locator code - emit-rtl.c). This of course uncovered a buglet:
 function.h's x_last_location wasn't used, but emit-rtl.c had a define
 that interfered with the last_location for the insn locators.

 And it seemed rather strange that emit-rtl.c should include
 tree-flow.h, so I moved some non-GIMPLE tree-EH related function
 prototypes from tree-flow.h to tree.h (in retrospect, I should
 probably have made that a separate patch, but these patches tend to
 evolve, rather than be designed... ;-)

 Bootstrapped on x86_64-unknown-linux-gnu. OK?

Ok!

Thanks,
Richard.

 Ciao!
 Steven


Re: [patch] Move split_double from final.c to rtlanal.c, do not include output.h in expr.c

2012-06-11 Thread Richard Guenther
On Fri, Jun 8, 2012 at 12:23 AM, Steven Bosscher stevenb@gmail.com wrote:
 Hello,

 Another patch to move around some code to break up a strange
 dependency. expr.c isn't supposed to emit assembly so it shouldn't
 need output.h, but it needs split_double from final.c. This
 split_double has nothing to do with writing out assembly, so it
 shouldn't be in final.c, and I moved it to rtlanal.c. This, and moving
 one more prototype from output.h to tree.h (an oversight in a previous
 patch) , makes it possible to no longer include output.h in expr.c.

 Bootstrapped & tested on powerpc64-unknown-linux-gnu. OK for trunk?

Ok.

Thanks,
Richard.

 Ciao!
 Steven


[arm-embedded] Backport mainline r181172, r184089, r184180

2012-06-11 Thread Joey Ye
Committed to ARM/embedded-4_6-branch

2012-06-11  Joey Ye  joey...@arm.com

Backport r184089,184180 from mainline
2012-02-10  Jan Hubicka  j...@suse.cz

PR middle-end/48600
* predict.c (predict_paths_for_bb): Prevent looping.
(predict_paths_leading_to_edge, predict_paths_leading_to): Update.

2012-02-13  Jan Hubicka  j...@suse.cz

PR middle-end/52214
* predict.c (predict_paths_for_bb): Fix thinko in prevoius patch.


Backport partly r181172 from mainline
2011-11-08  Michael Matz  m...@suse.de

PR bootstrap/51969
* gengtype.c (write_field_root): Avoid out-of-scope access of newv.

testsuite:

Backport r184089 from mainline
2012-02-10  Jan Hubicka  j...@suse.cz

PR middle-end/48600
* g++.dg/torture/pr48600.C: New testcase.







Re: [PATCH, GCC DOC][AArch64] Fix layout in AArch64 target-specific documentation

2012-06-11 Thread Marcus Shawcroft

On 11/06/12 09:54, Sofiane Naci wrote:


(-march and -mcpu feature modifiers): New subsubsection.


Copy and paste subsub?

/Marcus



[Ada] Fix unexpected allocation change with size clause

2012-06-11 Thread Eric Botcazou
The attached packages declare two big objects of the same size, but for the 
first package the object is allocated statically whereas, for the second one, 
the object is allocated dynamically because of the size clause.

The discrepancy stems from a bit vs byte confusion in a specific place in gigi.

Tested on i586-suse-linux, applied on the mainline.


2012-06-11  Eric Botcazou  ebotca...@adacore.com

* gcc-interface/decl.c (gnat_to_gnu_entity) object: Convert GNU_SIZE
to units before invoking allocatable_size_p on it.
Remove orphaned comment.  Do not use ssize_int.
E_Record_Subtype: Traverse list in original order.  Minor tweak.
(allocatable_size_p): Adjust and simplify.
(build_subst_list): Use consistent terminology throughout.
(build_variant_list): Likewise.  Traverse list in original order.
(create_field_decl_from): Likewise.
(copy_and_substitute_in_size): Likewise.
(create_variant_part_from): Add comment about field list order.
* gcc-interface/utils.c (build_vms_descriptor): Do not use ssize_int.
* gcc-interface/utils2.c (build_allocator): Likewise.


2012-06-11  Eric Botcazou  ebotca...@adacore.com

* gnat.dg/specs/array1.ads: New test.
* gnat.dg/specs/array2.ads: Likewise.
* gnat.dg/array22.adb: Likewise.


-- 
Eric Botcazou
-- { dg-do compile }

pragma Restrictions (No_Elaboration_Code);

package Array1 is

  type Arr is array (Positive range <>) of Boolean;
  A : Arr (1 .. 2 ** 29);

end Array1;
-- { dg-do compile }
-- { dg-options -gnatws }

pragma Restrictions (No_Elaboration_Code);

package Array2 is

  type Arr is array (Positive range <>) of Boolean;
  A : Arr (1 .. 2 ** 2);
  for A'Size use 16#1000__0#;

end Array2;
-- { dg-do compile }

with System; use System;

procedure Array22 is

   type Integer_Address is mod Memory_Size;

   type Memory is array (Integer_Address range <>) of Character;

   type Chunk (First, Last : Integer_Address) is record
  Mem : Memory (First .. Last);
   end record;

   C : Chunk (1, 8);
   for C'Alignment use 8;
   pragma Unreferenced (C);

begin
   null;
end;
Index: gcc-interface/utils.c
===
--- gcc-interface/utils.c	(revision 188379)
+++ gcc-interface/utils.c	(working copy)
@@ -3601,7 +3601,7 @@ build_vms_descriptor (tree type, Mechani
 			 record_type, size_int (klass), field_list);
   field_list
 = make_descriptor_field (MBMO, gnat_type_for_size (32, 1),
-			 record_type, ssize_int (-1), field_list);
+			 record_type, size_int (-1), field_list);
   field_list
 = make_descriptor_field (LENGTH, gnat_type_for_size (64, 1),
 			 record_type,
Index: gcc-interface/decl.c
===
--- gcc-interface/decl.c	(revision 188379)
+++ gcc-interface/decl.c	(working copy)
@@ -1283,10 +1283,14 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
  global_bindings_p ()
  || !definition
  || static_p)
-	|| (gnu_size && !allocatable_size_p (gnu_size,
-		 global_bindings_p ()
-		 || !definition
-		 || static_p)))
+	|| (gnu_size
+		 && !allocatable_size_p (convert (sizetype,
+		 size_binop
+		 (CEIL_DIV_EXPR, gnu_size,
+		  bitsize_unit_node)),
+	global_bindings_p ()
+	|| !definition
+	|| static_p)))
 	  {
 	gnu_type = build_reference_type (gnu_type);
 	gnu_size = NULL_TREE;
@@ -2204,8 +2208,6 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 			debug_info_p);
 	TYPE_READONLY (gnu_template_type) = 1;
 
-	/* Now build the array type.  */
-
 	/* If Component_Size is not already specified, annotate it with the
 	   size of the component.  */
 	if (Unknown_Component_Size (gnat_entity))
@@ -2810,12 +2812,14 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 	tree gnu_lower_bound
 	  = convert (gnu_string_index_type,
 		 gnat_to_gnu (String_Literal_Low_Bound (gnat_entity)));
-	int length = UI_To_Int (String_Literal_Length (gnat_entity));
-	tree gnu_length = ssize_int (length - 1);
+	tree gnu_length
+	  = UI_To_gnu (String_Literal_Length (gnat_entity),
+		   gnu_string_index_type);
 	tree gnu_upper_bound
 	  = build_binary_op (PLUS_EXPR, gnu_string_index_type,
 			 gnu_lower_bound,
-			 convert (gnu_string_index_type, gnu_length));
+			 int_const_binop (MINUS_EXPR, gnu_length,
+	  integer_one_node));
 	tree gnu_index_type
 	  = create_index_type (convert (sizetype, gnu_lower_bound),
 			   convert (sizetype, gnu_upper_bound),
@@ -3298,7 +3302,7 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 	  if (gnu_variant_part)
 		{
 		  variant_desc *v;
-		  unsigned ix;
+		  unsigned int i;
 
 		  gnu_variant_list
 		= build_variant_list (TREE_TYPE (gnu_variant_part),
@@ -3307,8 +3311,7 @@ gnat_to_gnu_entity (Entity_Id gnat_entit
 		  /* If all the qualifiers are unconditionally true, the
 		 innermost variant is statically selected.  */
 		  

Re: [PATCH] Correct cost model for strided loads

2012-06-11 Thread Richard Guenther
On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt
wschm...@linux.vnet.ibm.com wrote:
 The fix for PR53331 caused a degradation to 187.facerec on
 powerpc64-unknown-linux-gnu.  The following simple patch reverses the
 degradation without otherwise affecting SPEC cpu2000 or cpu2006.
 Bootstrapped and regtested on that platform with no new regressions.  Ok
 for trunk?

Well, would the real cost not be subparts * scalar_to_vec plus
subparts * vec_perm?
At least vec_perm isn't the cost for building up a vector from N scalar elements
either (it might be close enough if subparts == 2).  What's the case
with facerec
here?  Does it have subparts == 2?  I really wanted to pessimize this case
for say AVX and char elements, thus building up a vector from 32 scalars which
certainly does not cost a mere vec_perm.  So, maybe special-case the
subparts == 2 case and assume vec_perm would match the cost only in that
case.

Thanks,
Richard.

 Thanks,
 Bill


 2012-06-10  Bill Schmidt  wschm...@linux.ibm.com

        * tree-vect-stmts.c (vect_model_load_cost):  Change cost model
        for strided loads.


 Index: gcc/tree-vect-stmts.c
 ===
 --- gcc/tree-vect-stmts.c       (revision 188341)
 +++ gcc/tree-vect-stmts.c       (working copy)
 @@ -1031,11 +1031,10 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
   /* The loads themselves.  */
   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
     {
 -      /* N scalar loads plus gathering them into a vector.
 -         ???  scalar_to_vec isn't the cost for that.  */
 +      /* N scalar loads plus gathering them into a vector.  */
       inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
                      * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)));
 -      inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
 +      inside_cost += ncopies * vect_get_stmt_cost (vec_perm);
     }
   else
     vect_get_load_cost (first_dr, ncopies,




Re: [Fortran, DRAFT patch] PR 46321 - [OOP] Polymorphic deallocation

2012-06-11 Thread Alessandro Fanfarillo
Thank you for the review, with this patch I get some ICEs during the
regstest with:

gfortran.dg/coarray/poly_run_3.f90
gfortran.dg/elemental_optional_args_5.f03
gfortran.dg/select_type_26.f03
gfortran.dg/select_type_27.f03
gfortran.dg/class_48.f90
gfortran.dg/class_allocate_10.f03
gfortran.dg/class_allocate_8.f03
gfortran.dg/class_array_1.f03
gfortran.dg/class_array_2.f03
gfortran.dg/assumed_type_2.f90
gfortran.dg/class_array_9.f03
gfortran.dg/coarray_lib_alloc_2.f90

I've debugged only the first 2 and the problem seems to be related
with tmp =  fold_build2_loc (input_location, NE_EXPR,
boolean_type_node, se.expr, build_int_cst (TREE_TYPE (se.expr), 0)); 
in trans-stmt.c at line 5376. The ICE message is the following:

$ gcc/bin/gfortran -c elemental_optional_args_5.f03
elemental_optional_args_5.f03: In function ‘MAIN__’:
elemental_optional_args_5.f03:220:0: internal compiler error: in
build_int_cst_wide, at tree.c:1219
 deallocate (taa, tpa, caa, cpa)
 ^
Please submit a full bug report,
with preprocessed source if appropriate.
See http://gcc.gnu.org/bugs.html for instructions.



2012/6/10 Tobias Burnus bur...@net-b.de:
 Alessandro Fanfarillo wrote:

 with the priceless support of Tobias I've almost realized the patch
 for this PR. In attachment there's the second draft. During the
 regression test I have only one error with select_type_4.f90. The
 problem is in the destroy_list subroutine when it checks
 associated(node) after the first deallocate(node).


 --- gcc/fortran/trans-stmt.c    (revisione 188002)
 +++ gcc/fortran/trans-stmt.c    (copia locale)
 @@ -5341,7 +5341,12 @@ gfc_trans_deallocate (gfc_code *code)
    for (al = code->ext.alloc.list; al != NULL; al = al->next)
     {
 -      gfc_expr *expr = gfc_copy_expr (al->expr);
 +      gfc_expr *expr;
 +      gfc_expr *ppc;
 +      gfc_code *ppc_code;
 +      gfc_actual_arglist *actual;
 +      expr = gfc_copy_expr (al->expr);
 +      ppc = gfc_copy_expr (expr);
 ...
 +      if (expr->symtree->n.sym->ts.type == BT_CLASS)


 I'd prefer:

 gfc_expr *ppc = NULL;
 ...
 if (expr->ts.type == BT_CLASS)
  ppc = gfc_copy_expr (expr);
 ...
 if (ppc)
  ...

 Namely: Only copy the expression if needed.

 Additionally, the check if (expr->symtree->n.sym->ts.type == BT_CLASS) is
 wrong. For instance, for
  type(t) :: x
  deallocate(x%class)
 it won't trigger, but it should.

 Actually, I think a cleaner version would be:

 if (al->expr->ts.type == BT_CLASS)
  {
    gfc_expr *ppc;
    ppc = gfc_copy_expr (al->expr);

  * * *

 Furthermore, I think you call _free + free for the same component for:

 type t
   integer, allocatable :: x
 end type t
 class(t), allocatable :: y
 ...
 deallocate (y)

 * * *

 Regarding your code: You assume that al->expr points to an allocated
 variable, that's not the always the case - hence, select_type_4.f90 fails.

 * * *

 You always create a _free function; I wonder whether it makes sense to use
 _vtab->free with NULL in case that no _free is needed.

  * * *

 Attached an updated version, which does that all. No guarantee that it works
 correctly, but it should at least fix select_type_4.f90.

 Tobias


Re: Wider modes for partial int modes

2012-06-11 Thread Bernd Schmidt

On 06/09/2012 01:58 AM, Mike Stump wrote:

On Jun 8, 2012, at 4:11 PM, Mike Stump wrote:

On Apr 17, 2012, at 2:08 PM, Bernd Schmidt wrote:

This patch enables GET_MODE_WIDER_MODE for MODE_PARTIAL_INT (by setting
the wider mode to the one the partial mode is based on), which is useful
for the port I'm working on: I can avoid defining operations on the
partial modes.


I think this breaks my port, in emit-rtl.c:

B   for (mode = GET_CLASS_NARROWEST_MODE (MODE_PARTIAL_INT);
 mode != VOIDmode;
 mode = GET_MODE_WIDER_MODE (mode))
  =   const_tiny_rtx[i][(int) mode] = GEN_INT (i);

Doesn't work, as that expects to step through all partial int modes.  The 
problem is, we went from P1QI -> P1SI, and now we go from P1QI to QI.  The 
problem is this, we never initialize P1SI anymore, and later, when initializing 
vector constants, things like V2P1SI dies, as P1SI was never built:



So, maybe we want to go through all modes, and just ask, are you a 
MODE_PARTIAL_INT, and initialize them that way?  Thoughts?


The below patch allows my port to at least initialize once more...  Thoughts?  
Ok?


Looks good to me.


Bernd


Re: [Fortran, DRAFT patch] PR 46321 - [OOP] Polymorphic deallocation

2012-06-11 Thread Tobias Burnus

On 06/11/2012 11:24 AM, Alessandro Fanfarillo wrote:

gfortran.dg/coarray/poly_run_3.f90


That one fails because I for forgot that se.expr in gfc_trans_deallocate 
contains the descriptor and not the pointer to the data. That's fixed by:


  tmp = se.expr;
  if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (tmp)))
{
  tmp = gfc_conv_descriptor_data_get (tmp);
  STRIP_NOPS (tmp);
}
  tmp =  fold_build2_loc (input_location, NE_EXPR, 
boolean_type_node,

  tmp, build_int_cst (TREE_TYPE (tmp), 0));

However, it still fails for the

type t
  integer, allocatable :: comp
end type t
contains
  subroutine foo(x)
class(t), allocatable, intent(out) :: x(:)
  end subroutine
end

(The intent(out) causes automatic deallocation.) The backtrace does not 
really point to some code which the patch touched; it shouldn't be 
affected by the class.c changes and gfc_trans_deallocate does not seem 
to be entered.


While I do not immediately see why it fails, I wonder whether it is due 
to the removed else if ... BT_CLASS) case in 
gfc_deallocate_scalar_with_status. In any case, the change to 
gfc_trans_deallocate might be also needed for 
gfc_deallocate_scalar_with_status. At least, automatic deallocation 
(with intent(out) or when leaving the scope) does not seem to go through 
gfc_trans_deallocate but only through gfc_deallocate_scalar_with_status.


Tobias


Patch ping

2012-06-11 Thread Jakub Jelinek
Hi!

http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00105.html
  - add warning for complit with incomplete type

Jakub


Re: [PATCH] Hoist adjacent pointer loads

2012-06-11 Thread Richard Guenther
On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
wschm...@linux.vnet.ibm.com wrote:
 Hi Richard,

 Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
 delay since the last revision, but my performance testing has been
 blocked waiting for a fix to PR53487.  I ended up applying a test
 version of the patch to 4.7 and ran performance numbers with that
 instead, with no degradations.

 In addition to addressing your comments, this patch contains one bug fix
 where local_mem_dependence was called on the wrong blocks after swapping
 def1 and def2.

 Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
 this version ok for trunk?  I won't commit it until I can do final
 testing on trunk in conjunction with a fix for PR53487.

 Thanks,
 Bill


 2012-06-04  Bill Schmidt  wschm...@linux.vnet.ibm.com

        * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
        * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
        declaration.
        (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
        (tree_ssa_phiopt): Call gate_hoist_loads.
        (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
        (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
        hoist_adjacent_loads.
        (local_mem_dependence): New function.
        (hoist_adjacent_loads): Likewise.
        (gate_hoist_loads): Likewise.
        * common.opt (fhoist-adjacent-loads): New switch.
        * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
        * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.


 Index: gcc/opts.c
 ===
 --- gcc/opts.c  (revision 187805)
 +++ gcc/opts.c  (working copy)
 @@ -489,6 +489,7 @@ static const struct default_options default_option
     { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
 +    { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },

     /* -O3 optimizations.  */
     { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
 Index: gcc/tree-ssa-phiopt.c
 ===
 --- gcc/tree-ssa-phiopt.c       (revision 187805)
 +++ gcc/tree-ssa-phiopt.c       (working copy)
 @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
  #include cfgloop.h
  #include tree-data-ref.h
  #include tree-pretty-print.h
 +#include gimple-pretty-print.h
 +#include insn-config.h
 +#include expr.h
 +#include optabs.h

 +#ifndef HAVE_conditional_move
 +#define HAVE_conditional_move (0)
 +#endif
 +
  static unsigned int tree_ssa_phiopt (void);
 -static unsigned int tree_ssa_phiopt_worker (bool);
 +static unsigned int tree_ssa_phiopt_worker (bool, bool);
  static bool conditional_replacement (basic_block, basic_block,
                                     edge, edge, gimple, tree, tree);
  static int value_replacement (basic_block, basic_block,
 @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
  static bool cond_if_else_store_replacement (basic_block, basic_block, 
 basic_block);
  static struct pointer_set_t * get_non_trapping (void);
  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
 +static void hoist_adjacent_loads (basic_block, basic_block,
 +                                 basic_block, basic_block);
 +static bool gate_hoist_loads (void);

  /* This pass tries to replaces an if-then-else block with an
    assignment.  We have four kinds of transformations.  Some of these
 @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
      bb2:
        x = PHI x' (bb0), ...;

 -   A similar transformation is done for MAX_EXPR.  */
 +   A similar transformation is done for MAX_EXPR.

 +
 +   This pass also performs a fifth transformation of a slightly different
 +   flavor.
 +
 +   Adjacent Load Hoisting
 +   --
 +
 +   This transformation replaces
 +
 +     bb0:
 +       if (...) goto bb2; else goto bb1;
 +     bb1:
 +       x1 = (expr).field1;
 +       goto bb3;
 +     bb2:
 +       x2 = (expr).field2;
 +     bb3:
 +       # x = PHI x1, x2;
 +
 +   with
 +
 +     bb0:
 +       x1 = (expr).field1;
 +       x2 = (expr).field2;
 +       if (...) goto bb2; else goto bb1;
 +     bb1:
 +       goto bb3;
 +     bb2:
 +     bb3:
 +       # x = PHI x1, x2;
 +
 +   The purpose of this transformation is to enable generation of conditional
 +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
 +   the loads is speculative, the transformation is restricted to very
 +   specific cases to avoid introducing a page fault.  We are looking for
 +   the common idiom:
 +
 +     if (...)
 +       x = y-left;
 +     else
 +       x = y-right;
 +
 +   where left and right are typically adjacent pointers in a tree structure. 
  */
 +
  static 

Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread Richard Guenther
On Fri, 8 Jun 2012, William J. Schmidt wrote:

 This patch adds a heuristic to the vectorizer when estimating the
 minimum profitable number of iterations.  The heuristic is
 target-dependent, and is currently disabled for all targets except
 PowerPC.  However, the intent is to make it general enough to be useful
 for other targets that want to opt in.
 
 A previous patch addressed some PowerPC SPEC degradations by modifying
 the vector cost model values for vec_perm and vec_promote_demote.  The
 values were set a little higher than their natural values because the
 natural values were not sufficient to prevent a poor vectorization
 choice.  However, this is not the right long-term solution, since it can
 unnecessarily constrain other vectorization choices involving permute
 instructions.
 
 Analysis of the badly vectorized loop (in sphinx3) showed that the
 problem was overcommitment of vector resources -- too many vector
 instructions issued without enough non-vector instructions available to
 cover the delays.  The vector cost model assumes that instructions
 always have a constant cost, and doesn't have a way of judging this kind
 of density of vector instructions.
 
 The present patch adds a heuristic to recognize when a loop is likely to
 overcommit resources, and adds a small penalty to the inside-loop cost
 to account for the expected stalls.  The heuristic is parameterized with
 three target-specific values:
 
  * Density threshold: The heuristic will apply only when the
percentage of inside-loop cost attributable to vectorized
instructions exceeds this value.
 
  * Size threshold: The heuristic will apply only when the
inside-loop cost exceeds this value.
 
  * Penalty: The inside-loop cost will be increased by this
percentage value when the heuristic applies.
 
 Thus only reasonably large loop bodies that are mostly vectorized
 instructions will be affected.
 
 By applying only a small percentage bump to the inside-loop cost, the
 heuristic will only turn off vectorization for loops that were
 considered barely profitable to begin with (such as the sphinx3 loop).
 So the heuristic is quite conservative and should not affect the vast
 majority of vectorization decisions.
 
 Together with the new heuristic, this patch reduces the vec_perm and
 vec_promote_demote costs for PowerPC to their natural values.
 
 I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu
 and verified that no performance regressions occur on SPEC cpu2006.  Is
 this ok for trunk?

Hmm.  I don't like this patch or its general idea too much.  Instead
I'd like us to move more of the cost model detail to the target, giving
it a chance to look at the whole loop before deciding on a cost.  ISTR
posting the overall idea at some point, but let me repeat it here instead
of trying to find that e-mail.

The basic interface of the cost model should be, in targetm.vectorize

  /* Tell the target to start cost analysis of a loop or a basic-block
 (if the loop argument is NULL).  Returns an opaque pointer to
 target-private data.  */
  void *init_cost (struct loop *loop);

  /* Add cost for N vectorized-stmt-kind statements in vector_mode.  */
  void add_stmt_cost (void *data, unsigned n,
  vectorized-stmt-kind,
  enum machine_mode vector_mode);

  /* Tell the target to compute and return the cost of the accumulated
 statements and free any target-private data.  */
  unsigned finish_cost (void *data);

with eventually slightly different signatures for add_stmt_cost
(like pass in the original scalar stmt?).

It allows the target, at finish_cost time, to evaluate things like
register pressure and resource utilization.

Thanks,
Richard.

 Thanks,
 Bill
 
 
 2012-06-08  Bill Schmidt  wschm...@linux.ibm.com
 
   * doc/tm.texi.in: Add vectorization density hooks.
   * doc/tm.texi: Regenerate.
   * targhooks.c (default_density_pct_threshold): New.
   (default_density_size_threshold): New.
   (default_density_penalty): New.
   * targhooks.h: New decls for new targhooks.c functions.
   * target.def (density_pct_threshold): New DEF_HOOK.
   (density_size_threshold): Likewise.
   (density_penalty): Likewise.
   * tree-vect-loop.c (accum_stmt_cost): New.
   (vect_estimate_min_profitable_iters): Perform density test.
   * config/rs6000/rs6000.c (TARGET_VECTORIZE_DENSITY_PCT_THRESHOLD):
   New macro definition.
   (TARGET_VECTORIZE_DENSITY_SIZE_THRESHOLD): Likewise.
   (TARGET_VECTORIZE_DENSITY_PENALTY): Likewise.
   (rs6000_builtin_vectorization_cost): Reduce costs of vec_perm and
   vec_promote_demote to correct values.
   (rs6000_density_pct_threshold): New.
   (rs6000_density_size_threshold): New.
   (rs6000_density_penalty): New.
 
 
 Index: gcc/doc/tm.texi
 ===
 --- gcc/doc/tm.texi   (revision 188305)
 +++ 

[Ping] Fix gcc/gcov.c and libgcc/libgcov.c to fix build on VxWorks

2012-06-11 Thread rbmj

Hi everyone,

Ping RE: http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00086.html

Thanks,

Robert Mason

On 06/01/2012 02:43 PM, rbmj wrote:

On 06/01/2012 02:40 PM, rbmj wrote:

Hi everyone,

These fixes are to allow building on vxWorks.  Currently there are 
two issues:


1.  VxWorks does not have a variadic open - it must receive three 
arguments.  gcc/gcov.c however opens a file for reading and does not 
pass in a mode argument, which causes an error on vxWorks.  This just 
adds a platform-based ifdef around this.  I am aware that this is 
against coding conventions, and if that is an issue, I can think of 
two resolutions.  One, an autoconf macro to check for a non-variadic 
open, or two, simply pass the extra mode argument in unconditionally, 
as it should be transparent to the function and ignored if it is 
variadic (I'm no expert on calling conventions though).


2.  VxWorks has a two argument mkdir().  libgcc/libgcov.c uses 
mkdir() and calls it with two arguments if TARGET_POSIX_IO is defined 
and only one argument otherwise.  I don't know what TARGET_POSIX_IO 
is, but if it's anything more than just mkdir(), this is probably 
correct, as vxworks is only partially POSIX compliant.  Again, 
another platform-based ifdef, so if that is anathema, it can be 
replaced by more autoconf hackery.


The patch as-is compiles targeting on vxworks and bootstraps on a 
native x86_64-linux-gnu build (which makes sense, since it doesn't 
touch anything for non-vxworks).


Gerald Pfeifer has volunteered to apply the patch if approved.

Also, in an earlier message [1] Janne Blomqvist mentioned that newer 
versions of VxWorks do have the variadic open(), and this is true.  
However, as far as I know, this version is still not available for 
kernel modules, only real-time processes.


Thanks,

Robert Mason


Sorry, forgot to add in the link.
[1]: http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01102.html





Re: [Patch] Add AC_ARG_ENABLE for libstdc++-v3

2012-06-11 Thread rbmj

On 05/22/2012 04:37 PM, rbmj wrote:
This patch adds an AC_ARG_ENABLE option to build/not build 
libstdc++-v3.  I wrote the patch in order to allow the user to 
override the automatic disable for libstdc++-v3 for certain targets.

Ping^2 on http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01525.html

Robert Mason


Re: [Ping] Fix gcc/gcov.c and libgcc/libgcov.c to fix build on VxWorks

2012-06-11 Thread Paolo Bonzini
Il 11/06/2012 13:56, rbmj ha scritto:
 1.  VxWorks does not have a variadic open - it must receive three
 arguments.  gcc/gcov.c however opens a file for reading and does not
 pass in a mode argument, which causes an error on vxWorks.  This just
 adds a platform-based ifdef around this.  I am aware that this is
 against coding conventions, and if that is an issue, I can think of
 two resolutions.  [...] simply pass the extra mode argument in 
 unconditionally,
 as it should be transparent to the function and ignored if it is
 variadic (I'm no expert on calling conventions though).

Yes, please do this.

 2.  VxWorks has a two argument mkdir().  libgcc/libgcov.c uses
 mkdir() and calls it with two arguments if TARGET_POSIX_IO is defined
 and only one argument otherwise.  I don't know what TARGET_POSIX_IO
 is, but if it's anything more than just mkdir(), this is probably
 correct, as vxworks is only partially POSIX compliant.  Again,
 another platform-based ifdef, so if that is anathema, it can be
 replaced by more autoconf hackery.

VxWorks should define TARGET_POSIX_IO if it has both access and mkdir.
Please add it to gcc/config/vxworks.h if this is the case.

Paolo


Re: [Patch] Add AC_ARG_ENABLE for libstdc++-v3

2012-06-11 Thread Paolo Bonzini
Il 11/06/2012 13:59, rbmj ha scritto:
 On 05/22/2012 04:37 PM, rbmj wrote:
 This patch adds an AC_ARG_ENABLE option to build/not build
 libstdc++-v3.  I wrote the patch in order to allow the user to
 override the automatic disable for libstdc++-v3 for certain targets.
 Ping^2 on http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01525.html

Ok.

Paolo




[PATCH][C++] Fix PR53605

2012-06-11 Thread Richard Guenther

When I changed the empty-array domain to use a signed sizetype [0, -1]
domain, mangling of an empty-array-type broke because mangling adds
an unsigned one to the signed -1, which causes an ICE (I chose to
do that instead of shifting the range to [1, 0] because much more
code relies on a zero lower bound ...).

The following fixes that by using double-ints for the arithmetic
and thus also does not generate a not needed tree node.

Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?

Thanks,
Richard.

2012-06-11  Richard Guenther  rguent...@suse.de

PR c++/53616
* mangle.c (write_array_type): Use double-ints for array domain
arithmetic.

* g++.dg/ext/pr53605.C: New testcase.

Index: gcc/cp/mangle.c
===
--- gcc/cp/mangle.c (revision 188384)
+++ gcc/cp/mangle.c (working copy)
@@ -3119,8 +3119,10 @@ write_array_type (const tree type)
{
  /* The ABI specifies that we should mangle the number of
 elements in the array, not the largest allowed index.  */
- max = size_binop (PLUS_EXPR, max, size_one_node);
- write_unsigned_number (tree_low_cst (max, 1));
+ double_int dmax
+   = double_int_add (tree_to_double_int (max), double_int_one);
+ gcc_assert (double_int_fits_in_uhwi_p (dmax));
+ write_unsigned_number (dmax.low);
}
   else
{
Index: gcc/testsuite/g++.dg/ext/pr53605.C
===
--- gcc/testsuite/g++.dg/ext/pr53605.C  (revision 0)
+++ gcc/testsuite/g++.dg/ext/pr53605.C  (revision 0)
@@ -0,0 +1,16 @@
+// { dg-do compile }
+
+// Avoid -pedantic-error default
+// { dg-options  }
+
+template <bool lhs_is_null_literal>
+class EqHelper {
+public:
+template <typename T1, typename T2>
+   static int  Compare( const T1& expected,
+                const T2& actual);
+};
+void foo(){
+static const int kData[] = {};
+::EqHelper<false>::Compare(kData, "abc");
+}


Re: [PATCH] Hoist adjacent pointer loads

2012-06-11 Thread William J. Schmidt
On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote:
 On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
 wschm...@linux.vnet.ibm.com wrote:
  Hi Richard,
 
  Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
  delay since the last revision, but my performance testing has been
  blocked waiting for a fix to PR53487.  I ended up applying a test
  version of the patch to 4.7 and ran performance numbers with that
  instead, with no degradations.
 
  In addition to addressing your comments, this patch contains one bug fix
  where local_mem_dependence was called on the wrong blocks after swapping
  def1 and def2.
 
  Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
  this version ok for trunk?  I won't commit it until I can do final
  testing on trunk in conjunction with a fix for PR53487.
 
  Thanks,
  Bill
 
 
  2012-06-04  Bill Schmidt  wschm...@linux.vnet.ibm.com
 
 * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
 * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
 declaration.
 (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
 (tree_ssa_phiopt): Call gate_hoist_loads.
 (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
 (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
 hoist_adjacent_loads.
 (local_mem_dependence): New function.
 (hoist_adjacent_loads): Likewise.
 (gate_hoist_loads): Likewise.
 * common.opt (fhoist-adjacent-loads): New switch.
 * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
 * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
 
 
  Index: gcc/opts.c
  ===
  --- gcc/opts.c  (revision 187805)
  +++ gcc/opts.c  (working copy)
  @@ -489,6 +489,7 @@ static const struct default_options default_option
  { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
  { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
  { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
  +{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
 
  /* -O3 optimizations.  */
  { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
  Index: gcc/tree-ssa-phiopt.c
  ===
  --- gcc/tree-ssa-phiopt.c   (revision 187805)
  +++ gcc/tree-ssa-phiopt.c   (working copy)
  @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
   #include cfgloop.h
   #include tree-data-ref.h
   #include tree-pretty-print.h
  +#include gimple-pretty-print.h
  +#include insn-config.h
  +#include expr.h
  +#include optabs.h
 
  +#ifndef HAVE_conditional_move
  +#define HAVE_conditional_move (0)
  +#endif
  +
   static unsigned int tree_ssa_phiopt (void);
  -static unsigned int tree_ssa_phiopt_worker (bool);
  +static unsigned int tree_ssa_phiopt_worker (bool, bool);
   static bool conditional_replacement (basic_block, basic_block,
  edge, edge, gimple, tree, tree);
   static int value_replacement (basic_block, basic_block,
  @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
   static bool cond_if_else_store_replacement (basic_block, basic_block, 
  basic_block);
   static struct pointer_set_t * get_non_trapping (void);
   static void replace_phi_edge_with_variable (basic_block, edge, gimple, 
  tree);
  +static void hoist_adjacent_loads (basic_block, basic_block,
  + basic_block, basic_block);
  +static bool gate_hoist_loads (void);
 
   /* This pass tries to replaces an if-then-else block with an
 assignment.  We have four kinds of transformations.  Some of these
  @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
   bb2:
 x = PHI x' (bb0), ...;
 
  -   A similar transformation is done for MAX_EXPR.  */
  +   A similar transformation is done for MAX_EXPR.
 
  +
  +   This pass also performs a fifth transformation of a slightly different
  +   flavor.
  +
  +   Adjacent Load Hoisting
  +   --
  +
  +   This transformation replaces
  +
  + bb0:
  +   if (...) goto bb2; else goto bb1;
  + bb1:
  +   x1 = (expr).field1;
  +   goto bb3;
  + bb2:
  +   x2 = (expr).field2;
  + bb3:
  +   # x = PHI x1, x2;
  +
  +   with
  +
  + bb0:
  +   x1 = (expr).field1;
  +   x2 = (expr).field2;
  +   if (...) goto bb2; else goto bb1;
  + bb1:
  +   goto bb3;
  + bb2:
  + bb3:
  +   # x = PHI x1, x2;
  +
  +   The purpose of this transformation is to enable generation of 
  conditional
  +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
  +   the loads is speculative, the transformation is restricted to very
  +   specific cases to avoid introducing a page fault.  We are looking for
  + 

Re: [PATCH] Hoist adjacent pointer loads

2012-06-11 Thread Richard Guenther
On Mon, 11 Jun 2012, William J. Schmidt wrote:

 On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote:
  On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
  wschm...@linux.vnet.ibm.com wrote:
   Hi Richard,
  
   Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
   delay since the last revision, but my performance testing has been
   blocked waiting for a fix to PR53487.  I ended up applying a test
   version of the patch to 4.7 and ran performance numbers with that
   instead, with no degradations.
  
   In addition to addressing your comments, this patch contains one bug fix
   where local_mem_dependence was called on the wrong blocks after swapping
   def1 and def2.
  
   Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
   this version ok for trunk?  I won't commit it until I can do final
   testing on trunk in conjunction with a fix for PR53487.
  
   Thanks,
   Bill
  
  
   2012-06-04  Bill Schmidt  wschm...@linux.vnet.ibm.com
  
  * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
  * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to 
   forward
  declaration.
  (hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
  (tree_ssa_phiopt): Call gate_hoist_loads.
  (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
  (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; 
   call
  hoist_adjacent_loads.
  (local_mem_dependence): New function.
  (hoist_adjacent_loads): Likewise.
  (gate_hoist_loads): Likewise.
  * common.opt (fhoist-adjacent-loads): New switch.
  * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
  * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
  
  
   Index: gcc/opts.c
   ===
   --- gcc/opts.c  (revision 187805)
   +++ gcc/opts.c  (working copy)
   @@ -489,6 +489,7 @@ static const struct default_options default_option
   { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
   { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
   { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
   +{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
  
   /* -O3 optimizations.  */
   { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
   Index: gcc/tree-ssa-phiopt.c
   ===
   --- gcc/tree-ssa-phiopt.c   (revision 187805)
   +++ gcc/tree-ssa-phiopt.c   (working copy)
   @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
#include cfgloop.h
#include tree-data-ref.h
#include tree-pretty-print.h
   +#include gimple-pretty-print.h
   +#include insn-config.h
   +#include expr.h
   +#include optabs.h
  
   +#ifndef HAVE_conditional_move
   +#define HAVE_conditional_move (0)
   +#endif
   +
static unsigned int tree_ssa_phiopt (void);
   -static unsigned int tree_ssa_phiopt_worker (bool);
   +static unsigned int tree_ssa_phiopt_worker (bool, bool);
static bool conditional_replacement (basic_block, basic_block,
   edge, edge, gimple, tree, tree);
static int value_replacement (basic_block, basic_block,
   @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
static bool cond_if_else_store_replacement (basic_block, basic_block, 
   basic_block);
static struct pointer_set_t * get_non_trapping (void);
static void replace_phi_edge_with_variable (basic_block, edge, gimple, 
   tree);
   +static void hoist_adjacent_loads (basic_block, basic_block,
   + basic_block, basic_block);
   +static bool gate_hoist_loads (void);
  
/* This pass tries to replaces an if-then-else block with an
  assignment.  We have four kinds of transformations.  Some of these
   @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
bb2:
  x = PHI x' (bb0), ...;
  
   -   A similar transformation is done for MAX_EXPR.  */
   +   A similar transformation is done for MAX_EXPR.
  
   +
   +   This pass also performs a fifth transformation of a slightly different
   +   flavor.
   +
   +   Adjacent Load Hoisting
   +   --
   +
   +   This transformation replaces
   +
   + bb0:
   +   if (...) goto bb2; else goto bb1;
   + bb1:
   +   x1 = (expr).field1;
   +   goto bb3;
   + bb2:
   +   x2 = (expr).field2;
   + bb3:
   +   # x = PHI x1, x2;
   +
   +   with
   +
   + bb0:
   +   x1 = (expr).field1;
   +   x2 = (expr).field2;
   +   if (...) goto bb2; else goto bb1;
   + bb1:
   +   goto bb3;
   + bb2:
   + bb3:
   +   # x = PHI x1, x2;
   +
   +   The purpose of this transformation is to enable generation of 
   conditional
   +   move instructions such as Intel CMOVE 

[RFC] Target-specific limits on vector alignment

2012-06-11 Thread Richard Earnshaw
The ARM ABI states that vectors larger than 64 bits in size still have
64-bit alignment; nevertheless, the HW supports alignment hints of up
to 128 bits in some cases and will trap if a vector has an alignment
that is less than the hint.  GCC currently hard-codes larger vectors to be
aligned by the size of the vector, which means that 128-bit vectors are
marked as being 128-bit aligned.

The ARM ABI unfortunately does not support generating such alignment for
parameters passed by value and this can lead to traps at run time.  It
seems that the best way to solve this problem is to allow the back-end
to set an upper limit on the alignment permitted for a vector.

I've implemented this as a separate hook, rather than using the existing
hooks because there's a strong likelihood of breaking some existing ABIs
if I did it another way.

There are a couple of tests that will need some re-working before this
can be committed to deal with the fall-out of making this change; I'll
prepare those changes if this patch is deemed generally acceptable.

R.

* target.def (TARGET_VECTOR_ALIGNMENT): New hook.
* doc/tm.texi.in (TARGET_VECTOR_ALIGNMENT): Likewise.
* doc/tm.texi: Regenerate.
* targhooks.c (default_vector_alignment): New function.
* arm.c (arm_vector_alignment): New function.
(TARGET_VECTOR_ALIGNMENT): Define.--- config/arm/arm.c(revision 187425)
+++ config/arm/arm.c(local)
@@ -259,6 +259,8 @@ static bool arm_array_mode_supported_p (
unsigned HOST_WIDE_INT);
 static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
 static bool arm_class_likely_spilled_p (reg_class_t);
+static HOST_WIDE_INT arm_vector_alignment (const_tree type,
+  HOST_WIDE_INT align);
 static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
 static bool arm_builtin_support_vector_misalignment (enum machine_mode mode,
 const_tree type,
@@ -607,6 +609,9 @@ static const struct attribute_spec arm_a
 #undef TARGET_CLASS_LIKELY_SPILLED_P
 #define TARGET_CLASS_LIKELY_SPILLED_P arm_class_likely_spilled_p
 
+#undef TARGET_VECTOR_ALIGNMENT
+#define TARGET_VECTOR_ALIGNMENT arm_vector_alignment
+
 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
   arm_vector_alignment_reachable
@@ -24850,6 +25292,15 @@ arm_have_conditional_execution (void)
   return !TARGET_THUMB1;
 }
 
+/* The AAPCS sets the maximum alignment of a vector to 64 bits.  */
+static HOST_WIDE_INT
+arm_vector_alignment (const_tree type ATTRIBUTE_UNUSED, HOST_WIDE_INT align)
+{
+  if (TARGET_AAPCS_BASED)
+return MIN (align, 64);
+  return align;
+}
+
 static unsigned int
 arm_autovectorize_vector_sizes (void)
 {
--- doc/tm.texi (revision 187425)
+++ doc/tm.texi (local)
@@ -1099,6 +1099,12 @@ make it all fit in fewer cache lines.
 If the value of this macro has a type, it should be an unsigned type.
 @end defmac
 
+@deftypefn {Target Hook} HOST_WIDE_INT TARGET_VECTOR_ALIGNMENT (const_tree 
@var{type}, HOST_WIDE_INT @var{align})
+This hook can be used to limit the alignment for a vector of type
+@var{type}, in order to comply with a platform ABI.  The default is to
+return @var{align}.
+@end deftypefn
+
 @defmac STACK_SLOT_ALIGNMENT (@var{type}, @var{mode}, @var{basic-align})
 If defined, a C expression to compute the alignment for stack slot.
 @var{type} is the data type, @var{mode} is the widest mode available,
--- doc/tm.texi.in  (revision 187425)
+++ doc/tm.texi.in  (local)
@@ -1087,6 +1087,8 @@ make it all fit in fewer cache lines.
 If the value of this macro has a type, it should be an unsigned type.
 @end defmac
 
+@hook TARGET_VECTOR_ALIGNMENT
+
 @defmac STACK_SLOT_ALIGNMENT (@var{type}, @var{mode}, @var{basic-align})
 If defined, a C expression to compute the alignment for stack slot.
 @var{type} is the data type, @var{mode} is the widest mode available,
--- target.def  (revision 187425)
+++ target.def  (local)
@@ -1615,6 +1615,14 @@ DEFHOOK
  bool, (enum machine_mode mode),
  hook_bool_mode_false)
 
+DEFHOOK
+(vector_alignment,
+ This hook can be used to limit the alignment for a vector of type\n\
+@var{type}, in order to comply with a platform ABI.  The default is to\n\
+return @var{align}.,
+ HOST_WIDE_INT, (const_tree type, HOST_WIDE_INT align),
+ default_vector_alignment)
+
 /* True if we should try to use a scalar mode to represent an array,
overriding the usual MAX_FIXED_MODE limit.  */
 DEFHOOK
--- targhooks.c (revision 187425)
+++ targhooks.c (local)
@@ -939,6 +939,13 @@ tree default_mangle_decl_assembler_name 
return id;
 }
 
+HOST_WIDE_INT
+default_vector_alignment (const_tree type ATTRIBUTE_UNUSED,
+ HOST_WIDE_INT align)
+{
+  return align;
+}
+
 bool
 default_builtin_vector_alignment_reachable (const_tree type, bool is_packed)
 {

Re: [PATCH][C++] Fix PR53605

2012-06-11 Thread Jason Merrill

OK.

Jason


Re: [PATCH] Correct cost model for strided loads

2012-06-11 Thread William J. Schmidt


On Mon, 2012-06-11 at 11:15 +0200, Richard Guenther wrote:
 On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt
 wschm...@linux.vnet.ibm.com wrote:
  The fix for PR53331 caused a degradation to 187.facerec on
  powerpc64-unknown-linux-gnu.  The following simple patch reverses the
  degradation without otherwise affecting SPEC cpu2000 or cpu2006.
  Bootstrapped and regtested on that platform with no new regressions.  Ok
  for trunk?
 
 Well, would the real cost not be subparts * scalar_to_vec plus
 subparts * vec_perm?
 At least vec_perm isn't the cost for building up a vector from N scalar 
 elements
 either (it might be close enough if subparts == 2).  What's the case
 with facerec
 here?  Does it have subparts == 2?  

In this case, subparts == 4 (32-bit floats, 128-bit vec reg).  On
PowerPC, this requires two merge instructions and a permute instruction
to get the four 32-bit quantities into the right place in a 128-bit
register.  Currently this is modeled as a vec_perm in other parts of the
vectorizer per Ira's earlier patches, so I naively changed this to do
the same thing.

The types of vectorizer instructions aren't documented, and I can't
infer much from the i386.c cost model, so I need a little education.
What semantics are represented by scalar_to_vec?

On PowerPC, we have this mapping of the floating-point registers and
vector float registers where they overlap (the low-order half of each of
the first 32 vector float regs corresponds to a scalar float reg).  So
in this case we have four scalar loads that place things in the bottom
half of four vector registers, two vector merge instructions that
collapse the four registers into two vector registers, and a vector
permute that gets things in the right order.(*)  I wonder if what we
refer to as a merge instruction is similar to scalar_to_vec.

If so, then maybe we need something like

 subparts = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
 inside_cost += vect_get_stmt_cost (scalar_load) * ncopies * subparts;
 inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec) * subparts / 2;
 inside_cost += ncopies * vect_get_stmt_cost (vec_perm);

But then we'd have to change how vec_perm is modeled elsewhere for
PowerPC based on Ira's earlier patches.  As I said, it's difficult for
me to figure out all the intent of cost model decisions that have been
made in the past, using current documentation.

 I really wanted to pessimize this case
 for say AVX and char elements, thus building up a vector from 32 scalars which
 certainly does not cost a mere vec_perm.  So, maybe special-case the
 subparts == 2 case and assume vec_perm would match the cost only in that
 case.

(I'm a little confused by this as what you have at the moment is a
single scalar_to_vec per copy, which has a cost of 1 on most i386
targets (occasionally 2).  The subparts multiplier is only applied to
the loads.  So changing this to vec_perm seemed to be a no-op for i386.)

(*) There are actually a couple more instructions here to convert 64-bit
values to 32-bit values, since on PowerPC 32-bit loads are converted to
64-bit values in scalar float registers and they have to be coerced back
to 32-bit.  Very ugly.  The cost model currently doesn't represent this
at all, which I'll have to look at fixing at some point in some way that
isn't too nasty for the other targets.  The cost model for PowerPC seems
to need a lot of TLC.

Thanks,
Bill

 
 Thanks,
 Richard.
 
  Thanks,
  Bill
 
 
  2012-06-10  Bill Schmidt  wschm...@linux.ibm.com
 
 * tree-vect-stmts.c (vect_model_load_cost):  Change cost model
 for strided loads.
 
 
  Index: gcc/tree-vect-stmts.c
  ===
  --- gcc/tree-vect-stmts.c   (revision 188341)
  +++ gcc/tree-vect-stmts.c   (working copy)
  @@ -1031,11 +1031,10 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
/* The loads themselves.  */
if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
  {
  -  /* N scalar loads plus gathering them into a vector.
  - ???  scalar_to_vec isn't the cost for that.  */
  +  /* N scalar loads plus gathering them into a vector.  */
inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies
   * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE 
  (stmt_info)));
  -  inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec);
  +  inside_cost += ncopies * vect_get_stmt_cost (vec_perm);
  }
else
  vect_get_load_cost (first_dr, ncopies,
 
 
 



Re: [PATCH] Correct cost model for strided loads

2012-06-11 Thread Richard Guenther
On Mon, 11 Jun 2012, William J. Schmidt wrote:

 
 
 On Mon, 2012-06-11 at 11:15 +0200, Richard Guenther wrote:
  On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt
  wschm...@linux.vnet.ibm.com wrote:
   The fix for PR53331 caused a degradation to 187.facerec on
   powerpc64-unknown-linux-gnu.  The following simple patch reverses the
   degradation without otherwise affecting SPEC cpu2000 or cpu2006.
   Bootstrapped and regtested on that platform with no new regressions.  Ok
   for trunk?
  
  Well, would the real cost not be subparts * scalar_to_vec plus
  subparts * vec_perm?
  At least vec_perm isn't the cost for building up a vector from N scalar 
  elements
  either (it might be close enough if subparts == 2).  What's the case
  with facerec
  here?  Does it have subparts == 2?  
 
 In this case, subparts == 4 (32-bit floats, 128-bit vec reg).  On
 PowerPC, this requires two merge instructions and a permute instruction
 to get the four 32-bit quantities into the right place in a 128-bit
 register.  Currently this is modeled as a vec_perm in other parts of the
 vectorizer per Ira's earlier patches, so I naively changed this to do
 the same thing.

I see.

 The types of vectorizer instructions aren't documented, and I can't
 infer much from the i386.c cost model, so I need a little education.
 What semantics are represented by scalar_to_vec?

It's a vector splat, thus x -> { x, x, x, ... }.  You can create
{ x, y, z, ... } by N such splats plus N - 1 permutes (if a permute,
as VEC_PERM_EXPR, takes two input vectors).  That's by far not
the most efficient way to build up such a vector of course (with AVX
you could do one splat plus N - 1 inserts for example).  The cost
is of course dependent on the number of vector elements, so a
simple new enum vect_cost_for_stmt kind does not cover it but
the target would have to look at the vector type passed and do
some reasonable guess.

 On PowerPC, we have this mapping of the floating-point registers and
 vector float registers where they overlap (the low-order half of each of
 the first 32 vector float regs corresponds to a scalar float reg).  So
 in this case we have four scalar loads that place things in the bottom
 half of four vector registers, two vector merge instructions that
 collapse the four registers into two vector registers, and a vector
 permute that gets things in the right order.(*)  I wonder if what we
 refer to as a merge instruction is similar to scalar_to_vec.

Looks similar to x86 SSE then.

 If so, then maybe we need something like
 
  subparts = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  inside_cost += vect_get_stmt_cost (scalar_load) * ncopies * subparts;
  inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec) * subparts / 
 2;
  inside_cost += ncopies * vect_get_stmt_cost (vec_perm);
 
 But then we'd have to change how vec_perm is modeled elsewhere for
 PowerPC based on Ira's earlier patches.  As I said, it's difficult for
 me to figure out all the intent of cost model decisions that have been
 made in the past, using current documentation.

Heh, usually the intent was to make the changes simple, not to compute
a proper cost.

I think we simply need a new scalars_to_vec cost kind.

  I really wanted to pessimize this case
  for say AVX and char elements, thus building up a vector from 32 scalars 
  which
  certainly does not cost a mere vec_perm.  So, maybe special-case the
  subparts == 2 case and assume vec_perm would match the cost only in that
  case.
 
 (I'm a little confused by this as what you have at the moment is a
 single scalar_to_vec per copy, which has a cost of 1 on most i386
 targets (occasionally 2).  The subparts multiplier is only applied to
 the loads.  So changing this to vec_perm seemed to be a no-op for i386.)

Oh, I somehow read the patch as you were removing the multiplication
by TYPE_VECTOR_SUBPARTS.  But yes, the cost is way off and I'd wanted
to reflect it with N scalar loads plus N splats plus N - 1 permutes
originally.  You could also model it with N scalar loads plus N
inserts (but we don't have a vec_insert cost either).  I think adding
a scalars_to_vec or vec_init or however we want to call it - basically
what the cost of a vector CONSTRUCTOR would be - is best.

 (*) There are actually a couple more instructions here to convert 64-bit
 values to 32-bit values, since on PowerPC 32-bit loads are converted to
 64-bit values in scalar float registers and they have to be coerced back
 to 32-bit.  Very ugly.  The cost model currently doesn't represent this
 at all, which I'll have to look at fixing at some point in some way that
 isn't too nasty for the other targets.  The cost model for PowerPC seems
 to need a lot of TLC.

Maybe the above would be a possible way to do it.  On x86 for example
a vector of two doubles is extremely cheap for generic SSE2 to construct,
we can directly load into the low/high part, thus when we use it as
N scalar loads plus the vector-build the 

[PATCH] Fix PR53470

2012-06-11 Thread Richard Guenther

This fixes PR53470 - I for quite some time wanted to remove that
conditional clearing of TYPE_CONTEXT from free_lang_data_in_type
but failed to do so because we regress.  I've debugged it down to
the C frontend having sometimes BLOCK as TYPE_CONTEXT for a type.
So, simply replace such BLOCK with the nearest non-BLOCK we can
get at.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2012-06-11  Richard Guenther  rguent...@suse.de

PR middle-end/53470
* tree.c (free_lang_data_in_type): Do not clear TYPE_CONTEXT but
replace it with the first non-BLOCK context.

* g++.dg/lto/pr53470_0.C: New testcase.
* gcc.dg/lto/pr53470_0.c: Likewise.

Index: gcc/tree.c
===
--- gcc/tree.c  (revision 188384)
+++ gcc/tree.c  (working copy)
@@ -4575,11 +4575,17 @@ free_lang_data_in_type (tree type)
   free_lang_data_in_one_sizepos (TYPE_SIZE (type));
   free_lang_data_in_one_sizepos (TYPE_SIZE_UNIT (type));
 
-  if (debug_info_level < DINFO_LEVEL_TERSE
-      || (TYPE_CONTEXT (type)
-	  && TREE_CODE (TYPE_CONTEXT (type)) != FUNCTION_DECL
-	  && TREE_CODE (TYPE_CONTEXT (type)) != NAMESPACE_DECL))
-    TYPE_CONTEXT (type) = NULL_TREE;
+  if (TYPE_CONTEXT (type)
+      && TREE_CODE (TYPE_CONTEXT (type)) == BLOCK)
+{
+  tree ctx = TYPE_CONTEXT (type);
+  do
+   {
+ ctx = BLOCK_SUPERCONTEXT (ctx);
+   }
+  while (ctx && TREE_CODE (ctx) == BLOCK);
+  TYPE_CONTEXT (type) = ctx;
+}
 }
 
 
Index: gcc/testsuite/g++.dg/lto/pr53470_0.C
===
--- gcc/testsuite/g++.dg/lto/pr53470_0.C(revision 0)
+++ gcc/testsuite/g++.dg/lto/pr53470_0.C(revision 0)
@@ -0,0 +1,26 @@
+// { dg-lto-do link }
+// { dg-lto-options { { -g -flto } } }
+
+class sp_counted_base;
+class shared_count {
+  sp_counted_base *pi_;
+public:
+  template<class Y> shared_count(Y) : pi_() {}
+  ~shared_count() {}
+};
+template<class T> struct shared_ptr {
+  T element_type;
+  template<class Y> shared_ptr(Y) : pn(0) {}
+  shared_count pn;
+};
+template<class> class ECGetterBase;
+template<class T> struct ExtensionCord {
+  struct Holder {
+    ECGetterBase<T> *getter_;
+  };
+  ExtensionCord() : holder_(new Holder) {}
+
+  shared_ptr<Holder> holder_;
+};
+ExtensionCord<int> a;
+int main() {}
Index: gcc/testsuite/gcc.dg/lto/pr53470_0.c
===
--- gcc/testsuite/gcc.dg/lto/pr53470_0.c(revision 0)
+++ gcc/testsuite/gcc.dg/lto/pr53470_0.c(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-lto-do link } */
+/* { dg-lto-options { { -flto } { -flto -g } } } */
+
+int main ()
+{
+  { 
+union A { } v; 
+  }
+}


Re: [RFC] Target-specific limits on vector alignment

2012-06-11 Thread Richard Guenther
On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote:
 The ARM ABI states that vectors larger than 64 bits in size still have
 64-bit alignment; never-the-less, the HW supports alignment hints of up
 to 128-bits in some cases and will trap if a vector has an alignment
 that is less than the hint.  GCC currently hard-codes larger vectors to be
 aligned by the size of the vector, which means that 128-bit vectors are
 marked as being 128-bit aligned.

 The ARM ABI unfortunately does not support generating such alignment for
 parameters passed by value and this can lead to traps at run time.  It
 seems that the best way to solve this problem is to allow the back-end
 to set an upper limit on the alignment permitted for a vector.

 I've implemented this as a separate hook, rather than using the existing
 hooks because there's a strong likelihood of breaking some existing ABIs
 if I did it another way.

 There are a couple of tests that will need some re-working before this
 can be committed to deal with the fall-out of making this change; I'll
 prepare those changes if this patch is deemed generally acceptable.

Hm.  Where would you use that target hook?

Richard.

 R.

        * target.def (TARGET_VECTOR_ALIGNMENT): New hook.
        * doc/tm.texi.in (TARGET_VECTOR_ALIGNMENT): Likewise.
        * doc/tm.texi: Regenerate.
        * targhooks.c (default_vector_alignment): New function.
        * arm.c (arm_vector_alignment): New function.
        (TARGET_VECTOR_ALIGNMENT): Define.


RFA: Fix path based disambiguation in RTL oracle

2012-06-11 Thread Michael Matz
Hi,

I haven't yet checked in my patch from last week about not creating 
alias-set conflicts for stack slot sharing mostly due to the caveat I 
mentioned.  Namely that the RTL disambiguator still uses path-based means 
(which belong to type-based aliasing) even in the write_dependence tests.  
That's really wrong, but I couldn't (and I think can't) create a testcase 
using stack slot sharing that would fail with the current unchanged 
compiler.

Luckily, we don't need stack slot sharing to demonstrate the bug.  The 
testcase below will fail on x86_64 with -O2 -fschedule-insns.  I've added 
it to the torture so it has a chance to fail on other targets too (e.g. on 
i686 it doesn't because it doesn't want to do the wrong swap of two 
insns).

I've retained the path based disambiguation in the true_dependence 
variants, where it is okay according to our mem model.

There are some cleanup opportunities in checking what the O(N^2) algorithm 
of nonoverlapping_component_refs_p catches that rtx_refs_may_alias_p 
doesn't.  We know already some cases where MEM_OFFSET_KNOWN_P is set too 
conservatively for the latter to work on some cases.  In any case that 
would be a follow-up.

So, patch fixes testcase and is in regstrap on x86_64-linux.  Okay if that 
passes?


Ciao,
Michael.

* alias.c (nonoverlapping_component_refs_p): Take two rtx arguments.
(nonoverlapping_memrefs_p): Don't call it here ...
(true_dependence_1): ... but here.

testsuite/
* gcc.dg/torture/alias-1.c: New test.

Index: alias.c
===
--- alias.c (revision 188384)
+++ alias.c (working copy)
@@ -156,7 +156,7 @@ static rtx find_base_value (rtx);
 static int mems_in_disjoint_alias_sets_p (const_rtx, const_rtx);
 static int insert_subset_children (splay_tree_node, void*);
 static alias_set_entry get_alias_set_entry (alias_set_type);
-static bool nonoverlapping_component_refs_p (const_tree, const_tree);
+static bool nonoverlapping_component_refs_p (const_rtx, const_rtx);
 static tree decl_for_component_ref (tree);
 static int write_dependence_p (const_rtx, const_rtx, int);
 
@@ -2184,11 +2184,15 @@ read_dependence (const_rtx mem, const_rt
overlap for any pair of objects.  */
 
 static bool
-nonoverlapping_component_refs_p (const_tree x, const_tree y)
+nonoverlapping_component_refs_p (const_rtx rtlx, const_rtx rtly)
 {
+  const_tree x = MEM_EXPR (rtlx), y = MEM_EXPR (rtly);
   const_tree fieldx, fieldy, typex, typey, orig_y;
 
-  if (!flag_strict_aliasing)
+  if (!flag_strict_aliasing
+  || !x || !y
+  || TREE_CODE (x) != COMPONENT_REF
+  || TREE_CODE (y) != COMPONENT_REF)
 return false;
 
   do
@@ -2307,13 +2311,6 @@ nonoverlapping_memrefs_p (const_rtx x, c
   && ! MEM_OFFSET_KNOWN_P (y)))
 return 0;
 
-  /* If both are field references, we may be able to determine something.  */
-  if (TREE_CODE (exprx) == COMPONENT_REF
-      && TREE_CODE (expry) == COMPONENT_REF
-      && nonoverlapping_component_refs_p (exprx, expry))
-    return 1;
-
-
   /* If the field reference test failed, look at the DECLs involved.  */
   moffsetx_known_p = MEM_OFFSET_KNOWN_P (x);
   if (moffsetx_known_p)
@@ -2519,6 +2516,9 @@ true_dependence_1 (const_rtx mem, enum m
   if (nonoverlapping_memrefs_p (mem, x, false))
 return 0;
 
+  if (nonoverlapping_component_refs_p (mem, x))
+return 0;
+
   return rtx_refs_may_alias_p (x, mem, true);
 }
 
Index: testsuite/gcc.dg/torture/alias-1.c
===
--- testsuite/gcc.dg/torture/alias-1.c  (revision 0)
+++ testsuite/gcc.dg/torture/alias-1.c  (revision 0)
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-options -fschedule-insns } */
+
+extern void abort (void) __attribute__((noreturn));
+
+struct B { int a; int b;};
+struct wrapper {
+union setconflict
+{
+  struct S { char one1; struct B b1; } s;
+  struct T { struct B b2; char two2; } t;
+} a;
+};
+
+int
+main ()
+{
+  int sum = 0;
+  int i;
+  struct wrapper w;
+  struct B *p;
+
+  p = &w.a.s.b1;
+  asm ("" : "=r" (p) : "0" (p));
+  p->a = 0;
+  asm ("" : "=r" (p) : "0" (p));
+  sum += p->a;
+
+  p = &w.a.t.b2;
+  asm ("" : "=r" (p) : "0" (p));
+  p->b = 1;
+  asm ("" : "=r" (p) : "0" (p));
+  sum += p->b;
+
+  if (sum != 1)
+abort();
+  return 0;
+}


Re: RFA: Fix path based disambiguation in RTL oracle

2012-06-11 Thread Richard Guenther
On Mon, Jun 11, 2012 at 4:19 PM, Michael Matz m...@suse.de wrote:
 Hi,

 I haven't yet checked in my patch from last week about not creating
 alias-set conflicts for stack slot sharing mostly due to the caveat I
 mentioned.  Namely that the RTL disambiguator still uses path-based means
 (which belong to type-based aliasing) even in the write_dependence tests.
 That's really wrong, but I couldn't (and I think can't) create a testcase
 using stack slot sharing that would fail with the current unchanged
 compiler.

 Luckily, we don't need stack slot sharing to demonstrate the bug.  The
 testcase below will fail on x86_64 with -O2 -fschedule-insns.  I've added
 it to the torture so it has a chance to fail on other targets too (e.g. on
 i686 it doesn't because it doesn't want to do the wrong swap of two
 insns).

 I've retained the path based disambiguation in the true_dependence
 variants, where it is okay according to our mem model.

 There are some cleanup opportunities in checking what the O(N^2) algorithm
 of nonoverlapping_component_refs_p catches that rtx_refs_may_alias_p
 doesn't.  We know already some cases where MEM_OFFSET_KNOWN_P is set too
 conservatively for the latter to work on some cases.  In any case that
 would be a follow-up.

 So, patch fixes testcase and is in regstrap on x86_64-linux.  Okay if that
 passes?

Ok.

Thanks,
Richard.


 Ciao,
 Michael.
 
        * alias.c (nonoverlapping_component_refs_p): Take two rtx arguments.
        (nonoverlapping_memrefs_p): Don't call it here ...
        (true_dependence_1): ... but here.

 testsuite/
        * gcc.dg/torture/alias-1.c: New test.

 Index: alias.c
 ===
 --- alias.c     (revision 188384)
 +++ alias.c     (working copy)
 @@ -156,7 +156,7 @@ static rtx find_base_value (rtx);
  static int mems_in_disjoint_alias_sets_p (const_rtx, const_rtx);
  static int insert_subset_children (splay_tree_node, void*);
  static alias_set_entry get_alias_set_entry (alias_set_type);
 -static bool nonoverlapping_component_refs_p (const_tree, const_tree);
 +static bool nonoverlapping_component_refs_p (const_rtx, const_rtx);
  static tree decl_for_component_ref (tree);
  static int write_dependence_p (const_rtx, const_rtx, int);

 @@ -2184,11 +2184,15 @@ read_dependence (const_rtx mem, const_rt
    overlap for any pair of objects.  */

  static bool
 -nonoverlapping_component_refs_p (const_tree x, const_tree y)
 +nonoverlapping_component_refs_p (const_rtx rtlx, const_rtx rtly)
  {
 +  const_tree x = MEM_EXPR (rtlx), y = MEM_EXPR (rtly);
   const_tree fieldx, fieldy, typex, typey, orig_y;

 -  if (!flag_strict_aliasing)
 +  if (!flag_strict_aliasing
 +      || !x || !y
 +      || TREE_CODE (x) != COMPONENT_REF
 +      || TREE_CODE (y) != COMPONENT_REF)
     return false;

   do
 @@ -2307,13 +2311,6 @@ nonoverlapping_memrefs_p (const_rtx x, c
           ! MEM_OFFSET_KNOWN_P (y)))
     return 0;

 -  /* If both are field references, we may be able to determine something.  */
 -  if (TREE_CODE (exprx) == COMPONENT_REF
 -      && TREE_CODE (expry) == COMPONENT_REF
 -      && nonoverlapping_component_refs_p (exprx, expry))
 -    return 1;
 -
 -
   /* If the field reference test failed, look at the DECLs involved.  */
   moffsetx_known_p = MEM_OFFSET_KNOWN_P (x);
   if (moffsetx_known_p)
 @@ -2519,6 +2516,9 @@ true_dependence_1 (const_rtx mem, enum m
   if (nonoverlapping_memrefs_p (mem, x, false))
     return 0;

 +  if (nonoverlapping_component_refs_p (mem, x))
 +    return 0;
 +
   return rtx_refs_may_alias_p (x, mem, true);
  }

 Index: testsuite/gcc.dg/torture/alias-1.c
 ===
 --- testsuite/gcc.dg/torture/alias-1.c  (revision 0)
 +++ testsuite/gcc.dg/torture/alias-1.c  (revision 0)
 @@ -0,0 +1,38 @@
 +/* { dg-do run } */
 +/* { dg-options -fschedule-insns } */
 +
 +extern void abort (void) __attribute__((noreturn));
 +
 +struct B { int a; int b;};
 +struct wrapper {
 +union setconflict
 +{
 +  struct S { char one1; struct B b1; } s;
 +  struct T { struct B b2; char two2; } t;
 +} a;
 +};
 +
 +int
 +main ()
 +{
 +  int sum = 0;
 +  int i;
 +  struct wrapper w;
 +  struct B *p;
 +
 +  p = &w.a.s.b1;
 +  asm ("" : "=r" (p) : "0" (p));
 +  p->a = 0;
 +  asm ("" : "=r" (p) : "0" (p));
 +  sum += p->a;
 +
 +  p = &w.a.t.b2;
 +  asm ("" : "=r" (p) : "0" (p));
 +  p->b = 1;
 +  asm ("" : "=r" (p) : "0" (p));
 +  sum += p->b;
 +
 +  if (sum != 1)
 +    abort();
 +  return 0;
 +}


[PATCH][2/n] alias.c TLC

2012-06-11 Thread Richard Guenther

This makes ao_ref_from_mem less conservative if either MEM_OFFSET or
MEM_SIZE is not set.  From other alias.c code and 
set_mem_attributes_minus_bitpos one has to conclude that MEM_EXPR
is always conservatively correct (it only can cover a larger memory
area) and the MEM_OFFSET / MEM_SIZE pair can refine it.  Thus,
we can make ao_ref_from_mem less conservative when, for example,
faced with variable array accesses which set_mem_attributes_minus_bitpos
ends up representing with an unknown MEM_OFFSET.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Richard.

2012-06-11  Richard Guenther  rguent...@suse.de

* emit-rtl.c (set_mem_attributes_minus_bitpos): Remove dead code.
* alias.c (ao_ref_from_mem): MEM_EXPR is conservative, MEM_OFFSET
and MEM_SIZE only refines it.  Reflect that and be less conservative
if either of the latter is not known.

Index: gcc/emit-rtl.c
===
*** gcc/emit-rtl.c  (revision 188384)
--- gcc/emit-rtl.c  (working copy)
*** set_mem_attributes_minus_bitpos (rtx ref
*** 1838,1852 
  /* ??? Any reason the field size would be different than
 the size we got from the type?  */
}
- 
- /* If this is an indirect reference, record it.  */
- else if (TREE_CODE (t) == MEM_REF)
-   {
- attrs.expr = t;
- attrs.offset_known_p = true;
- attrs.offset = 0;
- apply_bitpos = bitpos;
-   }
}
  
/* If this is an indirect reference, record it.  */
--- 1838,1843 
Index: gcc/alias.c
===
*** gcc/alias.c (revision 188384)
--- gcc/alias.c (working copy)
*** ao_ref_from_mem (ao_ref *ref, const_rtx
*** 326,342 
  
ref->ref_alias_set = MEM_ALIAS_SET (mem);
  
!   /* If MEM_OFFSET or MEM_SIZE are unknown we have to punt.
!  Keep points-to related information though.  */
if (!MEM_OFFSET_KNOWN_P (mem)
|| !MEM_SIZE_KNOWN_P (mem))
! {
!   ref->ref = NULL_TREE;
!   ref->offset = 0;
!   ref->size = -1;
!   ref->max_size = -1;
!   return true;
! }
  
/* If the base decl is a parameter we can have negative MEM_OFFSET in
   case of promoted subregs on bigendian targets.  Trust the MEM_EXPR
--- 326,336 
  
ref->ref_alias_set = MEM_ALIAS_SET (mem);
  
!   /* If MEM_OFFSET or MEM_SIZE are unknown what we got from MEM_EXPR
!  is conservative, so trust it.  */
if (!MEM_OFFSET_KNOWN_P (mem)
|| !MEM_SIZE_KNOWN_P (mem))
! return true;
  
/* If the base decl is a parameter we can have negative MEM_OFFSET in
   case of promoted subregs on bigendian targets.  Trust the MEM_EXPR
*** ao_ref_from_mem (ao_ref *ref, const_rtx
*** 345,350 
--- 339,347 
       && (MEM_SIZE (mem) + MEM_OFFSET (mem)) * BITS_PER_UNIT == ref->size)
  return true;
  
+   /* Otherwise continue and refine size and offset we got from analyzing
+  MEM_EXPR by using MEM_SIZE and MEM_OFFSET.  */
+ 
ref->offset += MEM_OFFSET (mem) * BITS_PER_UNIT;
ref->size = MEM_SIZE (mem) * BITS_PER_UNIT;
  


Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread William J. Schmidt
On Mon, 2012-06-11 at 13:40 +0200, Richard Guenther wrote:
 On Fri, 8 Jun 2012, William J. Schmidt wrote:
 
  This patch adds a heuristic to the vectorizer when estimating the
  minimum profitable number of iterations.  The heuristic is
  target-dependent, and is currently disabled for all targets except
  PowerPC.  However, the intent is to make it general enough to be useful
  for other targets that want to opt in.
  
  A previous patch addressed some PowerPC SPEC degradations by modifying
  the vector cost model values for vec_perm and vec_promote_demote.  The
  values were set a little higher than their natural values because the
  natural values were not sufficient to prevent a poor vectorization
  choice.  However, this is not the right long-term solution, since it can
  unnecessarily constrain other vectorization choices involving permute
  instructions.
  
  Analysis of the badly vectorized loop (in sphinx3) showed that the
  problem was overcommitment of vector resources -- too many vector
  instructions issued without enough non-vector instructions available to
  cover the delays.  The vector cost model assumes that instructions
  always have a constant cost, and doesn't have a way of judging this kind
  of density of vector instructions.
  
  The present patch adds a heuristic to recognize when a loop is likely to
  overcommit resources, and adds a small penalty to the inside-loop cost
  to account for the expected stalls.  The heuristic is parameterized with
  three target-specific values:
  
   * Density threshold: The heuristic will apply only when the
 percentage of inside-loop cost attributable to vectorized
 instructions exceeds this value.
  
   * Size threshold: The heuristic will apply only when the
 inside-loop cost exceeds this value.
  
   * Penalty: The inside-loop cost will be increased by this
 percentage value when the heuristic applies.
  
  Thus only reasonably large loop bodies that are mostly vectorized
  instructions will be affected.
  
  By applying only a small percentage bump to the inside-loop cost, the
  heuristic will only turn off vectorization for loops that were
  considered barely profitable to begin with (such as the sphinx3 loop).
  So the heuristic is quite conservative and should not affect the vast
  majority of vectorization decisions.
  
  Together with the new heuristic, this patch reduces the vec_perm and
  vec_promote_demote costs for PowerPC to their natural values.
  
  I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu
  and verified that no performance regressions occur on SPEC cpu2006.  Is
  this ok for trunk?
 
 Hmm.  I don't like this patch or its general idea too much.  Instead
 I'd like us to move more of the cost model detail to the target, giving
 it a chance to look at the whole loop before deciding on a cost.  ISTR
 posting the overall idea at some point, but let me repeat it here instead
 of trying to find that e-mail.
 
 The basic interface of the cost model should be, in targetm.vectorize
 
   /* Tell the target to start cost analysis of a loop or a basic-block
  (if the loop argument is NULL).  Returns an opaque pointer to
  target-private data.  */
   void *init_cost (struct loop *loop);
 
   /* Add cost for N vectorized-stmt-kind statements in vector_mode.  */
   void add_stmt_cost (void *data, unsigned n,
 vectorized-stmt-kind,
   enum machine_mode vector_mode);
 
   /* Tell the target to compute and return the cost of the accumulated
  statements and free any target-private data.  */
   unsigned finish_cost (void *data);
 
 with eventually slightly different signatures for add_stmt_cost
 (like pass in the original scalar stmt?).
 
 It allows the target, at finish_cost time, to evaluate things like
 register pressure and resource utilization.

OK, I'm trying to understand how you would want this built into the
present structure.  Taking just the loop case for now:

Judging by your suggested API, we would have to call add_stmt_cost ()
everywhere that we now call stmt_vinfo_set_inside_of_loop_cost ().  For
now this would be an additional call, not a replacement, though maybe
the other goes away eventually.  This allows the target to save more
data about the vectorized instructions than just an accumulated cost
number (order and quantity of various kinds of instructions can be
maintained for better modeling).  Presumably the call to finish_cost
would be done within vect_estimate_min_profitable_iters () to produce
the final value of inside_cost for the loop.

The default target hook for add_stmt_cost would duplicate what we
currently do for calculating the inside_cost of a statement, and the
default target hook for finish_cost would just return the sum.

I'll have to go hunting where the similar code would fit for SLP in a
basic block.

If I read you correctly, you don't object to a density heuristic such as
the one I implemented here, but you want 

Re: [RFC] Target-specific limits on vector alignment

2012-06-11 Thread Richard Earnshaw
On 11/06/12 15:17, Richard Guenther wrote:
 On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote:
 The ARM ABI states that vectors larger than 64 bits in size still have
 64-bit alignment; never-the-less, the HW supports alignment hints of up
 to 128-bits in some cases and will trap if a vector has an alignment
 that is less than the hint.  GCC currently hard-codes larger vectors to be
 aligned by the size of the vector, which means that 128-bit vectors are
 marked as being 128-bit aligned.

 The ARM ABI unfortunately does not support generating such alignment for
 parameters passed by value and this can lead to traps at run time.  It
 seems that the best way to solve this problem is to allow the back-end
 to set an upper limit on the alignment permitted for a vector.

 I've implemented this as a separate hook, rather than using the existing
 hooks because there's a strong likelihood of breaking some existing ABIs
 if I did it another way.

 There are a couple of tests that will need some re-working before this
 can be committed to deal with the fall-out of making this change; I'll
 prepare those changes if this patch is deemed generally acceptable.
 
 Hm.  Where would you use that target hook?
 

Doh!

It was supposed to be in the patch set... :-(

in layout_type(), where the alignment of a vector is forced to the size
of the vector.

R.--- stor-layout.c   (revision 188348)
+++ stor-layout.c   (local)
@@ -2131,9 +2131,11 @@ layout_type (tree type)
TYPE_SIZE (type) = int_const_binop (MULT_EXPR, TYPE_SIZE (innertype),
bitsize_int (nunits));
 
-   /* Always naturally align vectors.  This prevents ABI changes
-  depending on whether or not native vector modes are supported.  */
-   TYPE_ALIGN (type) = tree_low_cst (TYPE_SIZE (type), 0);
+   /* Naturally align vectors, but let the target set an upper
+  limit.  This prevents ABI changes depending on whether or
+  not native vector modes are supported.  */
+   TYPE_ALIGN (type)
+ = targetm.vector_alignment (type, tree_low_cst (TYPE_SIZE (type), 0));
 break;
   }
 

Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread Richard Guenther
On Mon, 11 Jun 2012, William J. Schmidt wrote:

 On Mon, 2012-06-11 at 13:40 +0200, Richard Guenther wrote:
  On Fri, 8 Jun 2012, William J. Schmidt wrote:
  
   This patch adds a heuristic to the vectorizer when estimating the
   minimum profitable number of iterations.  The heuristic is
   target-dependent, and is currently disabled for all targets except
   PowerPC.  However, the intent is to make it general enough to be useful
   for other targets that want to opt in.
   
   A previous patch addressed some PowerPC SPEC degradations by modifying
   the vector cost model values for vec_perm and vec_promote_demote.  The
   values were set a little higher than their natural values because the
   natural values were not sufficient to prevent a poor vectorization
   choice.  However, this is not the right long-term solution, since it can
   unnecessarily constrain other vectorization choices involving permute
   instructions.
   
   Analysis of the badly vectorized loop (in sphinx3) showed that the
   problem was overcommitment of vector resources -- too many vector
   instructions issued without enough non-vector instructions available to
   cover the delays.  The vector cost model assumes that instructions
   always have a constant cost, and doesn't have a way of judging this kind
   of density of vector instructions.
   
   The present patch adds a heuristic to recognize when a loop is likely to
   overcommit resources, and adds a small penalty to the inside-loop cost
   to account for the expected stalls.  The heuristic is parameterized with
   three target-specific values:
   
* Density threshold: The heuristic will apply only when the
  percentage of inside-loop cost attributable to vectorized
  instructions exceeds this value.
   
* Size threshold: The heuristic will apply only when the
  inside-loop cost exceeds this value.
   
* Penalty: The inside-loop cost will be increased by this
  percentage value when the heuristic applies.
   
   Thus only reasonably large loop bodies that are mostly vectorized
   instructions will be affected.
   
   By applying only a small percentage bump to the inside-loop cost, the
   heuristic will only turn off vectorization for loops that were
   considered barely profitable to begin with (such as the sphinx3 loop).
   So the heuristic is quite conservative and should not affect the vast
   majority of vectorization decisions.
   
   Together with the new heuristic, this patch reduces the vec_perm and
   vec_promote_demote costs for PowerPC to their natural values.
   
   I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu
   and verified that no performance regressions occur on SPEC cpu2006.  Is
   this ok for trunk?
  
  Hmm.  I don't like this patch or its general idea too much.  Instead
  I'd like us to move more of the cost model detail to the target, giving
  it a chance to look at the whole loop before deciding on a cost.  ISTR
  posting the overall idea at some point, but let me repeat it here instead
  of trying to find that e-mail.
  
  The basic interface of the cost model should be, in targetm.vectorize
  
/* Tell the target to start cost analysis of a loop or a basic-block
   (if the loop argument is NULL).  Returns an opaque pointer to
   target-private data.  */
void *init_cost (struct loop *loop);
  
/* Add cost for N vectorized-stmt-kind statements in vector_mode.  */
void add_stmt_cost (void *data, unsigned n,
vectorized-stmt-kind,
enum machine_mode vector_mode);
  
/* Tell the target to compute and return the cost of the accumulated
   statements and free any target-private data.  */
unsigned finish_cost (void *data);
  
  with eventually slightly different signatures for add_stmt_cost
  (like pass in the original scalar stmt?).
  
  It allows the target, at finish_cost time, to evaluate things like
  register pressure and resource utilization.
 
 OK, I'm trying to understand how you would want this built into the
 present structure.  Taking just the loop case for now:
 
 Judging by your suggested API, we would have to call add_stmt_cost ()
 everywhere that we now call stmt_vinfo_set_inside_of_loop_cost ().  For
 now this would be an additional call, not a replacement, though maybe
 the other goes away eventually.  This allows the target to save more
 data about the vectorized instructions than just an accumulated cost
 number (order and quantity of various kinds of instructions can be
 maintained for better modeling).  Presumably the call to finish_cost
 would be done within vect_estimate_min_profitable_iters () to produce
 the final value of inside_cost for the loop.

Yes.  I didn't look in detail at the difference between inner/outer
costs though.  Maybe that complicates things, maybe not.

 The default target hook for add_stmt_cost would duplicate what we
 currently do for calculating the inside_cost of a statement, 

Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread Richard Guenther
On Mon, 11 Jun 2012, David Edelsohn wrote:

 On Mon, Jun 11, 2012 at 7:40 AM, Richard Guenther rguent...@suse.de wrote:
 
  Hmm.  I don't like this patch or its general idea too much.  Instead
  I'd like us to move more of the cost model detail to the target, giving
  it a chance to look at the whole loop before deciding on a cost.  ISTR
  posting the overall idea at some point, but let me repeat it here instead
  of trying to find that e-mail.
 
 Richard,
 
 Allowing the target to see the entire loop and have more control over
 the cost model is a fine goal, but the current code is using incorrect
 heuristics. Requiring a complete re-write of the auto-vectorizer cost
 model should not be a requirement for fixing the current code.

If fixing the current code adds more magic hooks at the wrong spot
then yes, in stage1 I can make you think about a more proper solution
to the issue.

Richard.

[AARCH64][PATCH] Fix legitimate address checking for TImode and TFmode

2012-06-11 Thread Marcus Shawcroft
This patch fixes the behaviour of legitimate address checking for TImode 
and TFmode values.


The current implementation does not correctly distinguish between the 
address modes available for ldp/stp X,X versus ldr/str Q.


/Marcus

ChangeLog:

* config/aarch64/aarch64.c
(aarch64_regno_ok_for_index_p): Handle NULL reg_renumber.
(aarch64_regno_ok_for_base_p): Likewise.
(offset_7bit_signed_scaled_p): New.
(offset_9bit_signed_unscaled_p): New.
(offset_12bit_unsigned_scaled_p): New.
(aarch64_classify_address): Replace pair_p with allow_reg_index_p.
Conservative test for valid TImode and TFmode addresses.  Use
offset_7bit_signed_scaled_p offset_9bit_signed_unscaled_p and
offset_12bit_unsigned_scaled_p.  Remove explicit TImode and TFmode
tests.
* config/aarch64/aarch64.md (movti_aarch64): Replace 'm' with 'Ump'.
(movtf_aarch64): Replace 'm' with 'Ump', replace 'Utf' with 'm'.
* config/aarch64/constraints.md (Utf): Remove.
(Ump) Pass strict.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 34d04755900a75376c0afe5b9b4b8b3cd1ac84e2..b985d63c4fd591c7ae17d519a7b858c0264edec7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2441,6 +2441,10 @@ aarch64_regno_ok_for_index_p (int regno,
 {
   if (!strict_p)
 	return true;
+
+  if (!reg_renumber)
+	return false;
+
   regno = reg_renumber[regno];
 }
   return GP_REGNUM_P (regno);
@@ -2456,6 +2460,10 @@ aarch64_regno_ok_for_base_p (int regno,
 {
   if (!strict_p)
 	return true;
+
+  if (!reg_renumber)
+	return false;
+
   regno = reg_renumber[regno];
 }
 
@@ -2639,6 +2647,29 @@ aarch64_classify_index (struct aarch64_a
   return false;
 }
 
+static inline bool
+offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
+{
+  return (offset >= -64 * GET_MODE_SIZE (mode)
+	  && offset < 64 * GET_MODE_SIZE (mode)
+	  && offset % GET_MODE_SIZE (mode) == 0);
+}
+
+static inline bool
+offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
+			   HOST_WIDE_INT offset)
+{
+  return offset >= -256 && offset < 256;
+}
+
+static inline bool
+offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
+{
+  return (offset >= 0
+	  && offset < 4096 * GET_MODE_SIZE (mode)
+	  && offset % GET_MODE_SIZE (mode) == 0);
+}
+
 /* Return true if X is a valid address for machine mode MODE.  If it is,
fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
@@ -2650,7 +2681,8 @@ aarch64_classify_address (struct aarch64
 {
   enum rtx_code code = GET_CODE (x);
   rtx op0, op1;
-  bool pair_p = (outer_code == PARALLEL || mode == TImode);
+  bool allow_reg_index_p =
+outer_code != PARALLEL && GET_MODE_SIZE (mode) != 16;
 
   /* Don't support anything other than POST_INC or REG addressing for
  AdvSIMD.  */
@@ -2679,25 +2711,27 @@ aarch64_classify_address (struct aarch64
 	  info->type = ADDRESS_REG_IMM;
 	  info->base = op0;
 	  info->offset = op1;
+
+	  /* TImode and TFmode values are allowed in both pairs of X
+	 registers and individual Q registers.  The available
+	 address modes are:
+	 X,X: 7-bit signed scaled offset
+	 Q:   9-bit signed offset
+	 We conservatively require an offset representable in either mode.
+	   */
+	  if (mode == TImode || mode == TFmode)
+	return (offset_7bit_signed_scaled_p (mode, offset)
+		&& offset_9bit_signed_unscaled_p (mode, offset));
+
 	  if (outer_code == PARALLEL)
-	/* load/store pair: 7-bit signed, scaled offset.  */
 	return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
-		 offset = -64 * GET_MODE_SIZE (mode)
-		 offset  64 * GET_MODE_SIZE (mode)
-		 offset % GET_MODE_SIZE (mode) == 0);
-	  else if (mode == TImode)
-	/* load/store pair of dwords: 7-bit signed, scaled offset.  */
-	return offset = -512  offset  512  offset % 8 == 0;
+		&& offset_7bit_signed_scaled_p (mode, offset));
 	  else
-	/* load/store single: 9-bit signed, unscaled offset.  */
-	/* load/store single: 12-bit unsigned, scaled offset.  */
-	return ((offset = -256  offset  256)
-		|| (offset = 0
-			 offset  4096 * GET_MODE_SIZE (mode)
-			 offset % GET_MODE_SIZE (mode) == 0));
+	return (offset_9bit_signed_unscaled_p (mode, offset)
+		|| offset_12bit_unsigned_scaled_p (mode, offset));
 	}
 
-  if (!pair_p)
+  if (allow_reg_index_p)
 	{
 	  /* Look for base + (scaled/extended) index register.  */
 	  if (aarch64_base_register_rtx_p (op0, strict_p)
@@ -2737,18 +2771,23 @@ aarch64_classify_address (struct aarch64
 	  HOST_WIDE_INT offset;
 	  info->offset = XEXP (XEXP (x, 1), 1);
 	  offset = INTVAL (info->offset);
+
+	  /* TImode and TFmode values are allowed in both pairs of X
+	 registers and individual Q registers.  The available
+	 address modes 

[Ada] fix use of PICFLAG for Ada library variants

2012-06-11 Thread Olivier Hainque
Hello,

We used to have in gcc-interface/Makefile.in:

 GNATLIBCFLAGS_FOR_C = -W -Wall $(GNATLIBCFLAGS) $(TARGET_LIBGCC2_CFLAGS) ...
 
As part of the libgcc move to toplevel,

 http://gcc.gnu.org/ml/gcc-patches/2011-08/msg00917.html

introduced PIC_FLAG_FOR_TARGET and replaced all the uses of
TARGET_LIBGCC2_CFLAGS by that, on the assumption that it was
essentially used for gnatlib-shared related targets.

That was true for most of the uses indeed, in particular those
directly part of the gnatlib-shared targets which now have:

  gnatlib-shared-default:
 $(MAKE) $(FLAGS_TO_PASS) \
 GNATLIBFLAGS=$(GNATLIBFLAGS) \
  ==GNATLIBCFLAGS=$(GNATLIBCFLAGS) $(PICFLAG_FOR_TARGET) \
 GNATLIBCFLAGS_FOR_C=$(GNATLIBCFLAGS_FOR_C) \
 gnatlib

However, we now also have:

 GNATLIBCFLAGS_FOR_C = -W -Wall $(GNATLIBCFLAGS) $(PICFLAG_FOR_TARGET)
 ^

which is used for all the gnatlib related targets in this Makefile, so we
end up passing -fpic or alike for the compilation of C files aimed at non
shared libs as well.

This causes kernel-mode library build failures with --disable-libada on VxWorks
because the compiler stops in presence of -fPIC without -mrtp. The extraneous
PICFLAG is visible in libada/Makefile.in as well, with similar consequences
in the libada enabled case.

The attached patch addresses this with two simple steps: remove
PICFLAG_FOR_TARGET from the top definition of GNATLIBCFLAGS_FOR_C,
then add it explicitly to the options passed by the gnatlib-shared
targets only. Like

 GNATLIBCFLAGS_FOR_C = -W -Wall $(GNATLIBCFLAGS)

 gnatlib-shared-default:
$(MAKE) $(FLAGS_TO_PASS) \
GNATLIBFLAGS=$(GNATLIBFLAGS) \
 ==GNATLIBCFLAGS=$(GNATLIBCFLAGS) $(PICFLAG_FOR_TARGET) \
 ==GNATLIBCFLAGS_FOR_C=$(GNATLIBCFLAGS_FOR_C) $(PICFLAG_FOR_TARGET) \
gnatlib

This lets the simple VxWorks builds proceed to completion, and
bootstrap+regtest fine x86_64-linux.

Committing to mainline.

Olivier

2012-06-10  Olivier Hainque  hain...@adacore.com

* Makefile.in (GNATLIBCFLAGS_FOR_C): Remove $(PIC_FLAG_FOR_TARGET).
(gnatlib-shared-default): Add $(PIC_FLAG_FOR_TARGET) to
GNATLIBCFLAGS_FOR_C passed to gnatlib.
(gnatlib-shared-win32): Likewise.
(gnatlib-shared-darwin): Likewise.
(gnatlib-shared-dual-win32): Pass PIC_FLAG_FOR_TARGET to 
gnatlib-shared-win32.

libada/
* Makefile.in (GNATLIBCFLAGS_FOR_C): Remove $(PICFLAG).


ada-picflag.diff
Description: Binary data


Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread David Edelsohn
On Mon, Jun 11, 2012 at 10:48 AM, Richard Guenther rguent...@suse.de wrote:
 On Mon, 11 Jun 2012, David Edelsohn wrote:

 On Mon, Jun 11, 2012 at 7:40 AM, Richard Guenther rguent...@suse.de wrote:

  Hmm.  I don't like this patch or its general idea too much.  Instead
  I'd like us to move more of the cost model detail to the target, giving
  it a chance to look at the whole loop before deciding on a cost.  ISTR
  posting the overall idea at some point, but let me repeat it here instead
  of trying to find that e-mail.

 Richard,

 Allowing the target to see the entire loop and have more control over
 the cost model is a fine goal, but the current code is using incorrect
 heuristics. Requiring a complete re-write of the auto-vectorizer cost
 model should not be a requirement for fixing the current code.

 If fixing the current code adds more magic hooks at the wrong spot
 then yes, in stage1 I can make you think about a more proper solution
 to the issue.

First, these are not magic hooks.

Second, I suggest that you need to rephrase I can make you and
re-send your reply.

Thanks, David


Re: [RFC] Target-specific limits on vector alignment

2012-06-11 Thread Richard Guenther
On Mon, Jun 11, 2012 at 4:38 PM, Richard Earnshaw rearn...@arm.com wrote:
 On 11/06/12 15:17, Richard Guenther wrote:
 On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote:
 The ARM ABI states that vectors larger than 64 bits in size still have
 64-bit alignment; never-the-less, the HW supports alignment hints of up
 to 128-bits in some cases and will trap if a vector has an alignment
 that is less than the hint.  GCC currently hard-codes larger vectors to be
 aligned by the size of the vector, which means that 128-bit vectors are
 marked as being 128-bit aligned.

 The ARM ABI unfortunately does not support generating such alignment for
 parameters passed by value and this can lead to traps at run time.  It
 seems that the best way to solve this problem is to allow the back-end
 to set an upper limit on the alignment permitted for a vector.

 I've implemented this as a separate hook, rather than using the existing
 hooks because there's a strong likelihood of breaking some existing ABIs
 if I did it another way.

 There are a couple of tests that will need some re-working before this
 can be committed to deal with the fall-out of making this change; I'll
 prepare those changes if this patch is deemed generally acceptable.

 Hm.  Where would you use that target hook?


 Doh!

 It was supposed to be in the patch set... :-(

 in layout_type(), where the alignment of a vector is forced to the size
 of the vector.

Ok.

+   /* Naturally align vectors, but let the target set an upper
+  limit.  This prevents ABI changes depending on whether or
+  not native vector modes are supported.  */
+   TYPE_ALIGN (type)
+ = targetm.vector_alignment (type, tree_low_cst (TYPE_SIZE (type), 0));

The type argument or the size argument looks redundant.  Note that we still
can have such vector properly aligned, thus the vectorizer would need to
use build_aligned_type also if it knows the type is aligned, not only when
it thinks it is misaligned.  You basically change the alignment of the default
vector type.

Richard.

 R.


[PATCH, GCC][AArch64] Remove support for __float128

2012-06-11 Thread Sofiane Naci
Hi,

This patch removes __float128 support in the AArch64 port.

Thanks
Sofiane

-

gcc/ChangeLog:

2012-06-11  Sofiane Naci sofiane.n...@arm.com

[AArch64] Remove __float128 support.

* config/aarch64/aarch64.c
(aarch64_mangle_type): Remove function.
(aarch64_init_builtins): Remove __float128 support.
(aarch64_expand_builtin): Likewise.
(TARGET_MANGLE_TYPE): Remove definition.
* config/aarch64/aarch64.h (enum aarch64_builtins): Update.
* doc/extend.texi: Remove __float128 documentation.


gcc/testsuite/ChangeLog:

2012-06-11  Sofiane Naci sofiane.n...@arm.com

[AArch64] Remove __float128 support.

* gcc.dg/const-float128-ped.c: Remove AArch64 from list of
targets.
* gcc.dg/const-float128.c: Likewise.
* gcc.dg/torture/fp-int-convert-float128-timode.c: Likewise.
* gcc.dg/torture/fp-int-convert-float128.c: Likewise.


aarch64-remove-float128.patch
Description: Binary data


Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread Richard Guenther
On Mon, 11 Jun 2012, David Edelsohn wrote:

 On Mon, Jun 11, 2012 at 10:48 AM, Richard Guenther rguent...@suse.de wrote:
  On Mon, 11 Jun 2012, David Edelsohn wrote:
 
  On Mon, Jun 11, 2012 at 7:40 AM, Richard Guenther rguent...@suse.de 
  wrote:
 
   Hmm.  I don't like this patch or its general idea too much.  Instead
   I'd like us to move more of the cost model detail to the target, giving
   it a chance to look at the whole loop before deciding on a cost.  ISTR
   posting the overall idea at some point, but let me repeat it here instead
   of trying to find that e-mail.
 
  Richard,
 
  Allowing the target to see the entire loop and have more control over
  the cost model is a fine goal, but the current code is using incorrect
  heuristics. Requiring a complete re-write of the auto-vectorizer cost
  model should not be a requirement for fixing the current code.
 
  If fixing the current code adds more magic hooks at the wrong spot
  then yes, in stage1 I can make you think about a more proper solution
  to the issue.
 
 First, these are not magic hooks.

Well, they are at least magic numbers and heuristics that apply
generally and not only to the single issue in sphinx.  And in
fact how it works for sphinx _is_ magic.

 Second, I suggest that you need to rephrase I can make you and
 re-send your reply.

Sorry for my bad english.  Consider it meaning that I'd rather have
you think about a more proper solution.  That's what patch review
is about after all, no?  Sometimes a complete re-write (which gets
more difficult which each of the patches enhancing the not ideal
current state) is the best thing to do.

Richard.

Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread Richard Guenther
On Mon, 11 Jun 2012, Richard Guenther wrote:

 On Mon, 11 Jun 2012, William J. Schmidt wrote:
 
  On Mon, 2012-06-11 at 13:40 +0200, Richard Guenther wrote:
   On Fri, 8 Jun 2012, William J. Schmidt wrote:
   
This patch adds a heuristic to the vectorizer when estimating the
minimum profitable number of iterations.  The heuristic is
target-dependent, and is currently disabled for all targets except
PowerPC.  However, the intent is to make it general enough to be useful
for other targets that want to opt in.

A previous patch addressed some PowerPC SPEC degradations by modifying
the vector cost model values for vec_perm and vec_promote_demote.  The
values were set a little higher than their natural values because the
natural values were not sufficient to prevent a poor vectorization
choice.  However, this is not the right long-term solution, since it can
unnecessarily constrain other vectorization choices involving permute
instructions.

Analysis of the badly vectorized loop (in sphinx3) showed that the
problem was overcommitment of vector resources -- too many vector
instructions issued without enough non-vector instructions available to
cover the delays.  The vector cost model assumes that instructions
always have a constant cost, and doesn't have a way of judging this kind
of density of vector instructions.

The present patch adds a heuristic to recognize when a loop is likely to
overcommit resources, and adds a small penalty to the inside-loop cost
to account for the expected stalls.  The heuristic is parameterized with
three target-specific values:

 * Density threshold: The heuristic will apply only when the
   percentage of inside-loop cost attributable to vectorized
   instructions exceeds this value.

 * Size threshold: The heuristic will apply only when the
   inside-loop cost exceeds this value.

 * Penalty: The inside-loop cost will be increased by this
   percentage value when the heuristic applies.

Thus only reasonably large loop bodies that are mostly vectorized
instructions will be affected.

By applying only a small percentage bump to the inside-loop cost, the
heuristic will only turn off vectorization for loops that were
considered barely profitable to begin with (such as the sphinx3 loop).
So the heuristic is quite conservative and should not affect the vast
majority of vectorization decisions.

Together with the new heuristic, this patch reduces the vec_perm and
vec_promote_demote costs for PowerPC to their natural values.

I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu
and verified that no performance regressions occur on SPEC cpu2006.  Is
this ok for trunk?
   
   Hmm.  I don't like this patch or its general idea too much.  Instead
   I'd like us to move more of the cost model detail to the target, giving
   it a chance to look at the whole loop before deciding on a cost.  ISTR
   posting the overall idea at some point, but let me repeat it here instead
   of trying to find that e-mail.
   
   The basic interface of the cost model should be, in targetm.vectorize
   
 /* Tell the target to start cost analysis of a loop or a basic-block
(if the loop argument is NULL).  Returns an opaque pointer to
target-private data.  */
 void *init_cost (struct loop *loop);
   
 /* Add cost for N vectorized-stmt-kind statements in vector_mode.  */
 void add_stmt_cost (void *data, unsigned n,
   vectorized-stmt-kind,
 enum machine_mode vector_mode);
   
 /* Tell the target to compute and return the cost of the accumulated
statements and free any target-private data.  */
 unsigned finish_cost (void *data);
   
   with eventually slightly different signatures for add_stmt_cost
   (like pass in the original scalar stmt?).
   
   It allows the target, at finish_cost time, to evaluate things like
   register pressure and resource utilization.
  
  OK, I'm trying to understand how you would want this built into the
  present structure.  Taking just the loop case for now:
  
  Judging by your suggested API, we would have to call add_stmt_cost ()
  everywhere that we now call stmt_vinfo_set_inside_of_loop_cost ().  For
  now this would be an additional call, not a replacement, though maybe
  the other goes away eventually.  This allows the target to save more
  data about the vectorized instructions than just an accumulated cost
  number (order and quantity of various kinds of instructions can be
  maintained for better modeling).  Presumably the call to finish_cost
  would be done within vect_estimate_min_profitable_iters () to produce
  the final value of inside_cost for the loop.
 
 Yes.  I didn't look in detail at the difference between inner/outer
 costs though.  Maybe that complicates 

Re: [PATCH] Correct cost model for strided loads

2012-06-11 Thread William J. Schmidt


On Mon, 2012-06-11 at 16:10 +0200, Richard Guenther wrote:
 On Mon, 11 Jun 2012, William J. Schmidt wrote:
 
  
  
  On Mon, 2012-06-11 at 11:15 +0200, Richard Guenther wrote:
   On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt
   wschm...@linux.vnet.ibm.com wrote:
The fix for PR53331 caused a degradation to 187.facerec on
powerpc64-unknown-linux-gnu.  The following simple patch reverses the
degradation without otherwise affecting SPEC cpu2000 or cpu2006.
Bootstrapped and regtested on that platform with no new regressions.  Ok
for trunk?
   
   Well, would the real cost not be subparts * scalar_to_vec plus
   subparts * vec_perm?
   At least vec_perm isn't the cost for building up a vector from N scalar 
   elements
   either (it might be close enough if subparts == 2).  What's the case
   with facerec
   here?  Does it have subparts == 2?  
  
  In this case, subparts == 4 (32-bit floats, 128-bit vec reg).  On
  PowerPC, this requires two merge instructions and a permute instruction
  to get the four 32-bit quantities into the right place in a 128-bit
  register.  Currently this is modeled as a vec_perm in other parts of the
  vectorizer per Ira's earlier patches, so I naively changed this to do
  the same thing.
 
 I see.
 
  The types of vectorizer instructions aren't documented, and I can't
  infer much from the i386.c cost model, so I need a little education.
  What semantics are represented by scalar_to_vec?
 
 It's a vector splat, thus x -> { x, x, x, ... }.  You can create
 { x, y, z, ... } by N such splats plus N - 1 permutes (if a permute,
 as VEC_PERM_EXPR, takes two input vectors).  That's by far not
 the most efficient way to build up such a vector of course (with AVX
 you could do one splat plus N - 1 inserts for example).  The cost
 is of course dependent on the number of vector elements, so a
 simple new enum vect_cost_for_stmt kind does not cover it but
 the target would have to look at the vector type passed and do
 some reasonable guess.

Ah, splat!  Yes, that's lingo I understand.  I see the intent now.

 
  On PowerPC, we have this mapping of the floating-point registers and
  vector float registers where they overlap (the low-order half of each of
  the first 32 vector float regs corresponds to a scalar float reg).  So
  in this case we have four scalar loads that place things in the bottom
  half of four vector registers, two vector merge instructions that
  collapse the four registers into two vector registers, and a vector
  permute that gets things in the right order.(*)  I wonder if what we
  refer to as a merge instruction is similar to scalar_to_vec.
 
 Looks similar to x86 SSE then.
 
  If so, then maybe we need something like
  
   subparts = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
   inside_cost += vect_get_stmt_cost (scalar_load) * ncopies * subparts;
   inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec) * subparts 
  / 2;
   inside_cost += ncopies * vect_get_stmt_cost (vec_perm);
  
  But then we'd have to change how vec_perm is modeled elsewhere for
  PowerPC based on Ira's earlier patches.  As I said, it's difficult for
  me to figure out all the intent of cost model decisions that have been
  made in the past, using current documentation.
 
 Heh, usually the intent was to make the changes simple, not to compute
 a proper cost.
 
 I think we simply need a new scalars_to_vec cost kind.

That works.  Maybe vec_construct gets the point across a little better?
I think we need to use the full builtin_vectorization_cost interface
instead of vect_get_stmt_cost here, so the targets can parameterize on
type.  Then we can just do one cost calculation for vec_construct that
covers the full costs of getting the vector in order after the loads.

 
   I really wanted to pessimize this case
   for say AVX and char elements, thus building up a vector from 32 scalars 
   which
   certainly does not cost a mere vec_perm.  So, maybe special-case the
   subparts == 2 case and assume vec_perm would match the cost only in that
   case.
  
  (I'm a little confused by this as what you have at the moment is a
  single scalar_to_vec per copy, which has a cost of 1 on most i386
  targets (occasionally 2).  The subparts multiplier is only applied to
  the loads.  So changing this to vec_perm seemed to be a no-op for i386.)
 
 Oh, I somehow read the patch as you were removing the multiplication
 by TYPE_VECTOR_SUBPARTS.  But yes, the cost is way off and I'd wanted
 to reflect it with N scalar loads plus N splats plus N - 1 permutes
 originally.  You could also model it with N scalar loads plus N
 inserts (but we don't have a vec_insert cost either).  I think adding
 a scalars_to_vec or vec_init or however we want to call it - basically
 what the cost of a vector CONSTRUCTOR would be - is best.
 
  (*) There are actually a couple more instructions here to convert 64-bit
  values to 32-bit values, since on PowerPC 32-bit loads are 

Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread David Edelsohn
On Mon, Jun 11, 2012 at 10:55 AM, Richard Guenther rguent...@suse.de wrote:

 Well, they are at least magic numbers and heuristics that apply
 generally and not only to the single issue in sphinx.  And in
 fact how it works for sphinx _is_ magic.

 Second, I suggest that you need to rephrase I can make you and
 re-send your reply.

 Sorry for my bad english.  Consider it meaning that I'd rather have
 you think about a more proper solution.  That's what patch review
 is about after all, no?  Sometimes a complete re-write (which gets
 more difficult which each of the patches enhancing the not ideal
 current state) is the best thing to do.

Richard,

The values of the heuristics may be magic, but Bill believes the
heuristics are testing the important characteristics.  The heuristics
themselves are controlled by hooks, so the target can set the correct
values for their own requirements.

The concern is that a general cost infrastructure is too general.
And, based on history, all ports simply will copy the boilerplate from
the first implementation. It also may cause more problems because the
target has relatively little information to be able to judge
heuristics at that point in the middle-end. If the targets start to
get too cute or too complicated, it may cause more problems or more
confusion about why more complicated heuristics are not effective and
not producing the expected results.

I worry about creating another machine dependent reorg catch-all pass.

Maybe an incremental pre- and/or post- cost hook would be more
effective. I will let Bill comment.

Thanks, David


Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread William J. Schmidt
On Mon, 2012-06-11 at 16:58 +0200, Richard Guenther wrote:
 On Mon, 11 Jun 2012, Richard Guenther wrote:
 
  On Mon, 11 Jun 2012, William J. Schmidt wrote:
  
   On Mon, 2012-06-11 at 13:40 +0200, Richard Guenther wrote:
On Fri, 8 Jun 2012, William J. Schmidt wrote:

 This patch adds a heuristic to the vectorizer when estimating the
 minimum profitable number of iterations.  The heuristic is
 target-dependent, and is currently disabled for all targets except
 PowerPC.  However, the intent is to make it general enough to be 
 useful
 for other targets that want to opt in.
 
 A previous patch addressed some PowerPC SPEC degradations by modifying
 the vector cost model values for vec_perm and vec_promote_demote.  The
 values were set a little higher than their natural values because the
 natural values were not sufficient to prevent a poor vectorization
 choice.  However, this is not the right long-term solution, since it 
 can
 unnecessarily constrain other vectorization choices involving permute
 instructions.
 
 Analysis of the badly vectorized loop (in sphinx3) showed that the
 problem was overcommitment of vector resources -- too many vector
 instructions issued without enough non-vector instructions available 
 to
 cover the delays.  The vector cost model assumes that instructions
 always have a constant cost, and doesn't have a way of judging this 
 kind
 of density of vector instructions.
 
 The present patch adds a heuristic to recognize when a loop is likely 
 to
 overcommit resources, and adds a small penalty to the inside-loop cost
 to account for the expected stalls.  The heuristic is parameterized 
 with
 three target-specific values:
 
  * Density threshold: The heuristic will apply only when the
percentage of inside-loop cost attributable to vectorized
instructions exceeds this value.
 
  * Size threshold: The heuristic will apply only when the
inside-loop cost exceeds this value.
 
  * Penalty: The inside-loop cost will be increased by this
percentage value when the heuristic applies.
 
 Thus only reasonably large loop bodies that are mostly vectorized
 instructions will be affected.
 
 By applying only a small percentage bump to the inside-loop cost, the
 heuristic will only turn off vectorization for loops that were
 considered barely profitable to begin with (such as the sphinx3 
 loop).
 So the heuristic is quite conservative and should not affect the vast
 majority of vectorization decisions.
 
 Together with the new heuristic, this patch reduces the vec_perm and
 vec_promote_demote costs for PowerPC to their natural values.
 
 I've regstrapped this with no regressions on 
 powerpc64-unknown-linux-gnu
 and verified that no performance regressions occur on SPEC cpu2006.  
 Is
 this ok for trunk?

Hmm.  I don't like this patch or its general idea too much.  Instead
I'd like us to move more of the cost model detail to the target, giving
it a chance to look at the whole loop before deciding on a cost.  ISTR
posting the overall idea at some point, but let me repeat it here 
instead
of trying to find that e-mail.

The basic interface of the cost model should be, in targetm.vectorize

  /* Tell the target to start cost analysis of a loop or a basic-block
 (if the loop argument is NULL).  Returns an opaque pointer to
 target-private data.  */
  void *init_cost (struct loop *loop);

  /* Add cost for N vectorized-stmt-kind statements in vector_mode.  */
  void add_stmt_cost (void *data, unsigned n,
  vectorized-stmt-kind,
  enum machine_mode vector_mode);

  /* Tell the target to compute and return the cost of the accumulated
 statements and free any target-private data.  */
  unsigned finish_cost (void *data);

with eventually slightly different signatures for add_stmt_cost
(like pass in the original scalar stmt?).

It allows the target, at finish_cost time, to evaluate things like
register pressure and resource utilization.
   
   OK, I'm trying to understand how you would want this built into the
   present structure.  Taking just the loop case for now:
   
   Judging by your suggested API, we would have to call add_stmt_cost ()
   everywhere that we now call stmt_vinfo_set_inside_of_loop_cost ().  For
   now this would be an additional call, not a replacement, though maybe
   the other goes away eventually.  This allows the target to save more
   data about the vectorized instructions than just an accumulated cost
   number (order and quantity of various kinds of instructions can be
   maintained for better modeling).  Presumably the call to finish_cost
   would be 

Re: [RFC] Target-specific limits on vector alignment

2012-06-11 Thread Richard Earnshaw
On 11/06/12 15:53, Richard Guenther wrote:
 On Mon, Jun 11, 2012 at 4:38 PM, Richard Earnshaw rearn...@arm.com wrote:
 On 11/06/12 15:17, Richard Guenther wrote:
 On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote:
 The ARM ABI states that vectors larger than 64 bits in size still have
 64-bit alignment; nevertheless, the HW supports alignment hints of up
 to 128 bits in some cases and will trap if a vector has an alignment
 that is less than the hint.  GCC currently hard-codes larger vectors to be
 aligned by the size of the vector, which means that 128-bit vectors are
 marked as being 128-bit aligned.

 The ARM ABI unfortunately does not support generating such alignment for
 parameters passed by value and this can lead to traps at run time.  It
 seems that the best way to solve this problem is to allow the back-end
 to set an upper limit on the alignment permitted for a vector.

 I've implemented this as a separate hook, rather than using the existing
 hooks because there's a strong likelihood of breaking some existing ABIs
 if I did it another way.

 There are a couple of tests that will need some re-working before this
 can be committed to deal with the fall-out of making this change; I'll
 prepare those changes if this patch is deemed generally acceptable.

 Hm.  Where would you use that target hook?


 Doh!

 It was supposed to be in the patch set... :-(

 in layout_type(), where the alignment of a vector is forced to the size
 of the vector.
 
 Ok.
 
 + /* Naturally align vectors, but let the target set an upper
 +limit.  This prevents ABI changes depending on whether or
 +not native vector modes are supported.  */
 + TYPE_ALIGN (type)
 +   = targetm.vector_alignment (type, tree_low_cst (TYPE_SIZE (type), 0));
 
 The type argument or the size argument looks redundant.  

Technically, yes, we could get rid of tree_low_cst (TYPE_SIZE (type)
and calculate it inside the alignment function if it was needed.
However, it seemed likely that most targets would need that number one
way or another, such that passing it would be helpful.

 Note that we still
 can have such vector properly aligned, thus the vectorizer would need to
 use build_aligned_type also if it knows the type is aligned, not only when
 it thinks it is misaligned.  You basically change the alignment of the 
 default
 vector type.
 

I'm not sure I follow...

R.





Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread William J. Schmidt


On Mon, 2012-06-11 at 11:09 -0400, David Edelsohn wrote:
 On Mon, Jun 11, 2012 at 10:55 AM, Richard Guenther rguent...@suse.de wrote:
 
  Well, they are at least magic numbers and heuristics that apply
  generally and not only to the single issue in sphinx.  And in
  fact how it works for sphinx _is_ magic.
 
  Second, I suggest that you need to rephrase I can make you and
  re-send your reply.
 
  Sorry for my bad english.  Consider it meaning that I'd rather have
  you think about a more proper solution.  That's what patch review
  is about after all, no?  Sometimes a complete re-write (which gets
  more difficult which each of the patches enhancing the not ideal
  current state) is the best thing to do.
 
 Richard,
 
 The values of the heuristics may be magic, but Bill believes the
 heuristics are testing the important characteristics.  The heuristics
 themselves are controlled by hooks, so the target can set the correct
 values for their own requirements.
 
 The concern is that a general cost infrastructure is too general.
 And, based on history, all ports simply will copy the boilerplate from
 the first implementation. It also may cause more problems because the
 target has relatively little information to be able to judge
 heuristics at that point in the middle-end. If the targets start to
 get too cute or too complicated, it may cause more problems or more
 confusion about why more complicated heuristics are not effective and
 not producing the expected results.
 
 I worry about creating another machine dependent reorg catch-all pass.
 
 Maybe an incremental pre- and/or post- cost hook would be more
 effective. I will let Bill comment.

Thanks David,

I can see both sides of this, and it's hard to judge the future from
where I stand.  My belief is that the number of heuristics targets will
implement will be fairly limited, since judgments about cycle-level
costs are not accurately predictable during the middle end.  All we can
do is come up with a few things that seem to make sense.  Doing too much
in the back end seems impractical.

The interesting question to me is whether cost model heuristics are
general enough to be reusable.  What I saw in this case was what I
considered to be a somewhat target-neutral problem:  overwhelming those
assets of the processor that implement vectorization.  It seemed
reasonable to provide hooks for others to use the idea if they encounter
similar issues.  If reusing the heuristic is useful, then having to copy
the logic from one target to another isn't the best approach.  If nobody
else will ever use it, then embedding it in the back end is reasonable.
Unfortunately my crystal ball has been on the fritz for several decades,
so I can't tell you for sure which is right...

Richard, my biggest question is whether you think other targets are
likely to take advantage of a more general back-end interface, or
whether this will end up just being a PowerPC wart.  If you know of ways
this will be useful for i386, that would be helpful to know.  Perhaps
this requires your crystal ball as well; not sure how well yours
works...

If we look at just this one issue in isolation, then changing all the
code in the vectorizer that calculates inside/outside loop costs and
moving it to targetm seems more invasive than adding the few hooks.  But
if this will really be a useful feature for the community as a whole I
am certainly willing to tackle it.

Thanks,
Bill

 
 Thanks, David
 



Re: [C++] Reject variably modified types in operator new

2012-06-11 Thread Florian Weimer

On 06/01/2012 02:09 PM, Florian Weimer wrote:

On 06/01/2012 11:00 AM, Florian Weimer wrote:


I'll try to warn about this case and make the transformation to the
proper operator new[] call.


Here's the version. I've added a warning for the ill-formed code.

The only remaining glitch is in g++.dg/cpp0x/regress/debug-debug7.C,
specifically (b is not a constant):

int (*x)[b] = new int[a][b]; // { dg-error not usable }

The new warning I've added fires on this line, too. How can I check for
the pending error and suppress the warning?


As discussed in the other thread with Jason and Gaby, I have changed 
cp/parser.c to accept non-constant values and produce the error in 
build_new_1.  This is arguably the right place because this check must 
be performed after template instantiation.  (I will submit a follow-up 
patch for the remaining type check in the parser once this is accepted.)


testsuite/g++.dg/cpp0x/regress/debug-debug7.C remains problematic.  The 
diagnostic at parse time was definitely better, but I see no way to keep 
that.  I have reworded the diagnostics because the code no longer has 
access to the original syntax, so it cannot tell whether the operator 
new call involved a type-id in parens.


Bootstrapped and tested on x86_64-linux-gnu, with no new regressions.

--
Florian Weimer / Red Hat Product Security Team
2012-06-11  Florian Weimer  fwei...@redhat.com

	* init.c (build_new_1): Warn about (T[N]) for variable N, and
	reject T[M][N].

	* parser.c (cp_parser_direct_new_declarator): Accept non-constant
	expressions.  Handled now in build_new_1.

2012-06-11  Florian Weimer  fwei...@redhat.com

	* g++.dg/init/new35.C: New.
	* g++.dg/init/new36.C: New.
	* g++.dg/ext/vla5.C: New warning.
	* g++.dg/ext/vla8.C: New warning.
	* g++.dg/cpp0x/regress/debug-debug7.C: Update diagnostics.

Index: cp/init.c
===
--- cp/init.c	(revision 188384)
+++ cp/init.c	(working copy)
@@ -2175,6 +2175,7 @@
   tree pointer_type;
   tree non_const_pointer_type;
   tree outer_nelts = NULL_TREE;
+  bool outer_nelts_from_type = false;
   tree alloc_call, alloc_expr;
   /* The address returned by the call to operator new.  This node is
  a VAR_DECL and is therefore reusable.  */
@@ -2209,10 +2210,14 @@
 }
   else if (TREE_CODE (type) == ARRAY_TYPE)
 {
+  /* Transforms new (T[N]) to new T[N].  The former is a GNU
+	 extension.  (This also covers new T where T is a VLA
+	 typedef.)  */
   array_p = true;
   nelts = array_type_nelts_top (type);
   outer_nelts = nelts;
   type = TREE_TYPE (type);
+  outer_nelts_from_type = true;
 }
 
   /* If our base type is an array, then make sure we know how many elements
@@ -2220,11 +2225,40 @@
   for (elt_type = type;
TREE_CODE (elt_type) == ARRAY_TYPE;
elt_type = TREE_TYPE (elt_type))
-nelts = cp_build_binary_op (input_location,
-MULT_EXPR, nelts,
-array_type_nelts_top (elt_type),
-complain);
+{
+  tree inner_nelts = array_type_nelts_top (elt_type);
+  tree inner_nelts_cst = maybe_constant_value (inner_nelts);
+  if (!TREE_CONSTANT (inner_nelts_cst))
+	{
+	  if (complain  tf_error)
+	error_at (EXPR_LOC_OR_HERE (inner_nelts),
+		  array size in operator new must be constant);
+	  nelts = error_mark_node;
+	}
+  if (nelts != error_mark_node)
+	nelts = cp_build_binary_op (input_location,
+MULT_EXPR, nelts,
+inner_nelts_cst,
+complain);
+}
 
+  if (variably_modified_type_p (elt_type, NULL_TREE)  (complain  tf_error))
+{
+  error (variably modified type not allowed in operator new);
+  return error_mark_node;
+}
+
+  if (nelts == error_mark_node)
+return error_mark_node;
+
+  /* Warn if we performed the (T[N]) to T[N] transformation and N is
+ variable.  */
+  if (outer_nelts_from_type
+   !TREE_CONSTANT (maybe_constant_value (outer_nelts))
+   (complain  tf_warning_or_error))
+pedwarn(EXPR_LOC_OR_HERE (outer_nelts), OPT_Wvla,
+	ISO C++ does not support variable-length array types);
+
   if (TREE_CODE (elt_type) == VOID_TYPE)
 {
   if (complain  tf_error)
Index: cp/parser.c
===
--- cp/parser.c	(revision 188384)
+++ cp/parser.c	(working copy)
@@ -6845,41 +6845,34 @@
   while (true)
 {
   tree expression;
+  cp_token *token;
 
   /* Look for the opening `['.  */
   cp_parser_require (parser, CPP_OPEN_SQUARE, RT_OPEN_SQUARE);
-  /* The first expression is not required to be constant.  */
-  if (!declarator)
+
+  token = cp_lexer_peek_token (parser-lexer);
+  expression = cp_parser_expression (parser, /*cast_p=*/false, NULL);
+  /* The standard requires that the expression have integral
+	 type.  DR 74 adds enumeration types.  We believe that the
+	 real intent is that these expressions be handled like the
+	 expression in a `switch' condition, which also 

Re: [PATCH] don't presume undelegitimized UNSPEC_TLS SYMBOL_REF is a decl

2012-06-11 Thread Roland McGrath
ping?

On Wed, Jun 6, 2012 at 2:04 PM, Roland McGrath mcgra...@google.com wrote:
 cf this change:

        2010-11-19  Jakub Jelinek  ja...@redhat.com

                PR target/45870
                * dwarf2out.c (const_ok_for_output_1): Don't complain about
                non-delegitimized TLS UNSPECs.

 This case hit me where the rtx was:

        (unspec:SI [
                (symbol_ref:SI (*.LANCHOR0) [flags 0x1aa])
                (const_int 4 [0x4])
            ] UNSPEC_TLS)

 Note:
 1. The UNSPEC has two operands, not one.
 2. The SYMBOL_REF does not correspond to any decl.

 This corresponds to this ARM code:

                ldr     r3, .L10+4
        ...
        .L10:
                something else
                .word   .LANCHOR0(tpoff)
        ...
                .section        .tdata,awT,%progbits
                .align  4
        .LANCHOR0 = . + 0
                .type   tdata1, %object
                .size   tdata1, 4
        tdata1:
                .word   1

 The only way I know to reproduce this is using a variant ARM target that
 I'm still developing and is not yet ready to be submitted, so I don't have
 a proper test case to offer.  But I think the principle of the following
 change is fairly sound.

 What do you think?  (Recall that I am not a GCC committer, so if you like
 the change, please commit it for me.)


 Thanks,
 Roland


 2012-06-06  Roland McGrath  mcgra...@google.com

        * dwarf2out.c (const_ok_for_output_1): Detect a TLS UNSPEC using
        SYMBOL_REF_TLS_MODEL rather than DECL_THREAD_LOCAL_P, in case it's
        not a VAR_DECL.  Also don't limit it to UNSPECs with exactly one
        operand.

 diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c
 index 6e4ab76..bc68205 100644
 --- a/gcc/dwarf2out.c
 +++ b/gcc/dwarf2out.c
 @@ -10129,12 +10129,12 @@ const_ok_for_output_1 (rtx *rtlp, void *data 
 ATTRIBUTE_UNUSED)
         we can't express it in the debug info.  */
  #ifdef ENABLE_CHECKING
       /* Don't complain about TLS UNSPECs, those are just too hard to
 -        delegitimize.  */
 -      if (XVECLEN (rtl, 0) != 1
 +        delegitimize.  Note this could be a non-decl SYMBOL_REF such as
 +        one in a constant pool entry, so testing SYMBOL_REF_TLS_MODEL
 +        rather than DECL_THREAD_LOCAL_P is not just an optimization.  */
 +      if (XVECLEN (rtl, 0) == 0
          || GET_CODE (XVECEXP (rtl, 0, 0)) != SYMBOL_REF
 -         || SYMBOL_REF_DECL (XVECEXP (rtl, 0, 0)) == NULL
 -         || TREE_CODE (SYMBOL_REF_DECL (XVECEXP (rtl, 0, 0))) != VAR_DECL
 -         || !DECL_THREAD_LOCAL_P (SYMBOL_REF_DECL (XVECEXP (rtl, 0, 0
 +         || SYMBOL_REF_TLS_MODEL (XVECEXP (rtl, 0, 0)) == TLS_MODEL_NONE)
        inform (current_function_decl
                ? DECL_SOURCE_LOCATION (current_function_decl)
                : UNKNOWN_LOCATION,


Re: RFA: Alternative iterator implementation

2012-06-11 Thread Tejas Belagod

Richard Sandiford wrote:

As discussed in the context of the AARCH64 submission, this patch
rewrites the iterator handling in read-rtl.c so that we record
iterator positions using an on-the-side VEC rather than placeholder
modes and codes.  We then substitute in-place for each sequence of
iterator values and take a deep copy of the result.  We do any string
substitutions during the copy.

The patch also generalises the current use of attributes for rtx modes
((zero_extend:WIDER_MODE ...), etc.) so that the same kind of thing
can be done with future iterator types, including the int iterators.
Not sure whether that'll be useful or not, but it made the patch
easier to write.

Tested by making sure that the insn-*.c output didn't change for
x86_64-linux-gnu, except that we now do a better job of retaining
#line information (not seen as a good thing by everybody, I realise).
Also made sure that things like insn-output.c are still generated
in the blink of an eye.

Bootstrapped and regression-tested on x86_64-linux-gnu and i686-pc-linux-gnu.
OK to install?

Richard


gcc/
* read-rtl.c (mapping): Remove index field.  Add current_value field.
Define heap vectors.
(iterator_group): Fix long line.  Remove num_builtins field and
uses_iterator fields.  Make apply_iterator take a void * parameter.
(iterator_use, atttribute_use): New structures.
(iterator_traverse_data, BELLWETHER_CODE, bellwether_codes): Delete.
(current_iterators, iterator_uses, attribute_uses): New variables.
(uses_mode_iterator_p, uses_code_iterator_p): Delete.
(apply_mode_iterator, apply_code_iterator): Take a void * parameter.
(map_attr_string, apply_iterator_to_string): Remove iterator
and value parameters.  Look through all current iterator values
for a matching attribute.
(mode_attr_index, apply_mode_maps): Delete.
(apply_iterator_to_rtx): Replace with...
(copy_rtx_for_iterators): ...this new function.
(uses_iterator_p, apply_iterator_traverse): Delete.
(apply_attribute_uses, add_current_iterators, apply_iterators): New
functions.
(add_mapping): Remove index field.  Set current_value field.
(initialize_iterators): Don't set num_builtins and uses_iterator_p
fields.
(find_iterator): Delete.
(record_iterator_use, record_attribute_use): New functions.
(record_potential_iterator_use): New function.
(check_code_iterator): Remove handling of bellwether codes.
(read_rtx): Remove mode maps.  Truncate iterator and attribute uses.
(read_rtx_code, read_nested_rtx, read_rtx_variadic): Remove mode_maps
parameter.  Use the first code iterator value instead of the
bellwether_codes array.  Use record_potential_iterator_use
for modes.

Index: gcc/read-rtl.c
===
--- gcc/read-rtl.c  2012-06-03 08:58:32.251211521 +0100
+++ gcc/read-rtl.c  2012-06-03 09:20:47.633208254 +0100
@@ -41,7 +41,7 @@ struct map_value {
 };

 /* Maps an iterator or attribute name to a list of (integer, string) pairs.
-   The integers are mode or code values; the strings are either C conditions
+   The integers are iterator values; the strings are either C conditions
or attribute values.  */
 struct mapping {
   /* The name of the iterator or attribute.  */
@@ -50,82 +50,80 @@ struct mapping {
   /* The group (modes or codes) to which the iterator or attribute belongs.  */
   struct iterator_group *group;

-  /* Gives a unique number to the attribute or iterator.  Numbers are
- allocated consecutively, starting at 0.  */
-  int index;
-
   /* The list of (integer, string) pairs.  */
   struct map_value *values;
+
+  /* For iterators, records the current value of the iterator.  */
+  struct map_value *current_value;
 };

-/* A structure for abstracting the common parts of code and mode iterators.  */
+/* Vector definitions for the above.  */
+typedef struct mapping *mapping_ptr;
+DEF_VEC_P (mapping_ptr);
+DEF_VEC_ALLOC_P (mapping_ptr, heap);
+
+/* A structure for abstracting the common parts of iterators.  */
 struct iterator_group {
-  /* Tables of mapping structures, one for attributes and one for iterators. 
 */
+  /* Tables of mapping structures, one for attributes and one for
+ iterators.  */
   htab_t attrs, iterators;

-  /* The number of real modes or codes (and by extension, the first
- number available for use as an iterator placeholder).  */
-  int num_builtins;
-
-  /* Treat the given string as the name of a standard mode or code and
+  /* Treat the given string as the name of a standard mode, etc., and
  return its integer value.  */
   int (*find_builtin) (const char *);

-  /* Return true if the given rtx uses the given mode or code.  */
-  bool (*uses_iterator_p) (rtx, int);
+  /* Make the given pointer use the given iterator value.  */
+  void 

Re: [PATCH] Hoist adjacent pointer loads

2012-06-11 Thread William J. Schmidt
On Mon, 2012-06-11 at 14:59 +0200, Richard Guenther wrote:
 On Mon, 11 Jun 2012, William J. Schmidt wrote:
 
  On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote:
   On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt
   wschm...@linux.vnet.ibm.com wrote:
Hi Richard,
   
Here's a revision of the hoist-adjacent-loads patch.  I'm sorry for the
delay since the last revision, but my performance testing has been
blocked waiting for a fix to PR53487.  I ended up applying a test
version of the patch to 4.7 and ran performance numbers with that
instead, with no degradations.
   
In addition to addressing your comments, this patch contains one bug fix
where local_mem_dependence was called on the wrong blocks after swapping
def1 and def2.
   
Bootstrapped with no regressions on powerpc64-unknown-linux-gnu.  Is
this version ok for trunk?  I won't commit it until I can do final
testing on trunk in conjunction with a fix for PR53487.
   
Thanks,
Bill
   
   
2012-06-04  Bill Schmidt  wschm...@linux.vnet.ibm.com
   
   * opts.c: Add -fhoist_adjacent_loads to -O2 and above.
   * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to 
forward
   declaration.
   (hoist_adjacent_loads, gate_hoist_loads): New forward 
declarations.
   (tree_ssa_phiopt): Call gate_hoist_loads.
   (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
   (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; 
call
   hoist_adjacent_loads.
   (local_mem_dependence): New function.
   (hoist_adjacent_loads): Likewise.
   (gate_hoist_loads): Likewise.
   * common.opt (fhoist-adjacent-loads): New switch.
   * Makefile.in (tree-ssa-phiopt.o): Added dependencies.
   * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
   
   
Index: gcc/opts.c
===
--- gcc/opts.c  (revision 187805)
+++ gcc/opts.c  (working copy)
@@ -489,6 +489,7 @@ static const struct default_options default_option
{ OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
{ OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
+{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
   
/* -O3 optimizations.  */
{ OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
Index: gcc/tree-ssa-phiopt.c
===
--- gcc/tree-ssa-phiopt.c   (revision 187805)
+++ gcc/tree-ssa-phiopt.c   (working copy)
@@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
 #include cfgloop.h
 #include tree-data-ref.h
 #include tree-pretty-print.h
+#include gimple-pretty-print.h
+#include insn-config.h
+#include expr.h
+#include optabs.h
   
+#ifndef HAVE_conditional_move
+#define HAVE_conditional_move (0)
+#endif
+
 static unsigned int tree_ssa_phiopt (void);
-static unsigned int tree_ssa_phiopt_worker (bool);
+static unsigned int tree_ssa_phiopt_worker (bool, bool);
 static bool conditional_replacement (basic_block, basic_block,
edge, edge, gimple, tree, tree);
 static int value_replacement (basic_block, basic_block,
@@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
 static bool cond_if_else_store_replacement (basic_block, basic_block, 
basic_block);
 static struct pointer_set_t * get_non_trapping (void);
 static void replace_phi_edge_with_variable (basic_block, edge, gimple, 
tree);
+static void hoist_adjacent_loads (basic_block, basic_block,
+ basic_block, basic_block);
+static bool gate_hoist_loads (void);
   
 /* This pass tries to replaces an if-then-else block with an
   assignment.  We have four kinds of transformations.  Some of these
@@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
 bb2:
   x = PHI x' (bb0), ...;
   
-   A similar transformation is done for MAX_EXPR.  */
+   A similar transformation is done for MAX_EXPR.
   
+
+   This pass also performs a fifth transformation of a slightly 
different
+   flavor.
+
+   Adjacent Load Hoisting
+   --
+
+   This transformation replaces
+
+ bb0:
+   if (...) goto bb2; else goto bb1;
+ bb1:
+   x1 = (expr).field1;
+   goto bb3;
+ bb2:
+   x2 = (expr).field2;
+ bb3:
+   # x = PHI x1, x2;
+
+   with
+
+ bb0:
+   x1 = (expr).field1;
+   x2 = (expr).field2;
+   if (...) goto bb2; else goto bb1;
+ bb1:
+   goto 

Re: [PATCH] Hoist adjacent pointer loads

2012-06-11 Thread William J. Schmidt
On Mon, 2012-06-11 at 12:11 -0500, William J. Schmidt wrote:

 I found this parameter that seems to correspond to well-predicted
 conditional jumps:
 
 /* When branch is predicted to be taken with probability lower than this
threshold (in percent), then it is considered well predictable. */
 DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME,
 predictable-branch-outcome,
 Maximal estimated outcome of branch considered predictable,
 2, 0, 50)
 
...which has an interface predictable_edge_p () in predict.c, so that's
what I'll use.

Thanks,
Bill




Re: [PATCH 1/2] SH epilogue unwind, dwarf2 pass changes

2012-06-11 Thread Richard Henderson
On 2012-06-11 01:34, Chung-Lin Tang wrote:
 Ping?
 
 On 2012/6/1 06:24 PM, Chung-Lin Tang wrote:
 On 12/5/23 1:46 AM, Richard Henderson wrote:
 On 05/18/12 03:48, Chung-Lin Tang wrote:
 @@ -2401,6 +2401,7 @@ scan_trace (dw_trace_info *trace)
{
  /* Propagate across fallthru edges.  */
  dwarf2out_flush_queued_reg_saves ();
 +def_cfa_1 (this_cfa);
  maybe_record_trace_start (insn, NULL);
  break;
}
 @@ -2455,6 +2456,18 @@ scan_trace (dw_trace_info *trace)
  cur_cfa = this_cfa;
  continue;
}
 +else
 +  {
 +/* If ELT is a annulled branch-taken instruction (i.e. 
 executed
 +   only when branch is not taken), the args_size and CFA 
 should
 +   not change through the jump.  */
 +create_trace_edges (control);
 +
 +/* Update and continue with the trace.  */
 +add_cfi_insn = insn;
 +scan_insn_after (elt);
 +continue;
 +  }

 I think the def_cfa_1 is misplaced.  It should be immediately before
 that last continue.  That mirrors the sort of flow you get via the
 other paths through the loop.

 Or possibly moved before the dwarf2out_flush_queued_reg_saves () call in
 the patch? (in the save_point_p () break case)  Note I'm only saying
 this based on overall ordering of those two routine calls in the loop.

 Attached is a testcase (adapted from libgomp) that, with the SH epilogue
 unwind patch applied alone, produces the ICE I'm seeing (-O2
 -funwind-tables).

 This dwarf2 patch, with any of the three new def_cfa_1() call sites
 seems to solve it, though you might want to comment on which call site
 seems correct

I've committed the following, after a --enable-languages=c,c++ --target=sh-elf
build and check-gcc with your patch 2/2 applied.  I eyeballed the resulting
debug info and it looked correct.

Given the test case was extracted from libgomp I didn't commit that, trusting
that the real testing will continue to happen during a build.


r~
* dwarf2cfi.c (scan_trace): Handle annulled branch-taken delay slots.



diff --git a/gcc/dwarf2cfi.c b/gcc/dwarf2cfi.c
index bf2d802f..3edb6e1 100644
--- a/gcc/dwarf2cfi.c
+++ b/gcc/dwarf2cfi.c
@@ -2429,18 +2429,20 @@ scan_trace (dw_trace_info *trace)
 
  elt = XVECEXP (pat, 0, 1);
 
- /* If ELT is an instruction from target of an annulled branch,
-the effects are for the target only and so the args_size
-and CFA along the current path shouldn't change.  */
  if (INSN_FROM_TARGET_P (elt))
{
  HOST_WIDE_INT restore_args_size;
  cfi_vec save_row_reg_save;
 
+ /* If ELT is an instruction from target of an annulled
+branch, the effects are for the target only and so
+the args_size and CFA along the current path
+shouldn't change.  */
  add_cfi_insn = NULL;
  restore_args_size = cur_trace-end_true_args_size;
  cur_cfa = cur_row-cfa;
- save_row_reg_save = VEC_copy (dw_cfi_ref, gc, 
cur_row-reg_save);
+ save_row_reg_save
+   = VEC_copy (dw_cfi_ref, gc, cur_row-reg_save);
 
  scan_insn_after (elt);
 
@@ -2453,8 +2455,20 @@ scan_trace (dw_trace_info *trace)
  cur_row-cfa = this_cfa;
  cur_row-reg_save = save_row_reg_save;
  cur_cfa = this_cfa;
- continue;
}
+ else
+   {
+ /* If ELT is a annulled branch-taken instruction (i.e.
+executed only when branch is not taken), the args_size
+and CFA should not change through the jump.  */
+ create_trace_edges (control);
+
+ /* Update and continue with the trace.  */
+ add_cfi_insn = insn;
+ scan_insn_after (elt);
+ def_cfa_1 (this_cfa);
+   }
+ continue;
}
 
  /* The insns in the delay slot should all be considered to happen


Re: [PATCH] Add vector cost model density heuristic

2012-06-11 Thread Richard Guenther
On Mon, Jun 11, 2012 at 5:38 PM, William J. Schmidt
wschm...@linux.vnet.ibm.com wrote:


 On Mon, 2012-06-11 at 11:09 -0400, David Edelsohn wrote:
 On Mon, Jun 11, 2012 at 10:55 AM, Richard Guenther rguent...@suse.de wrote:

  Well, they are at least magic numbers and heuristics that apply
  generally and not only to the single issue in sphinx.  And in
  fact how it works for sphinx _is_ magic.
 
  Second, I suggest that you need to rephrase I can make you and
  re-send your reply.
 
  Sorry for my bad english.  Consider it meaning that I'd rather have
  you think about a more proper solution.  That's what patch review
  is about after all, no?  Sometimes a complete re-write (which gets
  more difficult which each of the patches enhancing the not ideal
  current state) is the best thing to do.

 Richard,

 The values of the heuristics may be magic, but Bill believes the
 heuristics are testing the important characteristics.  The heuristics
 themselves are controlled by hooks, so the target can set the correct
 values for their own requirements.

 The concern is that a general cost infrastructure is too general.
 And, based on history, all ports simply will copy the boilerplate from
 the first implementation. It also may cause more problems because the
 target has relatively little information to be able to judge
 heuristics at that point in the middle-end. If the targets start to
 get too cute or too complicated, it may cause more problems or more
 confusion about why more complicated heuristics are not effective and
 not producing the expected results.

 I worry about creating another machine dependent reorg catch-all pass.

 Maybe an incremental pre- and/or post- cost hook would be more
 effective. I will let Bill comment.

 Thanks David,

 I can see both sides of this, and it's hard to judge the future from
 where I stand.  My belief is that the number of heuristics targets will
 implement will be fairly limited, since judgments about cycle-level
 costs are not accurately predictable during the middle end.  All we can
 do is come up with a few things that seem to make sense.  Doing too much
 in the back end seems impractical.

 The interesting question to me is whether cost model heuristics are
 general enough to be reusable.  What I saw in this case was what I
 considered to be a somewhat target-neutral problem:  overwhelming those
 assets of the processor that implement vectorization.  It seemed
 reasonable to provide hooks for others to use the idea if they encounter
 similar issues.  If reusing the heuristic is useful, then having to copy
 the logic from one target to another isn't the best approach.  If nobody
 else will ever use it, then embedding it in the back end is reasonable.
 Unfortunately my crystal ball has been on the fritz for several decades,
 so I can't tell you for sure which is right...

 Richard, my biggest question is whether you think other targets are
 likely to take advantage of a more general back-end interface, or
 whether this will end up just being a PowerPC wart.  If you know of ways
 this will be useful for i386, that would be helpful to know.  Perhaps
 this requires your crystal ball as well; not sure how well yours
 works...

 If we look at just this one issue in isolation, then changing all the
 code in the vectorizer that calculates inside/outside loop costs and
 moving it to targetm seems more invasive than adding the few hooks.  But
 if this will really be a useful feature for the community as a whole I
 am certainly willing to tackle it.

Well.  At the moment the cost model is purely local in that the target
receives no context about the cost it has to return for (part of) a vectorized
operation.  In fact the vectorizer already creates multiple
instructions (and thus
calls to the cost target hook) for a single scalar statement in some cases.
There are two ways to increase the context the target can look at - invent
more kinds of statements we pass to the cost model hook, and thus
arrive at a point where for each group of stmts the vectorizer emits and
that possibly has a more efficient target implementation we have a different
statement kind (only for the cost model, we continue to generate multiple
GIMPLE stmts).  The other way is to give the target the ability to see context,
thus relation of the (sub-)statements the vectorizer asks the target to compute
costs for.  I've run into the situation where there was missing context on x86,
too (but I forgot the specific case), and that's where I came up with
the general idea.
Of course both work, but for your kind of heuristic only looking at
context works.
In the scheme I suggested the fini_cost hook would be able to do the counting
(as would a simple counter - I don't believe that this would not work - you do
counting, too, after all, simply make the fini hook able to reject the
vectorization).
If powerpc is issue-limited then you have the additional issue that you have no
idea of the amount of address calculations 

Re: [RFC] Target-specific limits on vector alignment

2012-06-11 Thread Richard Guenther
On Mon, Jun 11, 2012 at 5:25 PM, Richard Earnshaw rearn...@arm.com wrote:
 On 11/06/12 15:53, Richard Guenther wrote:
 On Mon, Jun 11, 2012 at 4:38 PM, Richard Earnshaw rearn...@arm.com wrote:
 On 11/06/12 15:17, Richard Guenther wrote:
 On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote:
 The ARM ABI states that vectors larger than 64 bits in size still have
 64-bit alignment; nevertheless, the HW supports alignment hints of up
 to 128-bits in some cases and will trap if a vector has an alignment
 that is less than the hint.  GCC currently hard-codes larger vectors to be
 aligned by the size of the vector, which means that 128-bit vectors are
 marked as being 128-bit aligned.

 The ARM ABI unfortunately does not support generating such alignment for
 parameters passed by value and this can lead to traps at run time.  It
 seems that the best way to solve this problem is to allow the back-end
 to set an upper limit on the alignment permitted for a vector.

 I've implemented this as a separate hook, rather than using the existing
 hooks because there's a strong likelihood of breaking some existing ABIs
 if I did it another way.

 There are a couple of tests that will need some re-working before this
 can be committed to deal with the fall-out of making this change; I'll
 prepare those changes if this patch is deemed generally acceptable.

 Hm.  Where would you use that target hook?


 Doh!

 It was supposed to be in the patch set... :-(

 in layout_type(), where the alignment of a vector is forced to the size
 of the vector.

 Ok.

 +     /* Naturally align vectors, but let the target set an upper
 +        limit.  This prevents ABI changes depending on whether or
 +        not native vector modes are supported.  */
 +     TYPE_ALIGN (type)
 +       = targetm.vector_alignment (type, tree_low_cst (TYPE_SIZE (type), 
 0));

 The type argument or the size argument looks redundant.

 Technically, yes, we could get rid of tree_low_cst (TYPE_SIZE (type)
 and calculate it inside the alignment function if it was needed.
 However, it seemed likely that most targets would need that number one
 way or another, such that passing it would be helpful.

Well, you don't need it in stor-layout and targets might think the value may
be completely unrelated to the type ...

 Note that we still
 can have such vector properly aligned, thus the vectorizer would need to
 use build_aligned_type also if it knows the type is aligned, not only when
 it thinks it is misaligned.  You basically change the alignment of the 
 default
 vector type.


 I'm not sure I follow...

I say that a large vector may be still aligned, so the vectorizer when creating
vector memory references has to use a non-default aligned vector type when
the vector is aligned.  It won't do that at the moment.

Richard.

 R.





[PATCH][Cilkplus] Patch to make internal struct runtime independent

2012-06-11 Thread Iyer, Balaji V
Hello Everyone,
   This patch is for the Cilkplus branch affecting both C and C++ compilers. 
This patch will make the cilkrts_stack_frame struct independent of the runtime.

Thanking You,

Yours Sincerely,

Balaji V. Iyer.
Index: gcc/cilk.c
===
--- gcc/cilk.c  (revision 188393)
+++ gcc/cilk.c  (working copy)
@@ -165,7 +165,7 @@
   TYPE_ALIGN (frame) = PREFERRED_STACK_BOUNDARY;
   TREE_ADDRESSABLE (frame) = 1;
   /* XXX Is this right? */
-  finish_builtin_struct (frame, __cilkrts_stack_frame, fields, NULL_TREE);
+  finish_builtin_struct (frame, __cilkrts_stack_frame_GCC, fields, 
NULL_TREE);
   /* XXX C++ only SET_CLASS_TYPE_P(record, 1); */
   /* XXX C++ only xref_basetypes (record, NULL_TREE); */
   cilk_frame_type_decl = frame;
Index: gcc/ChangeLog.cilk
===
--- gcc/ChangeLog.cilk  (revision 188393)
+++ gcc/ChangeLog.cilk  (working copy)
@@ -1,3 +1,8 @@
+2012-06-11  Balaji V. Iyer  balaji.v.i...@intel.com
+
+   * cilk.c (cilk_init_builtins): Made the stack frame structure runtime
+   independent.
+
 2012-06-05  Balaji V. Iyer  balaji.v.i...@intel.com
 
* cilk-spawn.c (compute_loop_var): Removed an unwanted assert.


Re: [PATCH] Correct cost model for strided loads

2012-06-11 Thread Richard Guenther
On Mon, Jun 11, 2012 at 5:01 PM, William J. Schmidt
wschm...@linux.vnet.ibm.com wrote:


 On Mon, 2012-06-11 at 16:10 +0200, Richard Guenther wrote:
 On Mon, 11 Jun 2012, William J. Schmidt wrote:

 
 
  On Mon, 2012-06-11 at 11:15 +0200, Richard Guenther wrote:
   On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt
   wschm...@linux.vnet.ibm.com wrote:
The fix for PR53331 caused a degradation to 187.facerec on
powerpc64-unknown-linux-gnu.  The following simple patch reverses the
degradation without otherwise affecting SPEC cpu2000 or cpu2006.
Bootstrapped and regtested on that platform with no new regressions.  
Ok
for trunk?
  
   Well, would the real cost not be subparts * scalar_to_vec plus
   subparts * vec_perm?
   At least vec_perm isn't the cost for building up a vector from N scalar 
   elements
   either (it might be close enough if subparts == 2).  What's the case
   with facerec
   here?  Does it have subparts == 2?
 
  In this case, subparts == 4 (32-bit floats, 128-bit vec reg).  On
  PowerPC, this requires two merge instructions and a permute instruction
  to get the four 32-bit quantities into the right place in a 128-bit
  register.  Currently this is modeled as a vec_perm in other parts of the
  vectorizer per Ira's earlier patches, so I naively changed this to do
  the same thing.

 I see.

  The types of vectorizer instructions aren't documented, and I can't
  infer much from the i386.c cost model, so I need a little education.
  What semantics are represented by scalar_to_vec?

 It's a vector splat, thus x - { x, x, x, ... }.  You can create
 { x, y, z, ... } by N such splats plus N - 1 permutes (if a permute,
 as VEC_PERM_EXPR, takes two input vectors).  That's by far not
 the most efficient way to build up such a vector of course (with AVX
 you could do one splat plus N - 1 inserts for example).  The cost
 is of course dependent on the number of vector elements, so a
 simple new enum vect_cost_for_stmt kind does not cover it but
 the target would have to look at the vector type passed and do
 some reasonable guess.

 Ah, splat!  Yes, that's lingo I understand.  I see the intent now.


  On PowerPC, we have this mapping of the floating-point registers and
  vector float registers where they overlap (the low-order half of each of
  the first 32 vector float regs corresponds to a scalar float reg).  So
  in this case we have four scalar loads that place things in the bottom
  half of four vector registers, two vector merge instructions that
  collapse the four registers into two vector registers, and a vector
  permute that gets things in the right order.(*)  I wonder if what we
  refer to as a merge instruction is similar to scalar_to_vec.

 Looks similar to x86 SSE then.

  If so, then maybe we need something like
 
       subparts = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
       inside_cost += vect_get_stmt_cost (scalar_load) * ncopies * subparts;
       inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec) * 
  subparts / 2;
       inside_cost += ncopies * vect_get_stmt_cost (vec_perm);
 
  But then we'd have to change how vec_perm is modeled elsewhere for
  PowerPC based on Ira's earlier patches.  As I said, it's difficult for
  me to figure out all the intent of cost model decisions that have been
  made in the past, using current documentation.

 Heh, usually the intent was to make the changes simple, not to compute
 a proper cost.

 I think we simply need a new scalars_to_vec cost kind.

 That works.  Maybe vec_construct gets the point across a little better?
 I think we need to use the full builtin_vectorization_cost interface
 instead of vect_get_stmt_cost here, so the targets can parameterize on
 type.  Then we can just do one cost calculation for vec_construct that
 covers the full costs of getting the vector in order after the loads.

Yes, vec_construct sounds good to me.


   I really wanted to pessimize this case
   for say AVX and char elements, thus building up a vector from 32 scalars 
   which
   certainly does not cost a mere vec_perm.  So, maybe special-case the
   subparts == 2 case and assume vec_perm would match the cost only in that
   case.
 
  (I'm a little confused by this as what you have at the moment is a
  single scalar_to_vec per copy, which has a cost of 1 on most i386
  targets (occasionally 2).  The subparts multiplier is only applied to
  the loads.  So changing this to vec_perm seemed to be a no-op for i386.)

 Oh, I somehow read the patch as you were removing the multiplication
 by TYPE_VECTOR_SUBPARTS.  But yes, the cost is way off and I'd wanted
 to reflect it with N scalar loads plus N splats plus N - 1 permutes
 originally.  You could also model it with N scalar loads plus N
 inserts (but we don't have a vec_insert cost either).  I think adding
 a scalars_to_vec or vec_init or however we want to call it - basically
 what the cost of a vector CONSTRUCTOR would be - is best.

  (*) There are actually 

[PATCH][Cilkplus] Patch to make internal struct runtime independent

2012-06-11 Thread Iyer, Balaji V
Hello Everyone,
   This patch is for the Cilkplus branch affecting both C and C++ compilers. 
This patch will make the cilkrts_stack_frame structure independent of the 
runtime.

Thanking You,

Yours Sincerely,

Balaji V. Iyer.





Index: gcc/cilk.c
===
--- gcc/cilk.c  (revision 188393)
+++ gcc/cilk.c  (working copy)
@@ -165,7 +165,7 @@
   TYPE_ALIGN (frame) = PREFERRED_STACK_BOUNDARY;
   TREE_ADDRESSABLE (frame) = 1;
   /* XXX Is this right? */
-  finish_builtin_struct (frame, __cilkrts_stack_frame, fields, NULL_TREE);
+  finish_builtin_struct (frame, __cilkrts_stack_frame_GCC, fields, 
NULL_TREE);
   /* XXX C++ only SET_CLASS_TYPE_P(record, 1); */
   /* XXX C++ only xref_basetypes (record, NULL_TREE); */
   cilk_frame_type_decl = frame;
Index: gcc/ChangeLog.cilk
===
--- gcc/ChangeLog.cilk  (revision 188393)
+++ gcc/ChangeLog.cilk  (working copy)
@@ -1,3 +1,8 @@
+2012-06-11  Balaji V. Iyer  balaji.v.i...@intel.com
+
+   * cilk.c (cilk_init_builtins): Made the stack frame structure runtime
+   independent.
+
 2012-06-05  Balaji V. Iyer  balaji.v.i...@intel.com
 
* cilk-spawn.c (compute_loop_var): Removed an unwanted assert.


Make timevar phases mutually exclusive. (issue6302064)

2012-06-11 Thread Lawrence Crowl
The intent of the phases was to have a high-level but mutually exclusive
accounting of compile time.  We want to track compile time in a way that
tells us which conceptual phases are taking the most time.  That intent
is not currently satisfied.  This patch restores that intent.

Add code to verify that the sum of the phase times is less than the total
time, to detect when phases are overlapped.  A slight amount of leeway
is required due to time jitters.  This verification is done as the last
step in printing timevars so that any timevar information is not lost.

Rename the phases to be clearer about what they measure, so that they are
less likely to be modified to be overlapping.  The primary example is
to change TV_PHASE_GENERATE to TV_PHASE_LATE_ASM, meaning late work on
the assembly.  This name change avoids confusion in moving the timevar
start call after the call to lang_hooks.decls.final_write_globals,
which prevents overlapping phases.

Each implementation of lang_hooks.decls.final_write_globals, is
responsible for starting and stopping its own phases.  Each implementation
currently has a first phase of TV_PHASE_DEFERRED for front-end work
deferred until after parsing is complete.  The second phase has been
renamed from TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN, to better reflect its
use as the main optimization and generation phase.  This phase accounts
for 70%-80% of compilation time.  The third phase is TV_PHASE_DBGINFO,
except in cp/decl2.c, where it is TV_PHASE_CHECK_DBGINFO because
cc1plus mixes checking in with debug info generation.  In langhooks.c,
write_global_declarations was using TV_PHASE_CHECK_DBGINFO, but it was
doing no checking.  So, it now uses TV_PHASE_DBGINFO.

The changes to LTO are significant.  First, initialization now uses
TV_PHASE_SETUP.  Reading files now uses TV_PHASE_STREAM_IN.  Writing files
now uses TV_PHASE_STREAM_OUT.  The remaining phase is TV_PHASE_OPT_GEN
(formerly TV_PHASE_CGRAPH).

Tested on x86_64.

Okay for trunk?


Index: gcc/ChangeLog

2012-06-11  Lawrence Crowl  cr...@google.com

* timevar.def (TV_PHASE_GENERATE): Rename to TV_PHASE_LATE_ASM.
(TV_PHASE_CGRAPH): Rename to TV_PHASE_OPT_GEN.
(TV_PHASE_STREAM_IN): New.
(TV_PHASE_STREAM_OUT): New.
* timevar.c (validate_phases): New.
(timevar_print): Call validate_phases.
* c-decl.c (c_write_global_declarations): Rename use of TV_PHASE_CGRAPH
to TV_PHASE_OPT_GEN.
* langhooks.c (write_global_declarations): Rename use of
TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN.  Use TV_PHASE_DBGINFO instead of
TV_PHASE_CHECK_DBGINFO.
* toplev.c (compile_file): Rename use of TV_PHASE_GENERATE to
TV_PHASE_LATE_ASM.  Move start of TV_PHASE_LATE_ASM to after call to
lang_hooks.decls.final_write_globals.

Index: gcc/cp/ChangeLog

2012-06-11  Lawrence Crowl  cr...@google.com

* decl2.c (cp_write_global_declarations): Rename use of TV_PHASE_CGRAPH
to TV_PHASE_OPT_GEN.

Index: gcc/lto/ChangeLog

2012-06-11  Lawrence Crowl  cr...@google.com

* lto.c (do_whole_program_analysis):  Rename use of TV_PHASE_CGRAPH to
TV_PHASE_OPT_GEN.  Use new timevar TV_PHASE_STREAM_OUT around the call
to lto_wpa_write_files.
(lto_main):  Rename use of TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN.  Move
start of TV_PHASE_OPT_GEN to include call to materialize_cgraph.  Use
TV_PHASE_SETUP for the call to lto_init.  Use new timevar
TV_PHASE_STREAM_IN around the call to read_cgraph_and_symbols.

Index: gcc/toplev.c
===
--- gcc/toplev.c(revision 188335)
+++ gcc/toplev.c(working copy)
@@ -558,18 +558,15 @@ compile_file (void)
   if (flag_syntax_only || flag_wpa)
 return;
 
-  timevar_start (TV_PHASE_GENERATE);
-
   ggc_protect_identifiers = false;
 
   /* This must also call finalize_compilation_unit.  */
   lang_hooks.decls.final_write_globals ();
 
   if (seen_error ())
-{
-  timevar_stop (TV_PHASE_GENERATE);
-  return;
-}
+return;
+
+  timevar_start (TV_PHASE_LATE_ASM);
 
   /* Compilation unit is finalized.  When producing non-fat LTO object, we are
  basically finished.  */
@@ -670,7 +667,7 @@ compile_file (void)
  assembly file after this point.  */
   targetm.asm_out.file_end ();
 
-  timevar_stop (TV_PHASE_GENERATE);
+  timevar_stop (TV_PHASE_LATE_ASM);
 }
 
 /* Print version information to FILE.
Index: gcc/cp/decl2.c
===
--- gcc/cp/decl2.c  (revision 188335)
+++ gcc/cp/decl2.c  (working copy)
@@ -4017,11 +4017,11 @@ cp_write_global_declarations (void)
   candidates = collect_candidates_for_java_method_aliases ();
 
   timevar_stop (TV_PHASE_DEFERRED);
-  timevar_start (TV_PHASE_CGRAPH);
+  timevar_start (TV_PHASE_OPT_GEN);
 
   finalize_compilation_unit ();
 
-  timevar_stop (TV_PHASE_CGRAPH);
+  timevar_stop 

[patch]: move stackalign tests that use dg-options

2012-06-11 Thread Janis Johnson
The tests in gcc.dg/torture/stackalign use two sets of torture options:
the usual optimization sets used as default for torture tests, and up to
four sets of options that are specific to stack alignment.  The stack
alignment options are passed via an option that is used by the dg-test
option dg_extra_tool_flags, which can be overridden by dg-options.  This
means that the seven tests in that test directory that use dg-option do
not use the other alignment options.  All seven of these tests are
limited to x86 targets.  Four of them use -msse2, and four use different
values for -mpreferred-stack-boundary (one does both), so it doesn't
look as if they are intended to use the four sets of stackalign torture
options.

This patch moves those seven tests out of the stackalign directory up to
the general gcc.dg/torture directory.  With them out of the way I'll be
able to clean up the remaining stackalign tests to use torture test
support to combine the stack align options with other torture options so
they'll show up in test summary lines, eliminating lots of duplicate
lines in test summaries.

Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

2012-06-11  Janis Johnson  jani...@codesourcery.com

* gcc.dg/torture/stackalign/alloca-2.c: Move to ...
* gcc.dg/torture/alloca-2.c: ... here.
* gcc.dg/torture/stackalign/alloca-3.c: Move to ...
* gcc.dg/torture/alloca-3.c: ... here.
* gcc.dg/torture/stackalign/alloca-4.c: Move to ...
* gcc.dg/torture/alloca-4.c: ... here.
* gcc.dg/torture/stackalign/alloca-5.c: Move to ...
* gcc.dg/torture/alloca-5.c: ... here.
* gcc.dg/torture/stackalign/alloca-6.c: Move to ...
* gcc.dg/torture/alloca-6.c: ... here.
* gcc.dg/torture/stackalign/push-1.c: Move to ...
* gcc.dg/torture/push-1.c: ... here.
* gcc.dg/torture/stackalign/vararg-3.c: Move to ...
* gcc.dg/torture/vararg-3.c: ... here.


Re: [PATCH] Fixincludes/VxWorks

2012-06-11 Thread rbmj

On 06/10/2012 02:38 PM, Bruce Korb wrote:

On Sun, Jun 10, 2012 at 10:57 AM, Nathan Sidwellnat...@acm.org  wrote:

On 06/06/12 17:33, rbmj wrote:

Hi everyone,

This patch series is the result of this [1] thread about fixincludes on
VxWorks.
It resolves bugs 53457 and 53378, and a few other issues that previously
required manual intervention to fix for VxWorks header files.


 From a vxworks POV these all look ok.

Oh, from fixincludes, too.


Could somebody please commit this?  I don't have access.

Thanks,

Robert Mason


Re: [Patch] Add AC_ARG_ENABLE for libstdc++-v3

2012-06-11 Thread rbmj

On 06/11/2012 08:02 AM, Paolo Bonzini wrote:

Il 11/06/2012 13:59, rbmj ha scritto:

On 05/22/2012 04:37 PM, rbmj wrote:

This patch adds an AC_ARG_ENABLE option to build/not build
libstdc++-v3.  I wrote the patch in order to allow the user to
override the automatic disable for libstdc++-v3 for certain targets.

Ping^2 on http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01525.html

Ok.


If it's OK, could somebody please commit?  Thanks,

Robert


Re: [patch]: move stackalign tests that use dg-options

2012-06-11 Thread H.J. Lu
On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson
janis_john...@mentor.com wrote:
 The tests in gcc.dg/torture/stackalign use two sets of torture options:
 the usual optimization sets used as default for torture tests, and up to
 four sets of options that are specific to stack alignment.  The stack
 alignment options are passed via an option that is used by the dg-test
 option dg_extra_tool_flags, which can be overridden by dg-options.  This
 means that the seven tests in that test directory that use dg-option do
 not use the other alignment options.  All seven of these tests are
 limited to x86 targets.  Four of them use -msse2, and four use different
 values for -mpreferred-stack-boundary (one does both), so it doesn't
 look as if they are intended to use the four sets of stackalign torture
 options.

 This patch moves those seven tests out of the stackalign directory up to
 the general gcc.dg/torture directory.  With them out of the way I'll be
 able to clean up the remaining stackalign tests to use torture test
 support to combine the stack align options with other torture options so
 they'll show up in test summary lines, eliminating lots of duplicate
 lines in test summaries.

 Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

 2012-06-11  Janis Johnson  jani...@codesourcery.com

        * gcc.dg/torture/stackalign/alloca-2.c: Move to ...
        * gcc.dg/torture/alloca-2.c: ... here.
        * gcc.dg/torture/stackalign/alloca-3.c: Move to ...
        * gcc.dg/torture/alloca-3.c: ... here.
        * gcc.dg/torture/stackalign/alloca-4.c: Move to ...
        * gcc.dg/torture/alloca-4.c: ... here.
        * gcc.dg/torture/stackalign/alloca-5.c: Move to ...
        * gcc.dg/torture/alloca-5.c: ... here.
        * gcc.dg/torture/stackalign/alloca-6.c: Move to ...
        * gcc.dg/torture/alloca-6.c: ... here.
        * gcc.dg/torture/stackalign/push-1.c: Move to ...
        * gcc.dg/torture/push-1.c: ... here.
        * gcc.dg/torture/stackalign/vararg-3.c: Move to ...
        * gcc.dg/torture/vararg-3.c: ... here.

stackalign.exp has

gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags
if { [check_effective_target_fpic] } then {
set pic_additional_flags $additional_flags
lappend pic_additional_flags -fpic
gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags
}

It adds PIC tests.  Will this change remove PIC tests?


-- 
H.J.


Re: [patch]: move stackalign tests that use dg-options

2012-06-11 Thread Janis Johnson
On 06/11/2012 12:15 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 The tests in gcc.dg/torture/stackalign use two sets of torture options:
 the usual optimization sets used as default for torture tests, and up to
 four sets of options that are specific to stack alignment.  The stack
 alignment options are passed via an option that is used by the dg-test
 option dg_extra_tool_flags, which can be overridden by dg-options.  This
 means that the seven tests in that test directory that use dg-option do
 not use the other alignment options.  All seven of these tests are
 limited to x86 targets.  Four of them use -msse2, and four use different
 values for -mpreferred-stack-boundary (one does both), so it doesn't
 look as if they are intended to use the four sets of stackalign torture
 options.

 This patch moves those seven tests out of the stackalign directory up to
 the general gcc.dg/torture directory.  With them out of the way I'll be
 able to use clean up the remaining stackalign tests to use torture test
 support to combine the stack align options with other torture options so
 they'll show up in test summary lines, eliminating lots of duplicate
 lines in test summaries.

 Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

 2012-06-11  Janis Johnson  jani...@codesourcery.com

* gcc.dg/torture/stackalign/alloca-2.c: Move to ...
* gcc.dg/torture/alloca-2.c: ... here.
* gcc.dg/torture/stackalign/alloca-3.c: Move to ...
* gcc.dg/torture/alloca-3.c: ... here.
* gcc.dg/torture/stackalign/alloca-4.c: Move to ...
* gcc.dg/torture/alloca-4.c: ... here.
* gcc.dg/torture/stackalign/alloca-5.c: Move to ...
* gcc.dg/torture/alloca-5.c: ... here.
* gcc.dg/torture/stackalign/alloca-6.c: Move to ...
* gcc.dg/torture/alloca-6.c: ... here.
* gcc.dg/torture/stackalign/push-1.c: Move to ...
* gcc.dg/torture/push-1.c: ... here.
* gcc.dg/torture/stackalign/vararg-3.c: Move to ...
* gcc.dg/torture/vararg-3.c: ... here.
 
 stackalign.exp has
 
 gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags
 if { [check_effective_target_fpic] } then {
 set pic_additional_flags $additional_flags
 lappend pic_additional_flags -fpic
 gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags
 }
 
 It adds PIC tests.  Will this change remove PIC tests?
 
 

The options for PIC tests are overridden by dg-options.  This change
will make no difference at all to how these 7 tests are compiled.
They are currently compiled four times with exactly the same options.

Janis


[committed] ChangeLog formatting fixes

2012-06-11 Thread Oleg Endo
I've just committed the attached patch as rev 188397 which fixes some
formatting in the gcc/ChangeLog.

Cheers,
Oleg

Index: gcc/ChangeLog
===
--- gcc/ChangeLog	(revision 188396)
+++ gcc/ChangeLog	(working copy)
@@ -452,13 +452,15 @@
 2012-06-03  Oleg Endo  olege...@gcc.gnu.org
 
 	PR target/53512
-	* sh.opt (mfsca, mfsrra): New options.
-	* sh.md (rsqrtsf2): Use TARGET_FPU_ANY and TARGET_FSRRA	condition.
+	* config/sh/sh.opt (mfsca, mfsrra): New options.
+	* config/sh/sh.md (rsqrtsf2): Use TARGET_FPU_ANY and TARGET_FSRRA
+	condition.
 	(fsca): Use TARGET_FPU_ANY and TARGET_FSCA condition.
 	(sinssf2, cossf2): Fold expanders to ...
 	(sincossf3): ... this new expander.  Use TARGET_FPU_ANY and
 	TARGET_FSCA condition.
-	* sh.c (sh_option_override): Handle TARGET_FSRRA and TARGET_FSCA.
+	* config/sh/sh.c (sh_option_override): Handle TARGET_FSRRA and
+	TARGET_FSCA.
 	* doc/invoke.texi (SH Options): Add descriptions for -mfsca,
 	-mno-fsca, -mfsrra, -mno-fsrra.
 
@@ -501,7 +503,8 @@
 	* varasm.c (output_constructor_bitfield): Likewise.
 	* tree-vrp.c (register_edge_assert_for_2): Likewise.
 	* double-int.c (rshift_double, lshift_double): Likewise.
-	* double-int.h (double_int_fits_in_uhwi_p, double_int, double_int): Likewise.
+	* double-int.h (double_int_fits_in_uhwi_p, double_int, double_int):
+	Likewise.
 	* simplify-rtx.c (mode_signbit_p)
 	(simplify_const_unary_operation, simplify_binary_operation_1)
 	(simplify_immed_subreg): Likewise.


Re: [patch]: move stackalign tests that use dg-options

2012-06-11 Thread Janis Johnson
On 06/11/2012 12:35 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:21 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 On 06/11/2012 12:15 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 The tests in gcc.dg/torture/stackalign use two sets of torture options:
 the usual optimization sets used as default for torture tests, and up to
 four sets of options that are specific to stack alignment.  The stack
 alignment options are passed via an option that is used by the dg-test
 option dg_extra_tool_flags, which can be overridden by dg-options.  This
 means that the seven tests in that test directory that use dg-option do
 not use the other alignment options.  All seven of these tests are
 limited to x86 targets.  Four of them use -msse2, and four use different
 values for -mpreferred-stack-boundary (one does both), so it doesn't
 look as if they are intended to use the four sets of stackalign torture
 options.

 This patch moves those seven tests out of the stackalign directory up to
 the general gcc.dg/torture directory.  With them out of the way I'll be
 able to clean up the remaining stackalign tests to use torture test
 support to combine the stack align options with other torture options so
 they'll show up in test summary lines, eliminating lots of duplicate
 lines in test summaries.

 Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

 2012-06-11  Janis Johnson  jani...@codesourcery.com

* gcc.dg/torture/stackalign/alloca-2.c: Move to ...
* gcc.dg/torture/alloca-2.c: ... here.
* gcc.dg/torture/stackalign/alloca-3.c: Move to ...
* gcc.dg/torture/alloca-3.c: ... here.
* gcc.dg/torture/stackalign/alloca-4.c: Move to ...
* gcc.dg/torture/alloca-4.c: ... here.
* gcc.dg/torture/stackalign/alloca-5.c: Move to ...
* gcc.dg/torture/alloca-5.c: ... here.
* gcc.dg/torture/stackalign/alloca-6.c: Move to ...
* gcc.dg/torture/alloca-6.c: ... here.
* gcc.dg/torture/stackalign/push-1.c: Move to ...
* gcc.dg/torture/push-1.c: ... here.
* gcc.dg/torture/stackalign/vararg-3.c: Move to ...
* gcc.dg/torture/vararg-3.c: ... here.

 stackalign.exp has

 gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags
 if { [check_effective_target_fpic] } then {
 set pic_additional_flags $additional_flags
 lappend pic_additional_flags -fpic
 gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags
 }

 It adds PIC tests.  Will this change remove PIC tests?



 The options for PIC tests are overridden by dg-options.  This change
 will make no difference at all to how these 7 tests are compiled.
 They are currently compiled four times with exactly the same options.

 
 That doesn't match what I see on trunk as of yesterday:
 
 Executing on host:
 /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc
 -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/
 /export/gnu/import/git/gcc/gcc/testsuite/gcc.dg/torture/stackalign/comp-goto-1.c
  -fno-diagnostics-show-caret   -O0  -mstackrealign
 -mpreferred-stack-boundary=5 -mno-mmx  -lm   -o ./comp-goto-1.exe
 (timeout = 300)
 
 Not only stackalign.exp adds -fpic, it also adds -mforce-drap,
 -mstackrealign, -mpreferred-stack-boundary=5 and
 -mno-mmx.

Your example uses comp-goto-1.c which uses the four sets of stack
alignment options as intended.  That isn't the case for the 7
tests that use dg-options; try one of those.

I plan to modify stackalign.exp so that the stackalign options
will be treated as torture options and be reported in test summaries
to make each line in a test summary unique, with the goal of running
the tests with the same sets of options that are used now.

Janis



Re: [patch]: move stackalign tests that use dg-options

2012-06-11 Thread H.J. Lu
On Mon, Jun 11, 2012 at 12:41 PM, Janis Johnson
janis_john...@mentor.com wrote:
 On 06/11/2012 12:35 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:21 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 On 06/11/2012 12:15 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 The tests in gcc.dg/torture/stackalign use two sets of torture options:
 the usual optimization sets used as default for torture tests, and up to
 four sets of options that are specific to stack alignment.  The stack
 alignment options are passed via an option that is used by the dg-test
 option dg_extra_tool_flags, which can be overridden by dg-options.  This
 means that the seven tests in that test directory that use dg-option do
 not use the other alignment options.  All seven of these tests are
 limited to x86 targets.  Four of them use -msse2, and four use different
 values for -mpreferred-stack-boundary (one does both), so it doesn't
 look as if they are intended to use the four sets of stackalign torture
 options.

 This patch moves those seven tests out of the stackalign directory up to
 the general gcc.dg/torture directory.  With them out of the way I'll be
 able to use clean up the remaining stackalign tests to use torture test
 support to combine the stack align options with other torture options so
 they'll show up in test summary lines, eliminating lots of duplicate
 lines in test summaries.

 Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

 2012-06-11  Janis Johnson  jani...@codesourcery.com

        * gcc.dg/torture/stackalign/alloca-2.c: Move to ...
        * gcc.dg/torture/alloca-2.c: ... here.
        * gcc.dg/torture/stackalign/alloca-3.c: Move to ...
        * gcc.dg/torture/alloca-3.c: ... here.
        * gcc.dg/torture/stackalign/alloca-4.c: Move to ...
        * gcc.dg/torture/alloca-4.c: ... here.
        * gcc.dg/torture/stackalign/alloca-5.c: Move to ...
        * gcc.dg/torture/alloca-5.c: ... here.
        * gcc.dg/torture/stackalign/alloca-6.c: Move to ...
        * gcc.dg/torture/alloca-6.c: ... here.
        * gcc.dg/torture/stackalign/push-1.c: Move to ...
        * gcc.dg/torture/push-1.c: ... here.
        * gcc.dg/torture/stackalign/vararg-3.c: Move to ...
        * gcc.dg/torture/vararg-3.c: ... here.

 stackalign.exp has

 gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags
 if { [check_effective_target_fpic] } then {
     set pic_additional_flags $additional_flags
     lappend pic_additional_flags -fpic
     gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags
 }

 It adds PIC tests.  Will this change remove PIC tests?



 The options for PIC tests are overridden by dg-options.  This change
 will make no difference at all to how these 7 tests are compiled.
 They are currently compiled four times with exactly the same options.


 That doesn't match what I see on trunk as of yesterday:

 Executing on host:
 /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc
 -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/
 /export/gnu/import/git/gcc/gcc/testsuite/gcc.dg/torture/stackalign/comp-goto-1.c
  -fno-diagnostics-show-caret   -O0  -mstackrealign
 -mpreferred-stack-boundary=5 -mno-mmx  -lm   -o ./comp-goto-1.exe
 (timeout = 300)

 Not only stackalign.exp adds -fpic, it also adds -mforce-drap,
 -mstackrealign, -mpreferred-stack-boundary=5 and
 -mno-mmx.

 Your example uses comp-goto-1.c which uses the four sets of stack
 alignment options as intended.  That isn't the case for the 7
 tests that use dg-options; try one of those.

 I plan to modify stackalign.exp so that the stackalign options
 will be treated as torture options and be reported in test summaries
 to make each line in a test summary unique, with the goal of running
 the tests with the same sets of options that are used now.

 Janis


Can we use dg-add-options to properly add those options?

Thanks.

-- 
H.J.


Re: [patch]: move stackalign tests that use dg-options

2012-06-11 Thread Janis Johnson
On 06/11/2012 12:48 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:41 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 On 06/11/2012 12:35 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:21 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 On 06/11/2012 12:15 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 The tests in gcc.dg/torture/stackalign use two sets of torture options:
 the usual optimization sets used as default for torture tests, and up to
 four sets of options that are specific to stack alignment.  The stack
 alignment options are passed via an option that is used by the dg-test
 option dg_extra_tool_flags, which can be overridden by dg-options.  This
 means that the seven tests in that test directory that use dg-option do
 not use the other alignment options.  All seven of these tests are
 limited to x86 targets.  Four of them use -msse2, and four use different
 values for -mpreferred-stack-boundary (one does both), so it doesn't
 look as if they are intended to use the four sets of stackalign torture
 options.

 This patch moves those seven tests out of the stackalign directory up to
 the general gcc.dg/torture directory.  With them out of the way I'll be
 able to use clean up the remaining stackalign tests to use torture test
 support to combine the stack align options with other torture options so
 they'll show up in test summary lines, eliminating lots of duplicate
 lines in test summaries.

 Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

 2012-06-11  Janis Johnson  jani...@codesourcery.com

* gcc.dg/torture/stackalign/alloca-2.c: Move to ...
* gcc.dg/torture/alloca-2.c: ... here.
* gcc.dg/torture/stackalign/alloca-3.c: Move to ...
* gcc.dg/torture/alloca-3.c: ... here.
* gcc.dg/torture/stackalign/alloca-4.c: Move to ...
* gcc.dg/torture/alloca-4.c: ... here.
* gcc.dg/torture/stackalign/alloca-5.c: Move to ...
* gcc.dg/torture/alloca-5.c: ... here.
* gcc.dg/torture/stackalign/alloca-6.c: Move to ...
* gcc.dg/torture/alloca-6.c: ... here.
* gcc.dg/torture/stackalign/push-1.c: Move to ...
* gcc.dg/torture/push-1.c: ... here.
* gcc.dg/torture/stackalign/vararg-3.c: Move to ...
* gcc.dg/torture/vararg-3.c: ... here.

 stackalign.exp has

 gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags
 if { [check_effective_target_fpic] } then {
 set pic_additional_flags $additional_flags
 lappend pic_additional_flags -fpic
 gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] 
 $pic_additional_flags
 }

 It adds PIC tests.  Will this change remove PIC tests?



 The options for PIC tests are overridden by dg-options.  This change
 will make no difference at all to how these 7 tests are compiled.
 They are currently compiled four times with exactly the same options.


 That doesn't match what I see on trunk as of yesterday:

 Executing on host:
 /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc
 -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/
 /export/gnu/import/git/gcc/gcc/testsuite/gcc.dg/torture/stackalign/comp-goto-1.c
  -fno-diagnostics-show-caret   -O0  -mstackrealign
 -mpreferred-stack-boundary=5 -mno-mmx  -lm   -o ./comp-goto-1.exe
 (timeout = 300)

 Not only stackalign.exp adds -fpic, it also adds -mforce-drap,
 -mstackrealign, -mpreferred-stack-boundary=5 and
 -mno-mmx.

 Your example uses comp-goto-1.c which uses the four sets of stack
 alignment options as intended.  That isn't the case for the 7
 tests that use dg-options; try one of those.

 I plan to modify stackalign.exp so that the stackalign options
 will be treated as torture options and be reported in test summaries
 to make each line in a test summary unique, with the goal of running
 the tests with the same sets of options that are used now.

 Janis

 
 Can we use dg-add-options to properly add those options?
 
 Thanks.
 

I'll look into that.  Consider this patch dropped for now.

Janis


Re: [patch]: move stackalign tests that use dg-options

2012-06-11 Thread H.J. Lu
On Mon, Jun 11, 2012 at 12:57 PM, Janis Johnson
janis_john...@mentor.com wrote:
 On 06/11/2012 12:48 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:41 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 On 06/11/2012 12:35 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:21 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 On 06/11/2012 12:15 PM, H.J. Lu wrote:
 On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson
 janis_john...@mentor.com wrote:
 The tests in gcc.dg/torture/stackalign use two sets of torture options:
 the usual optimization sets used as default for torture tests, and up to
 four sets of options that are specific to stack alignment.  The stack
 alignment options are passed via an option that is used by the dg-test
 option dg_extra_tool_flags, which can be overridden by dg-options.  This
 means that the seven tests in that test directory that use dg-option do
 not use the other alignment options.  All seven of these tests are
 limited to x86 targets.  Four of them use -msse2, and four use different
 values for -mpreferred-stack-boundary (one does both), so it doesn't
 look as if they are intended to use the four sets of stackalign torture
 options.

 This patch moves those seven tests out of the stackalign directory up to
 the general gcc.dg/torture directory.  With them out of the way I'll be
 able to use clean up the remaining stackalign tests to use torture test
 support to combine the stack align options with other torture options so
 they'll show up in test summary lines, eliminating lots of duplicate
 lines in test summaries.

 Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

 2012-06-11  Janis Johnson  jani...@codesourcery.com

        * gcc.dg/torture/stackalign/alloca-2.c: Move to ...
        * gcc.dg/torture/alloca-2.c: ... here.
        * gcc.dg/torture/stackalign/alloca-3.c: Move to ...
        * gcc.dg/torture/alloca-3.c: ... here.
        * gcc.dg/torture/stackalign/alloca-4.c: Move to ...
        * gcc.dg/torture/alloca-4.c: ... here.
        * gcc.dg/torture/stackalign/alloca-5.c: Move to ...
        * gcc.dg/torture/alloca-5.c: ... here.
        * gcc.dg/torture/stackalign/alloca-6.c: Move to ...
        * gcc.dg/torture/alloca-6.c: ... here.
        * gcc.dg/torture/stackalign/push-1.c: Move to ...
        * gcc.dg/torture/push-1.c: ... here.
        * gcc.dg/torture/stackalign/vararg-3.c: Move to ...
        * gcc.dg/torture/vararg-3.c: ... here.

 stackalign.exp has

 gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags
 if { [check_effective_target_fpic] } then {
     set pic_additional_flags $additional_flags
     lappend pic_additional_flags -fpic
     gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] 
 $pic_additional_flags
 }

 It adds PIC tests.  Will this change remove PIC tests?



 The options for PIC tests are overridden by dg-options.  This change
 will make no difference at all to how these 7 tests are compiled.
 They are currently compiled four times with exactly the same options.


 That doesn't match what I see on trunk as of yesterday:

 Executing on host:
 /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc
 -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/
 /export/gnu/import/git/gcc/gcc/testsuite/gcc.dg/torture/stackalign/comp-goto-1.c
  -fno-diagnostics-show-caret   -O0  -mstackrealign
 -mpreferred-stack-boundary=5 -mno-mmx  -lm   -o ./comp-goto-1.exe
 (timeout = 300)

 Not only stackalign.exp adds -fpic, it also adds -mforce-drap,
 -mstackrealign, -mpreferred-stack-boundary=5 and
 -mno-mmx.

 Your example uses comp-goto-1.c which uses the four sets of stack
 alignment options as intended.  That isn't the case for the 7
 tests that use dg-options; try one of those.

 I plan to modify stackalign.exp so that the stackalign options
 will be treated as torture options and be reported in test summaries
 to make each line in a test summary unique, with the goal of running
 the tests with the same sets of options that are used now.

 Janis


 Can we use dg-add-options to properly add those options?

 Thanks.


 I'll look into that.  Consider this patch dropped for now.


Thanks.

-- 
H.J.


Re: [PATCH] Hoist adjacent loads

2012-06-11 Thread William J. Schmidt
OK, once more with feeling... :)

This patch differs from the previous one in two respects:  It disables
the optimization when either the then or else edge is well-predicted;
and it now uses the existing l1-cache-line-size parameter instead of a
new one (with updated commentary).

Bootstraps and tests with no new regressions on
powerpc64-unknown-linux-gnu.  One last performance run is underway, but
I don't expect any surprises since both changes are more conservative.
The original benchmark issue is still resolved.

Is this version ok for trunk?

Thanks,
Bill


2012-06-11  Bill Schmidt  wschm...@linux.vnet.ibm.com

* opts.c: Add -fhoist-adjacent-loads to -O2 and above.
* tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
declaration.
(hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
(tree_ssa_phiopt): Call gate_hoist_loads.
(tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
(tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
hoist_adjacent_loads.
(local_mem_dependence): New function.
(hoist_adjacent_loads): Likewise.
(gate_hoist_loads): Likewise.
* common.opt (fhoist-adjacent-loads): New switch.
* Makefile.in (tree-ssa-phiopt.o): Added dependencies.


Index: gcc/opts.c
===
--- gcc/opts.c  (revision 188390)
+++ gcc/opts.c  (working copy)
@@ -489,6 +489,7 @@ static const struct default_options default_option
 { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 },
 { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 },
 { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 },
+{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 },
 
 /* -O3 optimizations.  */
 { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 },
Index: gcc/tree-ssa-phiopt.c
===
--- gcc/tree-ssa-phiopt.c   (revision 188390)
+++ gcc/tree-ssa-phiopt.c   (working copy)
@@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
 #include cfgloop.h
 #include tree-data-ref.h
 #include tree-pretty-print.h
+#include gimple-pretty-print.h
+#include insn-config.h
+#include expr.h
+#include optabs.h
 
+#ifndef HAVE_conditional_move
+#define HAVE_conditional_move (0)
+#endif
+
 static unsigned int tree_ssa_phiopt (void);
-static unsigned int tree_ssa_phiopt_worker (bool);
+static unsigned int tree_ssa_phiopt_worker (bool, bool);
 static bool conditional_replacement (basic_block, basic_block,
 edge, edge, gimple, tree, tree);
 static int value_replacement (basic_block, basic_block,
@@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
 static bool cond_if_else_store_replacement (basic_block, basic_block, 
basic_block);
 static struct pointer_set_t * get_non_trapping (void);
 static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
+static void hoist_adjacent_loads (basic_block, basic_block,
+ basic_block, basic_block);
+static bool gate_hoist_loads (void);
 
 /* This pass tries to replaces an if-then-else block with an
assignment.  We have four kinds of transformations.  Some of these
@@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
  bb2:
x = PHI x' (bb0), ...;
 
-   A similar transformation is done for MAX_EXPR.  */
+   A similar transformation is done for MAX_EXPR.
 
+
+   This pass also performs a fifth transformation of a slightly different
+   flavor.
+
+   Adjacent Load Hoisting
+   --
+   
+   This transformation replaces
+
+ bb0:
+   if (...) goto bb2; else goto bb1;
+ bb1:
+   x1 = (expr).field1;
+   goto bb3;
+ bb2:
+   x2 = (expr).field2;
+ bb3:
+   # x = PHI x1, x2;
+
+   with
+
+ bb0:
+   x1 = (expr).field1;
+   x2 = (expr).field2;
+   if (...) goto bb2; else goto bb1;
+ bb1:
+   goto bb3;
+ bb2:
+ bb3:
+   # x = PHI x1, x2;
+
+   The purpose of this transformation is to enable generation of conditional
+   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
+   the loads is speculative, the transformation is restricted to very
+   specific cases to avoid introducing a page fault.  We are looking for
+   the common idiom:
+
+ if (...)
+   x = y-left;
+ else
+   x = y-right;
+
+   where left and right are typically adjacent pointers in a tree structure.  
*/
+
 static unsigned int
 tree_ssa_phiopt (void)
 {
-  return tree_ssa_phiopt_worker (false);
+  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
 }
 
 /* This pass tries to transform conditional stores into unconditional
@@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
 static unsigned int
 tree_ssa_cs_elim (void)
 {
-  return tree_ssa_phiopt_worker (true);
+  

Re: [PATCH] don't presume undelegitimized UNSPEC_TLS SYMBOL_REF is a decl

2012-06-11 Thread Richard Henderson
On 2012-06-06 14:04, Roland McGrath wrote:
 2012-06-06  Roland McGrath  mcgra...@google.com
 
   * dwarf2out.c (const_ok_for_output_1): Detect a TLS UNSPEC using
   SYMBOL_REF_TLS_MODEL rather than DECL_THREAD_LOCAL_P, in case it's
   not a VAR_DECL.  Also don't limit it to UNSPECs with exactly one
   operand.

Applied.


r~


Re: [PATCH] don't presume undelegitimized UNSPEC_TLS SYMBOL_REF is a decl

2012-06-11 Thread Roland McGrath
On Mon, Jun 11, 2012 at 1:42 PM, Richard Henderson r...@redhat.com wrote:
 Applied.

Thanks!


Re: [PATCH] libgcc: If __GLIBC__ is defined, don't refer to pthread_cancel.

2012-06-11 Thread Richard Henderson
On 2012-06-08 16:23, Roland McGrath wrote:
 2012-06-08  Roland McGrath  mcgra...@google.com
 
   * gthr-posix.h [neither FreeBSD nor Solaris] (__gthread_active_p):
   If __GLIBC__ is defined, refer to __pthread_key_create instead of
   pthread_cancel.

Applied.


r~


Re: [RFC PR48941 / 51980] Rewrite arm_neon.h to use __builtin_shuffle

2012-06-11 Thread Richard Henderson
On 2012-06-11 14:12, Ramana Radhakrishnan wrote:
 I could generate the masks with
 appropriate casts to unsigned variants instead.That doesn't seem to
 make a difference either.

No, that shouldn't make any difference.


r~


Re: [PATCH] libgcc: If __GLIBC__ is defined, don't refer to pthread_cancel.

2012-06-11 Thread Roland McGrath
On Mon, Jun 11, 2012 at 2:16 PM, Richard Henderson r...@redhat.com wrote:
 Applied.

Thanks!


Re: [RFA:] doc: TARGET_LEGITIMIZE_ADDRESS needs to be defined for native TLS

2012-06-11 Thread Richard Henderson
On 2012-05-15 17:24, Hans-Peter Nilsson wrote:
 gcc:
   * doc/tm.texi.in (Addressing Modes) TARGET_LEGITIMIZE_ADDRESS:
   Mention that this hook needs to be defined for native TLS.
   * doc/tm.texi: Regenerate.

Ok.


r~


Re: [PATCH] Fix ICE with asm goto and shrink wrapping (PR rtl-optimization/53589)

2012-06-11 Thread Richard Henderson
On 2012-06-06 23:53, Jakub Jelinek wrote:
 2012-06-07  Jakub Jelinek  ja...@redhat.com
 
   PR rtl-optimization/53589
   * cfgrtl.c (force_nonfallthru_and_redirect): Do asm_goto_edge
   discovery even when e-dest != target.  If any LABEL_REF points
   to e-dest label, redirect it to target's label.
 
   * gcc.dg/torture/pr53589.c: New test.

Ok.


r~


[SH] PR 50749 - Auto-inc-dec does not find subsequent contiguous mem accesses

2012-06-11 Thread Oleg Endo
Hello,

I'd like to add the attached SH test cases to verify the auto-inc-dec
pass operation on the SH target.
Checked by invoking the individual test cases manually with make
check-gcc on sh-sim.  Some of the tests pass, some of them don't because
of the auto-inc-dec issues mentioned in the PR.

Cheers,
Oleg

testsuite/ChangeLog:

PR target/50749
* gcc.target/sh/pr50749-sf-postinc-2.c: New.
* gcc.target/sh/pr50749-sf-postinc-4.c: New.
* gcc.target/sh/pr50749-qihisi-postinc-2.c: New.
* gcc.target/sh/pr50749-qihisi-postinc-4.c: New.
* gcc.target/sh/pr50749-sf-predec-2.c: New.
* gcc.target/sh/pr50749-sf-predec-4.c: New.
* gcc.target/sh/pr50749-qihisi-predec-1.c: New.
* gcc.target/sh/pr50749-qihisi-predec-3.c: New.
* gcc.target/sh/pr50749-sf-postinc-1.c: New.
* gcc.target/sh/pr50749-sf-postinc-3.c: New.
* gcc.target/sh/pr50749-qihisi-postinc-1.c: New.
* gcc.target/sh/pr50749-qihisi-postinc-3.c: New.
* gcc.target/sh/pr50749-sf-predec-1.c: New.
* gcc.target/sh/pr50749-sf-predec-3.c: New.
* gcc.target/sh/pr50749-qihisi-predec-2.c: New.
* gcc.target/sh/pr50749-qihisi-predec-4.c: New.

Index: gcc/testsuite/gcc.target/sh/pr50749-qihisi-predec-4.c
===
--- gcc/testsuite/gcc.target/sh/pr50749-qihisi-predec-4.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/pr50749-qihisi-predec-4.c	(revision 0)
@@ -0,0 +1,43 @@
+/* Verify that pre-decrement addressing is generated inside a loop.  */
+/* { dg-do compile { target sh*-*-* } } */
+/* { dg-options -O2 } */
+/* { dg-final { scan-assembler-times mov.b\tr\[0-9]\+,@-r\[0-9]\+ 3 } } */
+/* { dg-final { scan-assembler-times mov.w\tr\[0-9]\+,@-r\[0-9]\+ 3 } } */
+/* { dg-final { scan-assembler-times mov.l\tr\[0-9]\+,@-r\[0-9]\+ 3 } } */
+
+char*
+test_func_00 (char* p, int c, int x)
+{
+  do
+  {
+*--p = (char)x;
+*--p = (char)x;
+*--p = (char)x;
+  } while (--c);
+  return p;
+}
+
+short*
+test_func_01 (short* p, int c, int x)
+{
+  do
+  {
+*--p = (short)x;
+*--p = (short)x;
+*--p = (short)x;
+  } while (--c);
+  return p;
+}
+
+int*
+test_func_02 (int* p, int c, int x)
+{
+  do
+  {
+*--p = x;
+*--p = x;
+*--p = x;
+  } while (--c);
+  return p;
+}
+
Index: gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-1.c
===
--- gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-1.c	(revision 0)
@@ -0,0 +1,15 @@
+/* Verify that post-increment addressing is generated.  */
+/* { dg-do compile { target sh*-*-* } } */
+/* { dg-options -O2 } */
/* { dg-skip-if "" { "sh*-*-*" } { "-m1" "-m2*" "-m4al" "*nofpu" "-m4-340*" "-m4-400*" "-m4-500*" "-m5*" } { "" } }  */
+/* { dg-final { scan-assembler-times fmov.s\t@r\[0-9]\+\\+,fr\[0-9]\+ 1 } } */
+
+float*
+test_func_00 (float* p, float* x)
+{
+  float r = 0;
+  r += *p++;
+  *x = r;
+  return p;
+}
+
Index: gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-3.c
===
--- gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-3.c	(revision 0)
@@ -0,0 +1,17 @@
+/* Verify that post-increment addressing is generated inside a loop.  */
+/* { dg-do compile { target sh*-*-* } } */
+/* { dg-options -O2 } */
/* { dg-skip-if "" { "sh*-*-*" } { "-m1" "-m2*" "-m4al" "*nofpu" "-m4-340*" "-m4-400*" "-m4-500*" "-m5*" } { "" } }  */
+/* { dg-final { scan-assembler-times fmov.s\t@r\[0-9]\+\\+,fr\[0-9]\+ 1 } } */
+
+float
+test_func_00 (float* p, int c)
+{
+  float r = 0;
+  do
+  {
+r += *p++;
+  } while (--c);
+  return r;
+}
+
Index: gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-1.c
===
--- gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-1.c	(revision 0)
@@ -0,0 +1,34 @@
+/* Verify that post-increment addressing is generated.  */
+/* { dg-do compile { target sh*-*-* } } */
+/* { dg-options -O2 } */
+/* { dg-final { scan-assembler-times mov.b\t@r\[0-9]\+\\+,r\[0-9]\+ 1 } } */
+/* { dg-final { scan-assembler-times mov.w\t@r\[0-9]\+\\+,r\[0-9]\+ 1 } } */
+/* { dg-final { scan-assembler-times mov.l\t@r\[0-9]\+\\+,r\[0-9]\+ 1 } } */
+
+char*
+test_func_00 (char* p, int* x)
+{
+  int r = 0;
+  r += *p++;
+  *x = r;
+  return p;
+}
+
+short*
+test_func_01 (short* p, int* x)
+{
+  int r = 0;
+  r += *p++;
+  *x = r;
+  return p;
+}
+
+int*
+test_func_02 (int* p, int* x)
+{
+  int r = 0;
+  r += *p++;
+  *x = r;
+  return p;
+}
+
Index: gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-3.c
===
--- gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-3.c	(revision 0)

Re: [RFC] Target-specific limits on vector alignment

2012-06-11 Thread Hans-Peter Nilsson
 From: Richard Earnshaw rearn...@arm.com
 Date: Mon, 11 Jun 2012 15:16:50 +0200

 The ARM ABI states that vectors larger than 64 bits in size still have
 64-bit alignment; never-the-less, the HW supports alignment hints of up
 to 128-bits in some cases and will trap in a vector has an alignment
 that less than the hint.  GCC currently hard-codes larger vectors to be
 aligned by the size of the vector, which means that 128-bit vectors are
 marked as being 128-bit aligned.
 
 The ARM ABI unfortunately does not support generating such alignment for
 parameters passed by value and this can lead to traps at run time.  It
 seems that the best way to solve this problem is to allow the back-end
 to set an upper limit on the alignment permitted for a vector.

Don't you mean lowering the limit on the alignment required and
guaranteed for a vector?  (Or else it sounds like you want to
force attribute-aligned to refuse higher values.)

Wouldn't it be better to make the */*modes.def ADJUST_ALIGNMENT
macro actually work?  Does it work for you?  The epiphany port
tries to do it that way (optionally, not the default AFAICT),
but I bet it's actually unsuccessful.

 I've implemented this as a separate hook, rather than using the existing
 hooks because there's a strong likelihood of breaking some existing ABIs
 if I did it another way.
 
 There are a couple of tests that will need some re-working before this
 can be committed to deal with the fall-out of making this change; I'll
 prepare those changes if this patch is deemed generally acceptable.

It'd be nice to have non-naturally-aligned vectors working in
general.

I have a MIPS COP2 128-bit-SIMD implementation for 32-bit MIPS,
which means 64-bit stack-alignment and memory loads and stores
are in 64-bit parts.  So, I've chosen to not require any larger
alignment to keep ABI compatibility (vectors aren't passed in
registers; core use is intra-function anyway).  This mostly
works, but there's pushback from gcc (at least in 4.3): trying
to adjust vector-type-alignment by e.g. ADJUST_ALIGNMENT (V4SI,
8); in mips-modes.def has no effect; I had to do it in
mips_builtin_vector_type, which might be brittle or just working
by chance.  Then there's the tree-vectorizer, which for some
optimizations wants to assume natural alignment when doing
floor alignment.  I couldn't define a usable
vec_realign_load_mode (undocumented) as it assumes that the
low vector-size bits of an address will exactly correspond to
misalignment when masking off and applying to a vector.  And,
you may have to conditionalize some vectorizer tests with the
effective-target unaligned_stack.

Just preparing you for the fall-out. :]

brgds, H-P


Re: constant that doesn't fit in 32bits in alpha.c

2012-06-11 Thread Richard Henderson
On 2012-06-10 02:18, Jay K wrote:
 
 gcc-4.7.0/gcc/config/alpha/alpha.c
 
 
   word1 = expand_and (DImode, word1, GEN_INT (0x0ffffffffffffff0), NULL);
 
 
 That big constant isn't portable since it doesn't fit in 32bits.
 
 
 1) append LL
 or 2) break it up into an expression, like
   (((HOST_WIDE_INT)0x0fffffff) << 32) | 0xfffffff0
 
 
 or such.

Addressed like so.  I couldn't think of any nice way to define this
such that it could be generic, so I left it local to this bit o code.



r~
* lib/target-supports.exp
(check_effective_target_sync_long_long_runtime): Use
check_effective_target_lp64 instead of
check_effective_target_powerpc64 for powerpc targets.


diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 568f6b1..c937484 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3796,7 +3796,7 @@ proc check_effective_target_sync_long_long_runtime { } {
  && [check_effective_target_lp64]
  && [check_effective_target_ultrasparc_hw]) } {
return 1
-} elseif { [check_effective_target_powerpc64] } {
+} elseif { [istarget powerpc*-*-*] && [check_effective_target_lp64] } {
return 1
 } else {
return 0


Re: constant that doesn't fit in 32bits in alpha.c

2012-06-11 Thread Richard Henderson
Bah.  Wrong patch.


r~
* config/alpha/alpha.c (alpha_trampoline_init): Split large constants.

diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c
index 6d15bf7..3dda9fb 100644
--- a/gcc/config/alpha/alpha.c
+++ b/gcc/config/alpha/alpha.c
@@ -5451,6 +5451,8 @@ alpha_trampoline_init (rtx m_tramp, tree fndecl, rtx 
chain_value)
   chain_value = convert_memory_address (Pmode, chain_value);
 #endif
 
+#define HWI_HEX2(X,Y)  (((HOST_WIDE_INT)0x ## X ## u) | 0x ## Y ## u)
+
   if (TARGET_ABI_OPEN_VMS)
 {
   const char *fnname;
@@ -5468,7 +5470,8 @@ alpha_trampoline_init (rtx m_tramp, tree fndecl, rtx 
chain_value)
 the function's procedure descriptor with certain fields zeroed IAW
 the VMS calling standard. This is stored in the first quadword.  */
   word1 = force_reg (DImode, gen_const_mem (DImode, fnaddr));
-  word1 = expand_and (DImode, word1, GEN_INT (0x0ffffff0), NULL);
+  word1 = expand_and (DImode, word1,
+ GEN_INT (HWI_HEX2(0fff,fff0)), NULL);
 }
   else
 {
@@ -5479,10 +5482,12 @@ alpha_trampoline_init (rtx m_tramp, tree fndecl, rtx 
chain_value)
nop
 We don't bother setting the HINT field of the jump; the nop
 is merely there for padding.  */
-  word1 = GEN_INT (0xa77b0010a43b0018);
-  word2 = GEN_INT (0x47ff041f6bfb);
+  word1 = GEN_INT (HWI_HEX2 (a77b0010,a43b0018));
+  word2 = GEN_INT (HWI_HEX2 (47ff041f,6bfb));
 }
 
+#undef HWI_HEX2
+
   /* Store the first two words, as computed above.  */
   mem = adjust_address (m_tramp, DImode, 0);
   emit_move_insn (mem, word1);


Re: libgo patch committed: Update to Go 1.0.1 release

2012-06-11 Thread Mike Stump
On May 4, 2012, at 8:01 AM, Ian Lance Taylor wrote:
 This patch updates libgo to the Go 1.0.1 release.  This is a relatively
 small collection of bug fixes, with no API changes.

So, it is an important part of the test case for 
libgo/go/mime/multipart/testdata/nested-mime to have ^Ms in the file?  If not, 
could you strip them out?  Thanks.  If so, the testcase might need to marked as 
binary to keep svn and or git from stripping the ^Ms out.


RE: constant that doesn't fit in 32bits in alpha.c

2012-06-11 Thread Jay K

Thank you. I like it. May I have another?

book2:gcc jay$ grep -i epoch vms*
vmsdbgout.c:/* Difference in seconds between the VMS Epoch and the Unix Epoch */
vmsdbgout.c:static const long long vms_epoch_offset = 3506716800ll;
vmsdbgout.c:#define VMS_EPOCH_OFFSET 3506716800
vmsdbgout.c:    + VMS_EPOCH_OFFSET;


 :)

  - Jay



 Date: Mon, 11 Jun 2012 16:06:03 -0700
 From: r...@redhat.com
 To: jay.kr...@cornell.edu
 CC: gcc-patches@gcc.gnu.org
 Subject: Re: constant that doesn't fit in 32bits in alpha.c

 Bah. Wrong patch.


 r~
  


Re: constant that doesn't fit in 32bits in alpha.c

2012-06-11 Thread Mike Stump
On Jun 11, 2012, at 4:06 PM, Richard Henderson wrote:
 Bah.  Wrong patch.
 
 
 r~
 z.txt


Hum, I'm trying to see how this patch works...  I feel like there is something 
I'm missing, like a shift?


Re: [SH] PR 50749 - Auto-inc-dec does not find subsequent contiguous mem accesses

2012-06-11 Thread Kaz Kojima
Oleg Endo oleg.e...@t-online.de wrote:
  Some of the tests pass, some of them don't because
 of the auto-inc-dec issues mentioned in the PR.

I thought that the tests which are known to fail are marked
with XFAIL, though I'm not sure the usual way for the failing
test cases.

Regards,
kaz


Re: constant that doesn't fit in 32bits in alpha.c

2012-06-11 Thread Richard Henderson
On 2012-06-11 16:23, Mike Stump wrote:
 On Jun 11, 2012, at 4:06 PM, Richard Henderson wrote:
 Bah.  Wrong patch.


 r~
 z.txt
 
 
 Hum, I'm trying to see how this patch works... I feel like there is
 something I'm missing, like a shift?

Double-bah.  That's what I get for changing the patch at the last minute.


r~

* config/alpha/alpha.c (HWI_HEX2): Add missing shift.


diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c
index 3dda9fb..2177288 100644
--- a/gcc/config/alpha/alpha.c
+++ b/gcc/config/alpha/alpha.c
@@ -5451,7 +5451,7 @@ alpha_trampoline_init (rtx m_tramp, tree fndecl, rtx 
chain_value)
   chain_value = convert_memory_address (Pmode, chain_value);
 #endif
 
-#define HWI_HEX2(X,Y)  (((HOST_WIDE_INT)0x ## X ## u) | 0x ## Y ## u)
+#define HWI_HEX2(X,Y)  (((HOST_WIDE_INT)0x ## X ## u << 32) | 0x ## Y ## u)
 
   if (TARGET_ABI_OPEN_VMS)
 {


Re: constant that doesn't fit in 32bits in alpha.c

2012-06-11 Thread Richard Henderson
On 2012-06-11 16:23, Jay K wrote:
 Thank you. I like it. May I have another?
 
 book2:gcc jay$ grep -i epoch vms*
 vmsdbgout.c:/* Difference in seconds between the VMS Epoch and the Unix Epoch 
 */
 vmsdbgout.c:static const long long vms_epoch_offset = 3506716800ll;
 vmsdbgout.c:#define VMS_EPOCH_OFFSET 350671680
 vmsdbgout.c:+ VMS_EPOCH_OFFSET;
 


In this case, long long is explicitly referenced in the surrounding code,
so I feel the safest change is just to add LL here.  I've cross-compiled
to alpha-dec-vms just to make sure, but I can't actually test this.


r~
* vmsdbgout.c (VMS_EPOCH_OFFSET): Add LL suffix.


diff --git a/gcc/vmsdbgout.c b/gcc/vmsdbgout.c
index 9689653..eedf1bd 100644
--- a/gcc/vmsdbgout.c
+++ b/gcc/vmsdbgout.c
@@ -1676,7 +1676,7 @@ to_vms_file_spec (char *filespec)
 }
 
 #else
-#define VMS_EPOCH_OFFSET 3506716800
+#define VMS_EPOCH_OFFSET 3506716800LL
 #define VMS_GRANULARITY_FACTOR 1000
 #endif
 


Re: [SH] PR 50749 - Auto-inc-dec does not find subsequent contiguous mem accesses

2012-06-11 Thread Hans-Peter Nilsson
On Tue, 12 Jun 2012, Kaz Kojima wrote:
 Oleg Endo oleg.e...@t-online.de wrote:
   Some of the tests pass, some of them don't because
  of the auto-inc-dec issues mentioned in the PR.

 I thought that the tests which are known to fail are marked
 with XFAIL,

Yes, with a clear reference to the PR at the xfail.
The closest thing to a written rule for this seems to be
http://gcc.gnu.org/wiki/HowToPrepareATestcase.

brgds, H-P


Re: MIPS testsuite patch for --with-synci configurations

2012-06-11 Thread David Daney

On 06/11/2012 12:30 PM, Steve Ellcey wrote:

This patch is to fix MIPS failures that occur when configuring GCC with the
--with-synci option.  In this case GCC will generate the synci instruction
by default if on a MIPS architecture that supports it.  When on an
architecture that does not support synci, GCC will not generate it by default
but it does generate a warning saying that it is not doing so and that
warning is causing tests that explicitly specify a MIPS architecture that
does not support synci but do not explicitly turn off synci to fail with an
extra, unexpected warning.

I initially looked at changing GCC to remove the warning but that did
not look workable, see http://gcc.gnu.org/ml/gcc/2012-06/msg00100.html
for more details.

This patch addes the -mno-synci flag to MIPS tests that specify an
architecture that does not support synci, thus getting rid of the
warning message and making the tests pass.  Tested with the mips-linux-gnu
and mips-sde-elf targets both with and without --with-synci on the
GCC configuration.

OK to checkin?


I wonder if it would make more sense to modify the testsuite driver to 
take care of this.  It seems like the set of files with the -mno-synci 
annotation could easily become different than the set that requires it.


David Daney



Steve Ellcey
sell...@mips.com

2012-06-11  Steve Ellceysell...@mips.com

* gcc.target/mips/call-saved-1.c: Add -mno-synci flag.
* gcc.target/mips/call-saved-2.c: Ditto.
* gcc.target/mips/call-saved-3.c: Ditto.
* gcc.target/mips/clear-cache-2.c: Ditto.
* gcc.target/mips/ext-8.c: Ditto.
* gcc.target/mips/extend-2.c: Ditto.
* gcc.target/mips/fix-r4000-1.c: Ditto.
* gcc.target/mips/fix-r4000-10.c: Ditto.
* gcc.target/mips/fix-r4000-11.c: Ditto.
* gcc.target/mips/fix-r4000-12.c: Ditto.
* gcc.target/mips/fix-r4000-2.c: Ditto.
* gcc.target/mips/fix-r4000-3.c: Ditto.
* gcc.target/mips/fix-r4000-4.c: Ditto.
* gcc.target/mips/fix-r4000-5.c: Ditto.
* gcc.target/mips/fix-r4000-6.c: Ditto.
* gcc.target/mips/fix-r4000-7.c: Ditto.
* gcc.target/mips/fix-r4000-8.c: Ditto.
* gcc.target/mips/fix-r4000-9.c: Ditto.
* gcc.target/mips/fix-vr4130-1.c: Ditto.
* gcc.target/mips/fix-vr4130-2.c: Ditto.
* gcc.target/mips/fix-vr4130-3.c: Ditto.
* gcc.target/mips/fix-vr4130-4.c: Ditto.
* gcc.target/mips/fpr-moves-1.c: Ditto.
* gcc.target/mips/fpr-moves-2.c: Ditto.
* gcc.target/mips/loongson-muldiv-1.c: Ditto.
* gcc.target/mips/loongson-muldiv-2.c: Ditto.
* gcc.target/mips/loongson-shift-count-truncated-1.c: Ditto.
* gcc.target/mips/loongson-simd.c: Ditto.
* gcc.target/mips/loongson3a-muldiv-1.c: Ditto.
* gcc.target/mips/loongson3a-muldiv-2.c: Ditto.
* gcc.target/mips/madd-1.c: Ditto.
* gcc.target/mips/madd-2.c: Ditto.
* gcc.target/mips/madd-5.c: Ditto.
* gcc.target/mips/madd-6.c: Ditto.
* gcc.target/mips/madd-7.c: Ditto.
* gcc.target/mips/madd-8.c: Ditto.
* gcc.target/mips/maddu-1.c: Ditto.
* gcc.target/mips/maddu-2.c: Ditto.
* gcc.target/mips/msub-1.c: Ditto.
* gcc.target/mips/msub-2.c: Ditto.
* gcc.target/mips/msub-5.c: Ditto.
* gcc.target/mips/msub-6.c: Ditto.
* gcc.target/mips/msub-7.c: Ditto.
* gcc.target/mips/msub-8.c: Ditto.
* gcc.target/mips/msubu-1.c: Ditto.
* gcc.target/mips/msubu-2.c: Ditto.
* gcc.target/mips/nmadd-1.c: Ditto.
* gcc.target/mips/nmadd-2.c: Ditto.
* gcc.target/mips/nmadd-3.c: Ditto.
* gcc.target/mips/no-smartmips-ror-1.c: Ditto.
* gcc.target/mips/pr34831.c: Ditto.
* gcc.target/mips/r10k-cache-barrier-10.c: Ditto.
* gcc.target/mips/r3900-mult.c: Ditto.
* gcc.target/mips/rsqrt-1.c: Ditto.
* gcc.target/mips/rsqrt-2.c: Ditto.
* gcc.target/mips/rsqrt-3.c: Ditto.
* gcc.target/mips/rsqrt-4.c: Ditto.
* gcc.target/mips/sb1-1.c: Ditto.
* gcc.target/mips/vr-mult-1.c: Ditto.
* gcc.target/mips/vr-mult-2.c: Ditto.



[...]


Re: [SH] PR 50749 - Auto-inc-dec does not find subsequent contiguous mem accesses

2012-06-11 Thread Kaz Kojima
Hans-Peter Nilsson h...@bitrange.com wrote:
 On Tue, 12 Jun 2012, Kaz Kojima wrote:
  Oleg Endo oleg.e...@t-online.de wrote:
Some of the tests pass, some of them don't because
   of the auto-inc-dec issues mentioned in the PR.
 
  I thought that the tests which are known to fail are marked
  with XFAIL,
 
 Yes, with a clear reference to the PR at the xfail.
 The closest thing to a written rule for this seems to be
 http://gcc.gnu.org/wiki/HowToPrepareATestcase.

Thanks for clarification!

Oleg, the patch is OK with that change.

Regards,
kaz


Re: [PATCH 3/3] rs6000: Rewrite sync patterns for atomic; expand early.

2012-06-11 Thread David Edelsohn
On Sat, Jun 9, 2012 at 10:40 AM, Richard Henderson r...@twiddle.net wrote:

 Nope.  I do see the obvious mistake in the atomic_load pattern though:
 The mode iterator should have been INT1 not INT.

Did you want to commit the fix for the iterator?

 ... and for extra credit we ought to implement DImode atomic load/store
 via the FPU, like we do for i386.

Yes, however the PPC embedded community used to complain about FP
loads and stores generated by the movdi pattern in 32 bit mode. Why
is this function using FPRs when it does not contain any floating
point operations?. This is for a processor with an FPU, but the
developer does not want extraneous FPR usage that will force a lazy FP
save in the kernel. In other words, one can use FPRs if the function
already explicitly uses FP operations in user code, but the compiler
should not introduce it. This seems to require scanning each function
for floating point usage before allowing FPRs for DImode load/store
operations.

I like your suggestion, but the PowerPC developer community does not
uniformly appreciate that behavior.

Thanks, David


[testsuite] gcc.dg/cpp: make message check lines unique in test summary

2012-06-11 Thread Janis Johnson
This test modifies dg-message test directives by adding comments that
will be added to lines in the test summary to eliminate non-unique lines
for checks of messages for the same line of source code in a test. 

Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

Janis
2012-06-11  Janis Johnson  jani...@codesourcery.com

* gcc.dg/cpp/include2a.c: Add comments to checks for multiple
messages reported for one line of source code.
* gcc.dg/cpp/pr30786.c: Likewise.
* gcc.dg/cpp/pr28709.c: Likewise.
* gcc.dg/cpp/missing-header-MD.c: Likewise.
* gcc.dg/cpp/macspace2.c: Likewise.
* gcc.dg/cpp/missing-header-1.c: Likewise.
* gcc.dg/cpp/missing-header-MMD.c: Likewise.
* gcc.dg/cpp/missing-sysheader-MD.c: Likewise.
* gcc.dg/cpp/missing-sysheader-MMD.c: Likewise.

Index: gcc.dg/cpp/include2a.c
===
--- gcc.dg/cpp/include2a.c  (revision 188249)
+++ gcc.dg/cpp/include2a.c  (working copy)
@@ -11,6 +11,6 @@
 
 /* These error is No such file or directory, just once.  However, this
message is locale-dependent, so don't test for it.  */
-/* { dg-error silly  { target *-*-* } 0 } */
-/* { dg-error missing  { target *-*-* } 0 } */
-/* { dg-message terminated  { target *-*-* } 0 } */
+/* { dg-error silly silly { target *-*-* } 0 } */
+/* { dg-error missing missing { target *-*-* } 0 } */
+/* { dg-message terminated terminated { target *-*-* } 0 } */
Index: gcc.dg/cpp/pr30786.c
===
--- gcc.dg/cpp/pr30786.c(revision 188249)
+++ gcc.dg/cpp/pr30786.c(working copy)
@@ -1,8 +1,8 @@
 /* PR preprocessor/30786 - _Pragma at end of file should not ICE */
 /* { dg-do compile } */
 
-/* { dg-error parenthesized  { target *-*-* } 9 } */
-/* { dg-error expected  { target *-*-* } 9 } */
+/* { dg-error parenthesized parenthesized { target *-*-* } 9 } */
+/* { dg-error expected expected { target *-*-* } 9 } */
 
 int x;
 
Index: gcc.dg/cpp/pr28709.c
===
--- gcc.dg/cpp/pr28709.c(revision 188249)
+++ gcc.dg/cpp/pr28709.c(working copy)
@@ -6,5 +6,5 @@
 
 #define foo - ## 
 foo;
-/* { dg-error expected identifier.*'-'  { target *-*-* } 8 } */
-/* { dg-error pasting  { target *-*-* } 8 } */
+/* { dg-error expected identifier.*'-' expected { target *-*-* } 8 } */
+/* { dg-error pasting pasting { target *-*-* } 8 } */
Index: gcc.dg/cpp/missing-header-MD.c
===
--- gcc.dg/cpp/missing-header-MD.c  (revision 188249)
+++ gcc.dg/cpp/missing-header-MD.c  (working copy)
@@ -3,8 +3,8 @@
 /* { dg-options -MD } */
 
 #include nonexistent.h
-/* { dg-message nonexistent.h  { target *-*-* } 0 } */
-/* { dg-message terminated  { target *-*-* } 0 } */
+/* { dg-message nonexistent.h nonexistent.h { target *-*-* } 0 } */
+/* { dg-message terminated terminated { target *-*-* } 0 } */
 
 /* This declaration should not receive any diagnostic.  */
 foo bar;
Index: gcc.dg/cpp/macspace2.c
===
--- gcc.dg/cpp/macspace2.c  (revision 188249)
+++ gcc.dg/cpp/macspace2.c  (working copy)
@@ -59,5 +59,5 @@
 #define agabc/* { dg-error requires whitespace } */
 
 int dummy;
-/* { dg-error missing terminating  { target *-*-* } 6 } */
-/* { dg-error missing terminating  { target *-*-* } 10 } */
+/* { dg-error missing terminating missing-terminating { target *-*-* } 6 } 
*/
+/* { dg-error missing terminating missing-terminating { target *-*-* } 10 
} */
Index: gcc.dg/cpp/missing-header-1.c
===
--- gcc.dg/cpp/missing-header-1.c   (revision 188249)
+++ gcc.dg/cpp/missing-header-1.c   (working copy)
@@ -3,8 +3,8 @@
 /* { dg-options  } */
 
 #include nonexistent.h
-/* { dg-message nonexistent.h  { target *-*-* } 0 } */
-/* { dg-message terminated  { target *-*-* } 0 } */
+/* { dg-message nonexistent.h nonexistent.h { target *-*-* } 0 } */
+/* { dg-message terminated terminated { target *-*-* } 0 } */
 
 /* This declaration should not receive any diagnostic.  */
 foo bar;
Index: gcc.dg/cpp/missing-header-MMD.c
===
--- gcc.dg/cpp/missing-header-MMD.c (revision 188249)
+++ gcc.dg/cpp/missing-header-MMD.c (working copy)
@@ -3,8 +3,8 @@
 /* { dg-options -MMD } */
 
 #include nonexistent.h
-/* { dg-message nonexistent.h  { target *-*-* } 0 } */
-/* { dg-message terminated  { target *-*-* } 0 } */
+/* { dg-message nonexistent.h nonexistent.h { target *-*-* } 0 } */
+/* { dg-message terminated terminated { target *-*-* } 0 } */
 
 /* This declaration should not receive any diagnostic.  */
 foo bar;
Index: gcc.dg/cpp/missing-sysheader-MD.c

[testsuite] c-c++-common: make message check lines unique in test summary

2012-06-11 Thread Janis Johnson
This test modifies dg-error and dg-warning test directives by adding
comments that will be added to lines in the test summary to eliminate
non-unique lines for checks of messages for the same line of source code
in a test.

Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

Janis
2012-06-11  Janis Johnson  jani...@codesourcery.com

* c-c++-common/raw-string-3.c: Add comments to checks for multiple
messages reported for one line of source code.
* c-c++-common/raw-string-5.c: Likewise.
* c-c++-common/raw-string-4.c: Likewise.
* c-c++-common/raw-string-6.c: Likewise.
* c-c++-common/pr2.c: Likewise.

Index: c-c++-common/raw-string-3.c
===
--- c-c++-common/raw-string-3.c (revision 188249)
+++ c-c++-common/raw-string-3.c (working copy)
@@ -4,27 +4,27 @@
 // { dg-options  { target c } }
 // { dg-options -std=c++98 { target c++ } }
 
-const void *s0 = R(a);   // { dg-error was not 
declared|undeclared }
-// { dg-error expected ',' or ';'  { target c } 7 }
-const void *s1 = uR(a);  // { dg-error was not 
declared|undeclared }
-// { dg-error expected ',' or ';'  { target c } 9 }
-const void *s2 = UR(a);  // { dg-error was not 
declared|undeclared }
-// { dg-error expected ',' or ';'  { target c } 11 }
-const void *s3 = u8R(a); // { dg-error was not 
declared|undeclared }
-// { dg-error expected ',' or ';'  { target c } 13 }
-const void *s4 = LR(a);  // { dg-error was not 
declared|undeclared }
-// { dg-error expected ',' or ';'  { target c } 15 }
+const void *s0 = R(a);   // { dg-error was not 
declared|undeclared undeclared }
+// { dg-error expected ',' or ';' expected { target c } 7 }
+const void *s1 = uR(a);  // { dg-error was not 
declared|undeclared undeclared }
+// { dg-error expected ',' or ';' expected { target c } 9 }
+const void *s2 = UR(a);  // { dg-error was not 
declared|undeclared undeclared }
+// { dg-error expected ',' or ';' expected { target c } 11 
}
+const void *s3 = u8R(a); // { dg-error was not 
declared|undeclared undeclared }
+// { dg-error expected ',' or ';' expected { target c } 13 
}
+const void *s4 = LR(a);  // { dg-error was not 
declared|undeclared undeclared }
+// { dg-error expected ',' or ';' expected { target c } 15 
}
 
-const int  i0  = R'a'; // { dg-error was not declared  { 
target c++ } }
-// { dg-error expected ',' or ';'  { target c } 18 }
-const int  i1  = uR'a';// { dg-error was not declared  { 
target c++ } }
-// { dg-error expected ',' or ';'  { target c } 20 }
-const int  i2  = UR'a';// { dg-error was not declared  { 
target c++ } }
-// { dg-error expected ',' or ';'  { target c } 22 }
-const int  i3  = u8R'a';   // { dg-error was not declared  { 
target c++ } }
-// { dg-error expected ',' or ';'  { target c } 24 }
-const int  i4  = LR'a';// { dg-error was not declared  { 
target c++ } }
-// { dg-error expected ',' or ';'  { target c } 26 }
+const int  i0  = R'a'; // { dg-error was not declared 
undeclared { target c++ } }
+// { dg-error expected ',' or ';' expected { target c } 18 
}
+const int  i1  = uR'a';// { dg-error was not declared 
undeclared { target c++ } }
+// { dg-error expected ',' or ';' expected { target c } 20 
}
+const int  i2  = UR'a';// { dg-error was not declared 
undeclared { target c++ } }
+// { dg-error expected ',' or ';' expected { target c } 22 
}
+const int  i3  = u8R'a';   // { dg-error was not declared 
undeclared { target c++ } }
+// { dg-error expected ',' or ';' expected { target c } 24 
}
+const int  i4  = LR'a';// { dg-error was not declared 
undeclared { target c++ } }
+// { dg-error expected ',' or ';' expected { target c } 26 
}
 
 #define R  a
 #define uR b
Index: c-c++-common/raw-string-5.c
===
--- c-c++-common/raw-string-5.c (revision 188249)
+++ c-c++-common/raw-string-5.c (working copy)
@@ -3,25 +3,25 @@
 // { dg-options -std=c++0x { target c++ } }
 
 const void *s0 = R0123456789abcdefg()0123456789abcdefg;
-   // { dg-error raw string delimiter longer  { target *-*-* } 5 }
-   // { dg-error stray  { target *-*-* } 5 }
+   // { dg-error raw string delimiter longer longer { target *-*-* } 5 
}
+   // { dg-error stray stray { target *-*-* } 5 }
 const void *s1 = R () ;
-   // { dg-error invalid character  { target *-*-* } 8 }
- 

[testsuite] gcc.dg: make message check lines unique in test summary

2012-06-11 Thread Janis Johnson
This test modifies dg-error, dg-warning, dg-message, and dg-bogus test
directives by adding comments that will be added to lines in the test
summary to eliminate non-unique lines for checks of messages for the
same line of source code in a test.

Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

Janis
2012-06-11  Janis Johnson  jani...@codesourcery.com

* gcc.dg/20031223-1.c: Add comments to check for multiple
messages reported for one line of source code.
* gcc.dg/Wconversion-integer.c: Likewise.
* gcc.dg/Wfatal-2.c: Likewise.
* gcc.dg/Wfatal.c: Likewise.
* gcc.dg/Wobjsize-1.c: Likewise.
* gcc.dg/c99-vla-jump-1.c: Likewise.
* gcc.dg/c99-vla-jump-2.c: Likewise.
* gcc.dg/c99-vla-jump-3.c: Likewise.
* gcc.dg/c99-vla-jump-4.c: Likewise.
* gcc.dg/c99-vla-jump-5.c: Likewise.
* gcc.dg/decl-9.c: Likewise.
* gcc.dg/declspec-10.c: Likewise.
* gcc.dg/declspec-18.c: Likewise.
* gcc.dg/mtune.c: Likewise.
* gcc.dg/parser-pr28152-2.c: Likewise.
* gcc.dg/parser-pr28152.c: Likewise.
* gcc.dg/pr14475.c: Likewise.
* gcc.dg/pr27953.c: Likewise.
* gcc.dg/pr28322-3.c: Likewise.
* gcc.dg/pr30457.c: Likewise.
* gcc.dg/pr30551-2.c: Likewise.
* gcc.dg/pr30551-3.c: Likewise.
* gcc.dg/pr30551-4.c: Likewise.
* gcc.dg/pr30551-5.c: Likewise.
* gcc.dg/pr30551-6.c: Likewise.
* gcc.dg/pr30551.c: Likewise.
* gcc.dg/pr45461.c: Likewise.
* gcc.dg/pr48552-1.c: Likewise.
* gcc.dg/pr48552-2.c: Likewise.
* gcc.dg/redecl-1.c: Likewise.
* gcc.dg/transparent-union-3.c: Likewise.
* gcc.dg/utf-dflt.c: Likewise.
* gcc.dg/utf-dflt2.c: Likewise.
* gcc.dg/vla-8.c: Likewise.
* gcc.dg/vla-init-1.c: Likewise.
* gcc.dg/wtr-int-type-1.c: Likewise.

Index: gcc.dg/20031223-1.c
===
--- gcc.dg/20031223-1.c (revision 188393)
+++ gcc.dg/20031223-1.c (working copy)
@@ -7,7 +7,7 @@
 
 void f ()
 {
- l: int; /* { dg-error a label can only be part of a statement and a 
declaration is not a statement } */
- /* { dg-warning useless type name in empty declaration  { target *-*-* } 
10 } */
- /* { dg-error label at end of compound statement  { target *-*-* } 10 } */
+ l: int; /* { dg-error a label can only be part of a statement and a 
declaration is not a statement not stmt } */
+ /* { dg-warning useless type name in empty declaration type name { target 
*-*-* } 10 } */
+ /* { dg-error label at end of compound statement label { target *-*-* } 
10 } */
 }
Index: gcc.dg/Wconversion-integer.c
===
--- gcc.dg/Wconversion-integer.c(revision 188393)
+++ gcc.dg/Wconversion-integer.c(working copy)
@@ -40,10 +40,10 @@
   fuc ('A');
   uc = 'A';
 
-  uc = x ? 1U : -1; /* { dg-warning  conversion } */
-  /* { dg-warning negative integer implicitly converted to unsigned type  
{ target *-*-* } 43 } */
-  uc = x ? SCHAR_MIN : 1U; /* { dg-warning  conversion } */
-  /* { dg-warning negative integer implicitly converted to unsigned type  
{ target *-*-* } 45 } */
+  uc = x ? 1U : -1; /* { dg-warning  conversion conversion } */
+  /* { dg-warning negative integer implicitly converted to unsigned type 
implicit { target *-*-* } 43 } */
+  uc = x ? SCHAR_MIN : 1U; /* { dg-warning  conversion conversion } */
+  /* { dg-warning negative integer implicitly converted to unsigned type 
implicit { target *-*-* } 45 } */
   uc = x ? 1 : -1; /* { dg-warning negative integer implicitly converted to 
unsigned type } */
   uc = x ? SCHAR_MIN : 1; /* { dg-warning negative integer implicitly 
converted to unsigned type } */
   ui = x ? 1U : -1; /* { dg-warning negative integer implicitly converted to 
unsigned type } */
Index: gcc.dg/Wfatal-2.c
===
--- gcc.dg/Wfatal-2.c   (revision 188393)
+++ gcc.dg/Wfatal-2.c   (working copy)
@@ -5,5 +5,5 @@
 int i = INT_MAX + 1; /* { dg-error integer overflow in expression } */
 int k = 1 / 0; 
 int j = INT_MIN - 1;
-/* { dg-message being treated as errors  { target *-*-* } 0 } */
-/* { dg-message terminated due to -Wfatal-errors  { target *-*-* } 0 } */
+/* { dg-message being treated as errors treated as errors { target *-*-* } 
0 } */
+/* { dg-message terminated due to -Wfatal-errors terminated { target *-*-* 
} 0 } */
Index: gcc.dg/Wfatal.c
===
--- gcc.dg/Wfatal.c (revision 188393)
+++ gcc.dg/Wfatal.c (working copy)
@@ -5,8 +5,8 @@
 int i = INT_MAX + 1; /* { dg-warning integer overflow in expression } */
 int k = 1 / 0; /* { dg-error division by zero } */
 int j = INT_MIN - 1;
-/* { dg-message some warnings being treated as errors  {target *-*-*} 0 
} */
-/* { dg-message terminated 

[testsuite] gcc.c-torture/compile: make msg check lines unique in test summary

2012-06-11 Thread Janis Johnson
This test modifies dg-message test directives by adding comments that
will be added to lines in the test summary to eliminate non-unique lines
for checks of messages for the same line of source code in a test.

Tested on i686-pc-linux-gnu and arm-none-eabi.  OK for mainline?

Janis
2012-06-11  Janis Johnson  jani...@codesourcery.com

* gcc.c-torture/compile/sync-1.c: Add comments to checks for multiple
messages reported for one line of source code.

Index: gcc.c-torture/compile/sync-1.c
===
--- gcc.c-torture/compile/sync-1.c  (revision 188249)
+++ gcc.c-torture/compile/sync-1.c  (working copy)
@@ -1,5 +1,5 @@
-/* { dg-message note: '__sync_fetch_and_nand' changed semantics in GCC 4.4 
 { target *-*-* } 0 } */
-/* { dg-message note: '__sync_nand_and_fetch' changed semantics in GCC 4.4 
 { target *-*-* } 0 } */
+/* { dg-message note: '__sync_fetch_and_nand' changed semantics in GCC 4.4 
fetch_and_nand { target *-*-* } 0 } */
+/* { dg-message note: '__sync_nand_and_fetch' changed semantics in GCC 4.4 
nand_and_fetch { target *-*-* } 0 } */
 /* { dg-options -ffat-lto-objects } */
 
 /* Validate that each of the __sync builtins compiles.  This won't 


  1   2   >