[Ada] Adjust Esize processing in gigi
This makes it so that the size of types (Esize) is retrieved early and properly capped for elementary types, in order to avoid obscure failures later on for huge sizes. Nothing is changed for composite types. Tested on i586-suse-linux, applied on the mainline. 2012-06-11 Eric Botcazou ebotca...@adacore.com * gcc-interface/decl.c (gnat_to_gnu_entity): Translate the Esize on entry only for elementary types and abort if it is too large. E_Record_Type: Make sure the Esize is known before using it. -- Eric Botcazou Index: gcc-interface/decl.c === --- gcc-interface/decl.c (revision 188377) +++ gcc-interface/decl.c (working copy) @@ -377,11 +377,12 @@ gnat_to_gnu_entity (Entity_Id gnat_entit prepend_attributes (First_Subtype (Base_Type (gnat_entity)), attr_list); - /* Compute a default value for the size of the type. */ - if (Known_Esize (gnat_entity) - UI_Is_In_Int_Range (Esize (gnat_entity))) + /* Compute a default value for the size of an elementary type. */ + if (Known_Esize (gnat_entity) Is_Elementary_Type (gnat_entity)) { unsigned int max_esize; + + gcc_assert (UI_Is_In_Int_Range (Esize (gnat_entity))); esize = UI_To_Int (Esize (gnat_entity)); if (IN (kind, Float_Kind)) @@ -2948,14 +2949,16 @@ gnat_to_gnu_entity (Entity_Id gnat_entit if (Known_Alignment (gnat_entity)) TYPE_ALIGN (gnu_type) = validate_alignment (Alignment (gnat_entity), gnat_entity, 0); - else if (Is_Atomic (gnat_entity)) - TYPE_ALIGN (gnu_type) - = esize = BITS_PER_WORD ? BITS_PER_WORD : ceil_pow2 (esize); + else if (Is_Atomic (gnat_entity) Known_Esize (gnat_entity)) + { + unsigned int size = UI_To_Int (Esize (gnat_entity)); + TYPE_ALIGN (gnu_type) + = size = BITS_PER_WORD ? BITS_PER_WORD : ceil_pow2 (size); + } /* If a type needs strict alignment, the minimum size will be the type size instead of the RM size (see validate_size). Cap the alignment, lest it causes this type size to become too large. */ - else if (Strict_Alignment (gnat_entity) - Known_RM_Size (gnat_entity)) + else if (Strict_Alignment (gnat_entity) Known_RM_Size (gnat_entity)) { unsigned int raw_size = UI_To_Int (RM_Size (gnat_entity)); unsigned int raw_align = raw_size -raw_size;
[Ada] Fix link error on conditional expression
The executable cannot be linked because of a duplicated symbol in the object files, which comes from a conditional expression used to initialize a constant that is the upper bound of an index type in an array. Tested on i586-suse-linux, applied on the mainline. 2012-06-11 Eric Botcazou ebotca...@adacore.com * gcc-interface/decl.c (gnat_to_gnu_entity): Do not specifically deal with external constants wrt force_global here... E_Constant: ...but here instead. * gcc-interface/utils.c (gnat_pushdecl): Do not put external DECLs onto the list of global DECLs. 2012-06-11 Eric Botcazou ebotca...@adacore.com * gnat.dg/constant4.adb: New test. * gnat.dg/constant4_pkg.ads: New helper. -- Eric Botcazou Index: gcc-interface/utils.c === --- gcc-interface/utils.c (revision 188377) +++ gcc-interface/utils.c (working copy) @@ -572,14 +572,14 @@ gnat_pushdecl (tree decl, Node_Id gnat_n if (!(TREE_CODE (decl) == TYPE_DECL TREE_CODE (TREE_TYPE (decl)) == UNCONSTRAINED_ARRAY_TYPE)) { - if (global_bindings_p ()) + if (DECL_EXTERNAL (decl)) { - VEC_safe_push (tree, gc, global_decls, decl); - if (TREE_CODE (decl) == FUNCTION_DECL DECL_BUILT_IN (decl)) VEC_safe_push (tree, gc, builtin_decls, decl); } - else if (!DECL_EXTERNAL (decl)) + else if (global_bindings_p ()) + VEC_safe_push (tree, gc, global_decls, decl); + else { DECL_CHAIN (decl) = BLOCK_VARS (current_binding_level-block); BLOCK_VARS (current_binding_level-block) = decl; Index: gcc-interface/decl.c === --- gcc-interface/decl.c (revision 188378) +++ gcc-interface/decl.c (working copy) @@ -348,12 +348,10 @@ gnat_to_gnu_entity (Entity_Id gnat_entit another compilation unit) public entities, show we are at global level for the purpose of computing scopes. Don't do this for components or discriminants since the relevant test is whether or not the record is - being defined. Don't do this for constants either as we'll look into - their defining expression in the local context. */ + being defined. */ if (!definition kind != E_Component kind != E_Discriminant - kind != E_Constant Is_Public (gnat_entity) !Is_Statically_Allocated (gnat_entity)) force_global++, this_global = true; @@ -424,6 +422,7 @@ gnat_to_gnu_entity (Entity_Id gnat_entit != N_Allocator) { bool went_into_elab_proc = false; + int save_force_global = force_global; /* The expression may contain N_Expression_With_Actions nodes and thus object declarations from other units. In this case, even @@ -436,11 +435,13 @@ gnat_to_gnu_entity (Entity_Id gnat_entit current_function_decl = get_elaboration_procedure (); went_into_elab_proc = true; } + force_global = 0; gnat_pushlevel (); gnu_expr = gnat_to_gnu (Expression (Declaration_Node (gnat_entity))); gnat_zaplevel (); + force_global = save_force_global; if (went_into_elab_proc) current_function_decl = NULL_TREE; } -- { dg-do link } -- { dg-options -gnat12 } with Constant4_Pkg; use Constant4_Pkg; procedure Constant4 is Sum : Counter := 0; begin for Count of Steals loop Sum := Sum + Count; end loop; end; with Ada.Command_Line; use Ada.Command_Line; with System.Multiprocessors; use System.Multiprocessors; package Constant4_Pkg is Max_CPUs : constant CPU := (if Argument_Count 2 then Number_Of_CPUs else CPU'Value (Argument (2))); subtype Worker_Id is CPU range 1 .. Max_CPUs; type Counter is range 0 .. 10**18; Steals : array (Worker_Id) of Counter := (others = 0); end Constant4_Pkg;
Re: [PATCH 1/2] SH epilogue unwind, dwarf2 pass changes
Ping? On 2012/6/1 06:24 PM, Chung-Lin Tang wrote: On 12/5/23 1:46 AM, Richard Henderson wrote: On 05/18/12 03:48, Chung-Lin Tang wrote: @@ -2401,6 +2401,7 @@ scan_trace (dw_trace_info *trace) { /* Propagate across fallthru edges. */ dwarf2out_flush_queued_reg_saves (); + def_cfa_1 (this_cfa); maybe_record_trace_start (insn, NULL); break; } @@ -2455,6 +2456,18 @@ scan_trace (dw_trace_info *trace) cur_cfa = this_cfa; continue; } + else + { + /* If ELT is a annulled branch-taken instruction (i.e. executed +only when branch is not taken), the args_size and CFA should +not change through the jump. */ + create_trace_edges (control); + + /* Update and continue with the trace. */ + add_cfi_insn = insn; + scan_insn_after (elt); + continue; + } I think the def_cfa_1 is misplaced. It should be immediately before that last continue. That mirrors the sort of flow you get via the other paths through the loop. Or possibly moved before the dwarf2out_flush_queued_reg_saves () call in the patch? (in the save_point_p () break case) Note I'm only saying this based on overall ordering of those two routine calls in the loop. Attached is a testcase (adapted from libgomp) that, with the SH epilogue unwind patch applied alone, produces the ICE I'm seeing (-O2 -funwind-tables). This dwarf2 patch, with any of the three new def_cfa_1() call sites seems to solve it, though you might want to comment on which call site seems correct Thanks, Chung-Lin
Re: [patch] Do not include rtl.h in tree-phinodes.c
On Wed, Jun 6, 2012 at 5:16 PM, Steven Bosscher stevenb@gmail.com wrote: Hello, tree-phinodes.c includes rtl.h for ... ceil_log2 (?!). Moving ceil_log2 to hwint.c/hwint.h to join its sibling floor_log2 breaks this dependency. Bootstrapped on x86_64-unknown-linux-gnu. OK for trunk? Ok. Thanks, Richard. Ciao! Steven
Re: [patch] Do not include rtl.h in cfgloop.h
On Wed, Jun 6, 2012 at 5:16 PM, Steven Bosscher stevenb@gmail.com wrote: Hello, cfgloop.h only depends on rtl.h for rtx_code. The patch introduces a new enum and a simple function to convert from one enum to another in a few non-performance critical places. The upside is that this makes most of the tree-* files independent of rtl.h. Bootstrapped on x86_64-unknown-linux-gnu. OK for trunk? Ok with updating the comment here: /* The type of extend applied to it (SIGN_EXTEND, ZERO_EXTEND or UNKNOWN). */ - enum rtx_code extend; + enum iv_extend_code extend; thus, IV_SIGN_EXTEND, etc. Thanks, Richard. Ciao! Steven
[Ada] Plug hole with constants used by reference
This aligns the processing of constants used by reference in Identifier_to_gnu with that of regular constants; in particular we stop returning the underlying constant if this is the address of a constant and an lvalue is required. Tested on i586-suse-linux, applied on the mainline. 2012-06-11 Eric Botcazou ebotca...@adacore.com * gcc-interface/trans.c (Identifier_to_gnu): Test Is_Elementary_Type instead of Is_Scalar_Type for a constant with an address clause. Do not return the underlying constant for a constant used by reference if it holds the address of a constant and an lvalue is required. -- Eric Botcazou Index: gcc-interface/trans.c === --- gcc-interface/trans.c (revision 188377) +++ gcc-interface/trans.c (working copy) @@ -1019,7 +1019,7 @@ Identifier_to_gnu (Node_Id gnat_node, tr order-of-elaboration issue here. */ gnu_result_type = get_unpadded_type (gnat_temp_type); - /* If this is a non-imported scalar constant with an address clause, + /* If this is a non-imported elementary constant with an address clause, retrieve the value instead of a pointer to be dereferenced unless an lvalue is required. This is generally more efficient and actually required if this is a static expression because it might be used @@ -1028,7 +1028,7 @@ Identifier_to_gnu (Node_Id gnat_node, tr volatile-ness short-circuit here since Volatile constants must be imported per C.6. */ if (Ekind (gnat_temp) == E_Constant - Is_Scalar_Type (gnat_temp_type) + Is_Elementary_Type (gnat_temp_type) !Is_Imported (gnat_temp) Present (Address_Clause (gnat_temp))) { @@ -1080,7 +1080,10 @@ Identifier_to_gnu (Node_Id gnat_node, tr = convert (build_pointer_type (gnu_result_type), gnu_result); /* If it's a CONST_DECL, return the underlying constant like below. */ - else if (TREE_CODE (gnu_result) == CONST_DECL) + else if (TREE_CODE (gnu_result) == CONST_DECL + !(DECL_CONST_ADDRESS_P (gnu_result) + lvalue_required_p (gnat_node, gnu_result_type, true, + true, false))) gnu_result = DECL_INITIAL (gnu_result); /* If it's a renaming pointer and we are at the right binding level,
[PATCH, GCC DOC][AArch64] Fix layout in AArch64 target-specific documentation
Hi, This patch fixes several layout, formatting and wording issues in the AArch64 target-specific documentation. Thanks Sofiane - 2012-06-11 Sofiane Naci sofiane.n...@arm.com [AArch64] Fix documentation layout. * doc/invoke.texi: Fix white spaces after dots. Change aarch64*be-*-* to aarch64_be-*-*. Add documentation for -mcmodel=tiny. (-march): Fix formatting. (-mcpu): Likewise. (-mtune): Rephrase. (-march and -mcpu feature modifiers): New subsubsection. aarch64-fix-doc.patch Description: Binary data
Re: Go patch RFA: Fix taking the address of a field of a local variable
On Thu, Jun 7, 2012 at 10:08 AM, Ian Lance Taylor i...@google.com wrote: This patch to the Go compiler fixes the case of taking the address of a field of a local variable. Previously taking the address of the field was not treated as taking the address of the variable. But, of course, it is. This patch fixes the problem. Bootstrapped and ran Go testsuite on x86_64-unknown-linux-gnu. Committed to mainline. OK for 4.7 branch? This is a real bug, bad runtime code generated for valid Go code. Ok for 4.7.2, thus after 4.7.1 has been released. Thanks, Richard. Ian
Re: [patch] Remove cfglayout.h, and distribute cfglayout.c code to users
On Thu, Jun 7, 2012 at 10:18 PM, Steven Bosscher stevenb@gmail.com wrote: Hello, The attached patch removes cfglayout.h. It was included in a lot of places that didn't need it, and it didn't define much anyway. While checking the users of cfglayout.h, I noticed that a lot of code in cfglayout.c can be made local to users (reemit_insn_block_notes - final.c) or doesn't have to do much with cfglayout mode per se (all insn locator code - emit-rtl.c). This of course uncovered a buglet: function.h's x_last_location wasn't used, but emit-rtl.c had a define that interfered with the last_location for the insn locators. And it seemed rather strange that emit-rtl.c should include tree-flow.h, so I moved some non-GIMPLE tree-EH related function prototypes from tree-flow.h to tree.h (in retrospect, I should probably have made that a separate patch, but these patches tend to evolve, rather than be designed... ;-) Bootstrapped on x86_64-unknown-linux-gnu. OK? Ok! Thanks, Richard. Ciao! Steven
Re: [patch] Move split_double from final.c to rtlanal.c, do not include output.h in expr.c
On Fri, Jun 8, 2012 at 12:23 AM, Steven Bosscher stevenb@gmail.com wrote: Hello, Another patch to move around some code to break up a strange dependency. expr.c isn't supposed to emit assembly so it shouldn't need output.h, but it needs split_double from final.c. This split_double has nothing to do with writing out assembly, so it shouldn't be in final.c, and I moved it to rtlanal.c. This, and moving one more prototype from output.h to tree.h (an oversight in a previous patch) , makes it possible to no longer include output.h in expr.c. Bootstrappedtested on powerpc64-unknown-linux-gnu. OK for trunk? Ok. Thanks, Richard. Ciao! Steven
[arm-embedded] Backport mainline r181172, r184089, r184180
Committed to ARM/embedded-4_6-branch 2012-06-11 Joey Ye joey...@arm.com Backport r184089,184180 from mainline 2012-02-10 Jan Hubicka j...@suse.cz PR middle-end/48600 * predict.c (predict_paths_for_bb): Prevent looping. (predict_paths_leading_to_edge, predict_paths_leading_to): Update. 2012-02-13 Jan Hubicka j...@suse.cz PR middle-end/52214 * predict.c (predict_paths_for_bb): Fix thinko in prevoius patch. Backport partly r181172 from mainline 2011-11-08 Michael Matz m...@suse.de PR bootstrap/51969 * gengtype.c (write_field_root): Avoid out-of-scope access of newv. testsuite: Backport r184089 from mainline 2012-02-10 Jan Hubicka j...@suse.cz PR middle-end/48600 * g++.dg/torture/pr48600.C: New testcase.
Re: [PATCH, GCC DOC][AArch64] Fix layout in AArch64 target-specific documentation
On 11/06/12 09:54, Sofiane Naci wrote: (-march and -mcpu feature modifiers): New subsubsection. Copy and paste subsub? /Marcus
[Ada] Fix unexpected allocation change with size clause
The attached packages declare two big objects of the same size, but for the first package the object is allocated statically whereas, for the second one, the object is allocated dynamically because of the size clause. The discrepancy stems from a bit vs byte confusion in a specific place in gigi. Tested on i586-suse-linux, applied on the mainline. 2012-06-11 Eric Botcazou ebotca...@adacore.com * gcc-interface/decl.c (gnat_to_gnu_entity) object: Convert GNU_SIZE to units before invoking allocatable_size_p on it. Remove orphaned comment. Do not use ssize_int. E_Record_Subtype: Traverse list in original order. Minor tweak. (allocatable_size_p): Adjust and simplify. (build_subst_list): Use consistent terminology throughout. (build_variant_list): Likewise. Traverse list in original order. (create_field_decl_from): Likewise. (copy_and_substitute_in_size): Likewise. (create_variant_part_from): Add comment about field list order. * gcc-interface/utils.c (build_vms_descriptor): Do not use ssize_int. * gcc-interface/utils2.c (build_allocator): Likewise. 2012-06-11 Eric Botcazou ebotca...@adacore.com * gnat.dg/specs/array1.ads: New test. * gnat.dg/specs/array2.ads: Likewise. * gnat.dg/array22.adb: Likewise. -- Eric Botcazou -- { dg-do compile } pragma Restrictions (No_Elaboration_Code); package Array1 is type Arr is array (Positive range ) of Boolean; A : Arr (1 .. 2 ** 29); end Array1; -- { dg-do compile } -- { dg-options -gnatws } pragma Restrictions (No_Elaboration_Code); package Array2 is type Arr is array (Positive range ) of Boolean; A : Arr (1 .. 2 ** 2); for A'Size use 16#1000__0#; end Array2; -- { dg-do compile } with System; use System; procedure Array22 is type Integer_Address is mod Memory_Size; type Memory is array (Integer_Address range ) of Character; type Chunk (First, Last : Integer_Address) is record Mem : Memory (First .. Last); end record; C : Chunk (1, 8); for C'Alignment use 8; pragma Unreferenced (C); begin null; end; Index: gcc-interface/utils.c === --- gcc-interface/utils.c (revision 188379) +++ gcc-interface/utils.c (working copy) @@ -3601,7 +3601,7 @@ build_vms_descriptor (tree type, Mechani record_type, size_int (klass), field_list); field_list = make_descriptor_field (MBMO, gnat_type_for_size (32, 1), - record_type, ssize_int (-1), field_list); + record_type, size_int (-1), field_list); field_list = make_descriptor_field (LENGTH, gnat_type_for_size (64, 1), record_type, Index: gcc-interface/decl.c === --- gcc-interface/decl.c (revision 188379) +++ gcc-interface/decl.c (working copy) @@ -1283,10 +1283,14 @@ gnat_to_gnu_entity (Entity_Id gnat_entit global_bindings_p () || !definition || static_p) - || (gnu_size !allocatable_size_p (gnu_size, - global_bindings_p () - || !definition - || static_p))) + || (gnu_size + !allocatable_size_p (convert (sizetype, + size_binop + (CEIL_DIV_EXPR, gnu_size, + bitsize_unit_node)), + global_bindings_p () + || !definition + || static_p))) { gnu_type = build_reference_type (gnu_type); gnu_size = NULL_TREE; @@ -2204,8 +2208,6 @@ gnat_to_gnu_entity (Entity_Id gnat_entit debug_info_p); TYPE_READONLY (gnu_template_type) = 1; - /* Now build the array type. */ - /* If Component_Size is not already specified, annotate it with the size of the component. */ if (Unknown_Component_Size (gnat_entity)) @@ -2810,12 +2812,14 @@ gnat_to_gnu_entity (Entity_Id gnat_entit tree gnu_lower_bound = convert (gnu_string_index_type, gnat_to_gnu (String_Literal_Low_Bound (gnat_entity))); - int length = UI_To_Int (String_Literal_Length (gnat_entity)); - tree gnu_length = ssize_int (length - 1); + tree gnu_length + = UI_To_gnu (String_Literal_Length (gnat_entity), + gnu_string_index_type); tree gnu_upper_bound = build_binary_op (PLUS_EXPR, gnu_string_index_type, gnu_lower_bound, - convert (gnu_string_index_type, gnu_length)); + int_const_binop (MINUS_EXPR, gnu_length, + integer_one_node)); tree gnu_index_type = create_index_type (convert (sizetype, gnu_lower_bound), convert (sizetype, gnu_upper_bound), @@ -3298,7 +3302,7 @@ gnat_to_gnu_entity (Entity_Id gnat_entit if (gnu_variant_part) { variant_desc *v; - unsigned ix; + unsigned int i; gnu_variant_list = build_variant_list (TREE_TYPE (gnu_variant_part), @@ -3307,8 +3311,7 @@ gnat_to_gnu_entity (Entity_Id gnat_entit /* If all the qualifiers are unconditionally true, the innermost variant is statically selected. */
Re: [PATCH] Correct cost model for strided loads
On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: The fix for PR53331 caused a degradation to 187.facerec on powerpc64-unknown-linux-gnu. The following simple patch reverses the degradation without otherwise affecting SPEC cpu2000 or cpu2006. Bootstrapped and regtested on that platform with no new regressions. Ok for trunk? Well, would the real cost not be subparts * scalar_to_vec plus subparts * vec_perm? At least vec_perm isn't the cost for building up a vector from N scalar elements either (it might be close enough if subparts == 2). What's the case with facerec here? Does it have subparts == 2? I really wanted to pessimize this case for say AVX and char elements, thus building up a vector from 32 scalars which certainly does not cost a mere vec_perm. So, maybe special-case the subparts == 2 case and assume vec_perm would match the cost only in that case. Thanks, Richard. Thanks, Bill 2012-06-10 Bill Schmidt wschm...@linux.ibm.com * tree-vect-stmts.c (vect_model_load_cost): Change cost model for strided loads. Index: gcc/tree-vect-stmts.c === --- gcc/tree-vect-stmts.c (revision 188341) +++ gcc/tree-vect-stmts.c (working copy) @@ -1031,11 +1031,10 @@ vect_model_load_cost (stmt_vec_info stmt_info, int /* The loads themselves. */ if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) { - /* N scalar loads plus gathering them into a vector. - ??? scalar_to_vec isn't the cost for that. */ + /* N scalar loads plus gathering them into a vector. */ inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); + inside_cost += ncopies * vect_get_stmt_cost (vec_perm); } else vect_get_load_cost (first_dr, ncopies,
Re: [Fortran, DRAFT patch] PR 46321 - [OOP] Polymorphic deallocation
Thank you for the review, with this patch I get some ICEs during the regstest with: gfortran.dg/coarray/poly_run_3.f90 gfortran.dg/elemental_optional_args_5.f03 gfortran.dg/select_type_26.f03 gfortran.dg/select_type_27.f03 gfortran.dg/class_48.f90 gfortran.dg/class_allocate_10.f03 gfortran.dg/class_allocate_8.f03 gfortran.dg/class_array_1.f03 gfortran.dg/class_array_2.f03 gfortran.dg/assumed_type_2.f90 gfortran.dg/class_array_9.f03 gfortran.dg/coarray_lib_alloc_2.f90 I've debugged only the first 2 and the problem seems to be related with tmp = fold_build2_loc (input_location, NE_EXPR, boolean_type_node, se.expr, build_int_cst (TREE_TYPE (se.expr), 0)); in trans-stmt.c at line 5376. The ICE message is the following: $ gcc/bin/gfortran -c elemental_optional_args_5.f03 elemental_optional_args_5.f03: In function ‘MAIN__’: elemental_optional_args_5.f03:220:0: internal compiler error: in build_int_cst_wide, at tree.c:1219 deallocate (taa, tpa, caa, cpa) ^ Please submit a full bug report, with preprocessed source if appropriate. See http://gcc.gnu.org/bugs.html for instructions. 2012/6/10 Tobias Burnus bur...@net-b.de: Alessandro Fanfarillo wrote: with the priceless support of Tobias I've almost realized the patch for this PR. In attachment there's the second draft. During the regression test I have only one error with select_type_4.f90. The problem is in the destroy_list subroutine when it checks associated(node) after the first deallocate(node). --- gcc/fortran/trans-stmt.c (revisione 188002) +++ gcc/fortran/trans-stmt.c (copia locale) @@ -5341,7 +5341,12 @@ gfc_trans_deallocate (gfc_code *code) for (al = code-ext.alloc.list; al != NULL; al = al-next) { - gfc_expr *expr = gfc_copy_expr (al-expr); + gfc_expr *expr; + gfc_expr *ppc; + gfc_code *ppc_code; + gfc_actual_arglist *actual; + expr = gfc_copy_expr (al-expr); + ppc = gfc_copy_expr (expr); ... + if (expr-symtree-n.sym-ts.type == BT_CLASS) I'd prefer: gfc_expr *ppc = NULL; ... if (expr-ts.type == BT_CLASS) ppc = gfc_copy_expr (expr); ... if (ppc) ... Namely: Only copy the expression if needed. Additionally, the check if (expr-symtree-n.sym-ts.type == BT_CLASS) is wrong. For instance, for type(t) :: x deallocate(x%class) it won't trigger, but it should. Actually, I think a cleaner version would be: if (al-expr-ts.type == BT_CLASS) { gfc_expr *ppc; ppc = gfc_copy_expr (al-expr); * * * Furthermore, I think you call _free + free for the same component for: type t integer, allocatable :: x end type t class(t), allocatable :: y ... deallocate (y) * * * Regarding your code: You assume that al-expr points to an allocated variable, that's not the always the case - hence, select_type_4.f90 fails. * * * You always create a _free function; I wonder whether it makes sense to use _vtab-free with NULL in case that no _free is needed. * * * Attached an updated version, which does that all. No guarantee that it works correctly, but it should at least fix select_type_4.f90. Tobias
Re: Wider modes for partial int modes
On 06/09/2012 01:58 AM, Mike Stump wrote: On Jun 8, 2012, at 4:11 PM, Mike Stump wrote: On Apr 17, 2012, at 2:08 PM, Bernd Schmidt wrote: This patch enables GET_MODE_WIDER_MODE for MODE_PARTIAL_INT (by setting the wider mode to the one the partial mode is based on), which is useful for the port I'm working on: I can avoid defining operations on the partial modes. I think this breaks my port, in emit-rtl.c: B for (mode = GET_CLASS_NARROWEST_MODE (MODE_PARTIAL_INT); mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode)) = const_tiny_rtx[i][(int) mode] = GEN_INT (i); Doesn't work, as that does expects to step through all partial int modes. The problem is, we went from P1QI - P1SI, and now we go from P1QI to QI. The problem is this, we never initialize P1SI anymore, and later, when initializing vector constants, things like V2P1SI dies, as P1SI was never built: So, maybe we want to go through all modes, and just ask, are you a MODE_PARTIAL_INT, and initialize them that way? Thoughts? The below patch allows my port to at least initialize once more... Thoughts? Ok? Looks good to me. Bernd
Re: [Fortran, DRAFT patch] PR 46321 - [OOP] Polymorphic deallocation
On 06/11/2012 11:24 AM, Alessandro Fanfarillo wrote: gfortran.dg/coarray/poly_run_3.f90 That one fails because I for forgot that se.expr in gfc_trans_deallocate contains the descriptor and not the pointer to the data. That's fixed by: tmp = se.expr; if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (tmp))) { tmp = gfc_conv_descriptor_data_get (tmp); STRIP_NOPS (tmp); } tmp = fold_build2_loc (input_location, NE_EXPR, boolean_type_node, tmp, build_int_cst (TREE_TYPE (tmp), 0)); However, it still fails for the type t integer, allocatable :: comp end type t contains subroutine foo(x) class(t), allocatable, intent(out) :: x(:) end subroutine end (The intent(out) causes automatic deallocation.) The backtrace does not really point to some code which the patch touched; it shouldn't be affected by the class.c changes and gfc_trans_deallocate does not seem to be entered. While I do not immediately see why it fails, I wonder whether it is due to the removed else if ... BT_CLASS) case in gfc_deallocate_scalar_with_status. In any case, the change to gfc_trans_deallocate might be also needed for gfc_deallocate_scalar_with_status. At least, automatic deallocation (with intent(out) or when leaving the scope) does not seem to go through gfc_trans_deallocate but only through gfc_deallocate_scalar_with_status. Tobias
Patch ping
Hi! http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00105.html - add warning for complit with incomplte type Jakub
Re: [PATCH] Hoist adjacent pointer loads
On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: Hi Richard, Here's a revision of the hoist-adjacent-loads patch. I'm sorry for the delay since the last revision, but my performance testing has been blocked waiting for a fix to PR53487. I ended up applying a test version of the patch to 4.7 and ran performance numbers with that instead, with no degradations. In addition to addressing your comments, this patch contains one bug fix where local_mem_dependence was called on the wrong blocks after swapping def1 and def2. Bootstrapped with no regressions on powerpc64-unknown-linux-gnu. Is this version ok for trunk? I won't commit it until I can do final testing on trunk in conjunction with a fix for PR53487. Thanks, Bill 2012-06-04 Bill Schmidt wschm...@linux.vnet.ibm.com * opts.c: Add -fhoist_adjacent_loads to -O2 and above. * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward declaration. (hoist_adjacent_loads, gate_hoist_loads): New forward declarations. (tree_ssa_phiopt): Call gate_hoist_loads. (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call. (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call hoist_adjacent_loads. (local_mem_dependence): New function. (hoist_adjacent_loads): Likewise. (gate_hoist_loads): Likewise. * common.opt (fhoist-adjacent-loads): New switch. * Makefile.in (tree-ssa-phiopt.o): Added dependencies. * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param. Index: gcc/opts.c === --- gcc/opts.c (revision 187805) +++ gcc/opts.c (working copy) @@ -489,6 +489,7 @@ static const struct default_options default_option { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 }, { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 }, + { OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 }, /* -O3 optimizations. */ { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 }, Index: gcc/tree-ssa-phiopt.c === --- gcc/tree-ssa-phiopt.c (revision 187805) +++ gcc/tree-ssa-phiopt.c (working copy) @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3. If not see #include cfgloop.h #include tree-data-ref.h #include tree-pretty-print.h +#include gimple-pretty-print.h +#include insn-config.h +#include expr.h +#include optabs.h +#ifndef HAVE_conditional_move +#define HAVE_conditional_move (0) +#endif + static unsigned int tree_ssa_phiopt (void); -static unsigned int tree_ssa_phiopt_worker (bool); +static unsigned int tree_ssa_phiopt_worker (bool, bool); static bool conditional_replacement (basic_block, basic_block, edge, edge, gimple, tree, tree); static int value_replacement (basic_block, basic_block, @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block); static struct pointer_set_t * get_non_trapping (void); static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree); +static void hoist_adjacent_loads (basic_block, basic_block, + basic_block, basic_block); +static bool gate_hoist_loads (void); /* This pass tries to replaces an if-then-else block with an assignment. We have four kinds of transformations. Some of these @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_ bb2: x = PHI x' (bb0), ...; - A similar transformation is done for MAX_EXPR. */ + A similar transformation is done for MAX_EXPR. + + This pass also performs a fifth transformation of a slightly different + flavor. + + Adjacent Load Hoisting + -- + + This transformation replaces + + bb0: + if (...) goto bb2; else goto bb1; + bb1: + x1 = (expr).field1; + goto bb3; + bb2: + x2 = (expr).field2; + bb3: + # x = PHI x1, x2; + + with + + bb0: + x1 = (expr).field1; + x2 = (expr).field2; + if (...) goto bb2; else goto bb1; + bb1: + goto bb3; + bb2: + bb3: + # x = PHI x1, x2; + + The purpose of this transformation is to enable generation of conditional + move instructions such as Intel CMOVE or PowerPC ISEL. Because one of + the loads is speculative, the transformation is restricted to very + specific cases to avoid introducing a page fault. We are looking for + the common idiom: + + if (...) + x = y-left; + else + x = y-right; + + where left and right are typically adjacent pointers in a tree structure. */ + static
Re: [PATCH] Add vector cost model density heuristic
On Fri, 8 Jun 2012, William J. Schmidt wrote: This patch adds a heuristic to the vectorizer when estimating the minimum profitable number of iterations. The heuristic is target-dependent, and is currently disabled for all targets except PowerPC. However, the intent is to make it general enough to be useful for other targets that want to opt in. A previous patch addressed some PowerPC SPEC degradations by modifying the vector cost model values for vec_perm and vec_promote_demote. The values were set a little higher than their natural values because the natural values were not sufficient to prevent a poor vectorization choice. However, this is not the right long-term solution, since it can unnecessarily constrain other vectorization choices involving permute instructions. Analysis of the badly vectorized loop (in sphinx3) showed that the problem was overcommitment of vector resources -- too many vector instructions issued without enough non-vector instructions available to cover the delays. The vector cost model assumes that instructions always have a constant cost, and doesn't have a way of judging this kind of density of vector instructions. The present patch adds a heuristic to recognize when a loop is likely to overcommit resources, and adds a small penalty to the inside-loop cost to account for the expected stalls. The heuristic is parameterized with three target-specific values: * Density threshold: The heuristic will apply only when the percentage of inside-loop cost attributable to vectorized instructions exceeds this value. * Size threshold: The heuristic will apply only when the inside-loop cost exceeds this value. * Penalty: The inside-loop cost will be increased by this percentage value when the heuristic applies. Thus only reasonably large loop bodies that are mostly vectorized instructions will be affected. By applying only a small percentage bump to the inside-loop cost, the heuristic will only turn off vectorization for loops that were considered barely profitable to begin with (such as the sphinx3 loop). So the heuristic is quite conservative and should not affect the vast majority of vectorization decisions. Together with the new heuristic, this patch reduces the vec_perm and vec_promote_demote costs for PowerPC to their natural values. I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu and verified that no performance regressions occur on SPEC cpu2006. Is this ok for trunk? Hmm. I don't like this patch or its general idea too much. Instead I'd like us to move more of the cost model detail to the target, giving it a chance to look at the whole loop before deciding on a cost. ISTR posting the overall idea at some point, but let me repeat it here instead of trying to find that e-mail. The basic interface of the cost model should be, in targetm.vectorize /* Tell the target to start cost analysis of a loop or a basic-block (if the loop argument is NULL). Returns an opaque pointer to target-private data. */ void *init_cost (struct loop *loop); /* Add cost for N vectorized-stmt-kind statements in vector_mode. */ void add_stmt_cost (void *data, unsigned n, vectorized-stmt-kind, enum machine_mode vector_mode); /* Tell the target to compute and return the cost of the accumulated statements and free any target-private data. */ unsigned finish_cost (void *data); with eventually slightly different signatures for add_stmt_cost (like pass in the original scalar stmt?). It allows the target, at finish_cost time, to evaluate things like register pressure and resource utilization. Thanks, Richard. Thanks, Bill 2012-06-08 Bill Schmidt wschm...@linux.ibm.com * doc/tm.texi.in: Add vectorization density hooks. * doc/tm.texi: Regenerate. * targhooks.c (default_density_pct_threshold): New. (default_density_size_threshold): New. (default_density_penalty): New. * targhooks.h: New decls for new targhooks.c functions. * target.def (density_pct_threshold): New DEF_HOOK. (density_size_threshold): Likewise. (density_penalty): Likewise. * tree-vect-loop.c (accum_stmt_cost): New. (vect_estimate_min_profitable_iters): Perform density test. * config/rs6000/rs6000.c (TARGET_VECTORIZE_DENSITY_PCT_THRESHOLD): New macro definition. (TARGET_VECTORIZE_DENSITY_SIZE_THRESHOLD): Likewise. (TARGET_VECTORIZE_DENSITY_PENALTY): Likewise. (rs6000_builtin_vectorization_cost): Reduce costs of vec_perm and vec_promote_demote to correct values. (rs6000_density_pct_threshold): New. (rs6000_density_size_threshold): New. (rs6000_density_penalty): New. Index: gcc/doc/tm.texi === --- gcc/doc/tm.texi (revision 188305) +++
[Ping] Fix gcc/gcov.c and libgcc/libgcov.c to fix build on VxWorks
Hi everyone, Ping RE: http://gcc.gnu.org/ml/gcc-patches/2012-06/msg00086.html Thanks, Robert Mason On 06/01/2012 02:43 PM, rbmj wrote: On 06/01/2012 02:40 PM, rbmj wrote: Hi everyone, These fixes are to allow building on vxWorks. Currently there are two issues: 1. VxWorks does not have a variadic open - it must receive three arguments. gcc/gcov.c however opens a file for reading and does not pass in a mode argument, which causes an error on vxWorks. This just adds a platform-based ifdef around this. I am aware that this is against coding conventions, and if that is an issue, I can think of two resolutions. One, an autoconf macro to check for a non-variadic open, or two, simply pass the extra mode argument in unconditionally, as it should be transparent to the function and ignored if it is variadic (I'm no expert on calling conventions though). 2. VxWorks has a two argument mkdir(). libgcc/libgcov.c uses mkdir() and calls it with two arguments if TARGET_POSIX_IO is defined and only one argument otherwise. I don't know what TARGET_POSIX_IO is, but if it's anything more than just mkdir(), this is probably correct, as vxworks is only partially POSIX compliant. Again, another platform-based ifdef, so if that is anathema, it can be replaced by more autoconf hackery. The patch as-is compiles targeting on vxworks and bootstraps on a native x86_64-linux-gnu build (which makes sense, since it doesn't touch anything for non-vxworks). Gerald Pfeifer has volunteered to apply the patch if approved. Also, in an earlier message [1] Janne Blomqvist mentioned that newer versions of VxWorks do have the variadic open(), and this is true. However, as far as I know, this version is still not available for kernel modules, only real-time processes. Thanks, Robert Mason Sorry, forgot to add in the link. [1]: http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01102.html
Re: [Patch] Add AC_ARG_ENABLE for libstdc++-v3
On 05/22/2012 04:37 PM, rbmj wrote: This patch adds an AC_ARG_ENABLE option to build/not build libstdc++-v3. I wrote the patch in order to allow the user to override the automatic disable for libstdc++-v3 for certain targets. Ping^2 on http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01525.html Robert Mason
Re: [Ping] Fix gcc/gcov.c and libgcc/libgcov.c to fix build on VxWorks
Il 11/06/2012 13:56, rbmj ha scritto: 1. VxWorks does not have a variadic open - it must receive three arguments. gcc/gcov.c however opens a file for reading and does not pass in a mode argument, which causes an error on vxWorks. This just adds a platform-based ifdef around this. I am aware that this is against coding conventions, and if that is an issue, I can think of two resolutions. [...] simply pass the extra mode argument in unconditionally, as it should be transparent to the function and ignored if it is variadic (I'm no expert on calling conventions though). Yes, please do this. 2. VxWorks has a two argument mkdir(). libgcc/libgcov.c uses mkdir() and calls it with two arguments if TARGET_POSIX_IO is defined and only one argument otherwise. I don't know what TARGET_POSIX_IO is, but if it's anything more than just mkdir(), this is probably correct, as vxworks is only partially POSIX compliant. Again, another platform-based ifdef, so if that is anathema, it can be replaced by more autoconf hackery. VxWorks should define TARGET_POSIX_IO if it has both access and mkdir. Please add it to gcc/config/vxworks.h if this is the case. Paolo
Re: [Patch] Add AC_ARG_ENABLE for libstdc++-v3
Il 11/06/2012 13:59, rbmj ha scritto: On 05/22/2012 04:37 PM, rbmj wrote: This patch adds an AC_ARG_ENABLE option to build/not build libstdc++-v3. I wrote the patch in order to allow the user to override the automatic disable for libstdc++-v3 for certain targets. Ping^2 on http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01525.html Ok. Paolo
[PATCH][C++] Fix PR53605
When I changed empty arrays domain to use a signed sizetype [0, -1] domain mangling of a empty-array-type broke because mangling adds an unsigned one to the signed -1 which causes an ICE (I chose to do that instead of shifting the range to [1, 0] because much more code relies on a zero lower bound ...). The following fixes that by using double-ints for the arithmetic and thus also does not generate a not needed tree node. Bootstrapped and tested on x86_64-unknown-linux-gnu, ok? Thanks, Richard. 2012-06-11 Richard Guenther rguent...@suse.de PR c++/53616 * mangle.c (write_array_type): Use double-ints for array domain arithmetic. * g++.dg/ext/pr53605.C: New testcase. Index: gcc/cp/mangle.c === --- gcc/cp/mangle.c (revision 188384) +++ gcc/cp/mangle.c (working copy) @@ -3119,8 +3119,10 @@ write_array_type (const tree type) { /* The ABI specifies that we should mangle the number of elements in the array, not the largest allowed index. */ - max = size_binop (PLUS_EXPR, max, size_one_node); - write_unsigned_number (tree_low_cst (max, 1)); + double_int dmax + = double_int_add (tree_to_double_int (max), double_int_one); + gcc_assert (double_int_fits_in_uhwi_p (dmax)); + write_unsigned_number (dmax.low); } else { Index: gcc/testsuite/g++.dg/ext/pr53605.C === --- gcc/testsuite/g++.dg/ext/pr53605.C (revision 0) +++ gcc/testsuite/g++.dg/ext/pr53605.C (revision 0) @@ -0,0 +1,16 @@ +// { dg-do compile } + +// Avoid -pedantic-error default +// { dg-options } + +template bool lhs_is_null_literal +class EqHelper { +public: +template typename T1, typename T2 + static int Compare( const T1 expected, +const T2 actual); +}; +void foo(){ +static const int kData[] = {}; +::EqHelperfalse::Compare(kData, abc); +}
Re: [PATCH] Hoist adjacent pointer loads
On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote: On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: Hi Richard, Here's a revision of the hoist-adjacent-loads patch. I'm sorry for the delay since the last revision, but my performance testing has been blocked waiting for a fix to PR53487. I ended up applying a test version of the patch to 4.7 and ran performance numbers with that instead, with no degradations. In addition to addressing your comments, this patch contains one bug fix where local_mem_dependence was called on the wrong blocks after swapping def1 and def2. Bootstrapped with no regressions on powerpc64-unknown-linux-gnu. Is this version ok for trunk? I won't commit it until I can do final testing on trunk in conjunction with a fix for PR53487. Thanks, Bill 2012-06-04 Bill Schmidt wschm...@linux.vnet.ibm.com * opts.c: Add -fhoist_adjacent_loads to -O2 and above. * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward declaration. (hoist_adjacent_loads, gate_hoist_loads): New forward declarations. (tree_ssa_phiopt): Call gate_hoist_loads. (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call. (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call hoist_adjacent_loads. (local_mem_dependence): New function. (hoist_adjacent_loads): Likewise. (gate_hoist_loads): Likewise. * common.opt (fhoist-adjacent-loads): New switch. * Makefile.in (tree-ssa-phiopt.o): Added dependencies. * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param. Index: gcc/opts.c === --- gcc/opts.c (revision 187805) +++ gcc/opts.c (working copy) @@ -489,6 +489,7 @@ static const struct default_options default_option { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 }, { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 }, +{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 }, /* -O3 optimizations. */ { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 }, Index: gcc/tree-ssa-phiopt.c === --- gcc/tree-ssa-phiopt.c (revision 187805) +++ gcc/tree-ssa-phiopt.c (working copy) @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3. If not see #include cfgloop.h #include tree-data-ref.h #include tree-pretty-print.h +#include gimple-pretty-print.h +#include insn-config.h +#include expr.h +#include optabs.h +#ifndef HAVE_conditional_move +#define HAVE_conditional_move (0) +#endif + static unsigned int tree_ssa_phiopt (void); -static unsigned int tree_ssa_phiopt_worker (bool); +static unsigned int tree_ssa_phiopt_worker (bool, bool); static bool conditional_replacement (basic_block, basic_block, edge, edge, gimple, tree, tree); static int value_replacement (basic_block, basic_block, @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block); static struct pointer_set_t * get_non_trapping (void); static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree); +static void hoist_adjacent_loads (basic_block, basic_block, + basic_block, basic_block); +static bool gate_hoist_loads (void); /* This pass tries to replaces an if-then-else block with an assignment. We have four kinds of transformations. Some of these @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_ bb2: x = PHI x' (bb0), ...; - A similar transformation is done for MAX_EXPR. */ + A similar transformation is done for MAX_EXPR. + + This pass also performs a fifth transformation of a slightly different + flavor. + + Adjacent Load Hoisting + -- + + This transformation replaces + + bb0: + if (...) goto bb2; else goto bb1; + bb1: + x1 = (expr).field1; + goto bb3; + bb2: + x2 = (expr).field2; + bb3: + # x = PHI x1, x2; + + with + + bb0: + x1 = (expr).field1; + x2 = (expr).field2; + if (...) goto bb2; else goto bb1; + bb1: + goto bb3; + bb2: + bb3: + # x = PHI x1, x2; + + The purpose of this transformation is to enable generation of conditional + move instructions such as Intel CMOVE or PowerPC ISEL. Because one of + the loads is speculative, the transformation is restricted to very + specific cases to avoid introducing a page fault. We are looking for +
Re: [PATCH] Hoist adjacent pointer loads
On Mon, 11 Jun 2012, William J. Schmidt wrote: On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote: On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: Hi Richard, Here's a revision of the hoist-adjacent-loads patch. I'm sorry for the delay since the last revision, but my performance testing has been blocked waiting for a fix to PR53487. I ended up applying a test version of the patch to 4.7 and ran performance numbers with that instead, with no degradations. In addition to addressing your comments, this patch contains one bug fix where local_mem_dependence was called on the wrong blocks after swapping def1 and def2. Bootstrapped with no regressions on powerpc64-unknown-linux-gnu. Is this version ok for trunk? I won't commit it until I can do final testing on trunk in conjunction with a fix for PR53487. Thanks, Bill 2012-06-04 Bill Schmidt wschm...@linux.vnet.ibm.com * opts.c: Add -fhoist_adjacent_loads to -O2 and above. * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward declaration. (hoist_adjacent_loads, gate_hoist_loads): New forward declarations. (tree_ssa_phiopt): Call gate_hoist_loads. (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call. (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call hoist_adjacent_loads. (local_mem_dependence): New function. (hoist_adjacent_loads): Likewise. (gate_hoist_loads): Likewise. * common.opt (fhoist-adjacent-loads): New switch. * Makefile.in (tree-ssa-phiopt.o): Added dependencies. * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param. Index: gcc/opts.c === --- gcc/opts.c (revision 187805) +++ gcc/opts.c (working copy) @@ -489,6 +489,7 @@ static const struct default_options default_option { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 }, { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 }, +{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 }, /* -O3 optimizations. */ { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 }, Index: gcc/tree-ssa-phiopt.c === --- gcc/tree-ssa-phiopt.c (revision 187805) +++ gcc/tree-ssa-phiopt.c (working copy) @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3. If not see #include cfgloop.h #include tree-data-ref.h #include tree-pretty-print.h +#include gimple-pretty-print.h +#include insn-config.h +#include expr.h +#include optabs.h +#ifndef HAVE_conditional_move +#define HAVE_conditional_move (0) +#endif + static unsigned int tree_ssa_phiopt (void); -static unsigned int tree_ssa_phiopt_worker (bool); +static unsigned int tree_ssa_phiopt_worker (bool, bool); static bool conditional_replacement (basic_block, basic_block, edge, edge, gimple, tree, tree); static int value_replacement (basic_block, basic_block, @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block); static struct pointer_set_t * get_non_trapping (void); static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree); +static void hoist_adjacent_loads (basic_block, basic_block, + basic_block, basic_block); +static bool gate_hoist_loads (void); /* This pass tries to replaces an if-then-else block with an assignment. We have four kinds of transformations. Some of these @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_ bb2: x = PHI x' (bb0), ...; - A similar transformation is done for MAX_EXPR. */ + A similar transformation is done for MAX_EXPR. + + This pass also performs a fifth transformation of a slightly different + flavor. + + Adjacent Load Hoisting + -- + + This transformation replaces + + bb0: + if (...) goto bb2; else goto bb1; + bb1: + x1 = (expr).field1; + goto bb3; + bb2: + x2 = (expr).field2; + bb3: + # x = PHI x1, x2; + + with + + bb0: + x1 = (expr).field1; + x2 = (expr).field2; + if (...) goto bb2; else goto bb1; + bb1: + goto bb3; + bb2: + bb3: + # x = PHI x1, x2; + + The purpose of this transformation is to enable generation of conditional + move instructions such as Intel CMOVE
[RFC] Target-specific limits on vector alignment
The ARM ABI states that vectors larger than 64 bits in size still have 64-bit alignment; never-the-less, the HW supports alignment hints of up to 128-bits in some cases and will trap in a vector has an alignment that less than the hint. GCC currently hard-codes larger vectors to be aligned by the size of the vector, which means that 128-bit vectors are marked as being 128-bit aligned. The ARM ABI unfortunately does not support generating such alignment for parameters passed by value and this can lead to traps at run time. It seems that the best way to solve this problem is to allow the back-end to set an upper limit on the alignment permitted for a vector. I've implemented this as a separate hook, rather than using the existing hooks because there's a strong likelihood of breaking some existing ABIs if I did it another way. There are a couple of tests that will need some re-working before this can be committed to deal with the fall-out of making this change; I'll prepare those changes if this patch is deemed generally acceptable. R. * target.def (TARGET_VECTOR_ALIGNMENT): New hook. * doc/tm.texi.in (TARGET_VECTOR_ALIGNMENT): Likewise. * doc/tm.texi: Regenerate. * targhooks.c (default_vector_alignment): New function. * arm.c (arm_vector_alignment): New function. (TARGET_VECTOR_ALIGNMENT): Define.--- config/arm/arm.c(revision 187425) +++ config/arm/arm.c(local) @@ -259,6 +259,8 @@ static bool arm_array_mode_supported_p ( unsigned HOST_WIDE_INT); static enum machine_mode arm_preferred_simd_mode (enum machine_mode); static bool arm_class_likely_spilled_p (reg_class_t); +static HOST_WIDE_INT arm_vector_alignment (const_tree type, + HOST_WIDE_INT align); static bool arm_vector_alignment_reachable (const_tree type, bool is_packed); static bool arm_builtin_support_vector_misalignment (enum machine_mode mode, const_tree type, @@ -607,6 +609,9 @@ static const struct attribute_spec arm_a #undef TARGET_CLASS_LIKELY_SPILLED_P #define TARGET_CLASS_LIKELY_SPILLED_P arm_class_likely_spilled_p +#undef TARGET_VECTOR_ALIGNMENT +#define TARGET_VECTOR_ALIGNMENT arm_vector_alignment + #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \ arm_vector_alignment_reachable @@ -24850,6 +25292,15 @@ arm_have_conditional_execution (void) return !TARGET_THUMB1; } +/* The AAPCS sets the maximum alignment of a vector to 64 bits. */ +static HOST_WIDE_INT +arm_vector_alignment (const_tree type ATTRIBUTE_UNUSED, HOST_WIDE_INT align) +{ + if (TARGET_AAPCS_BASED) +return MIN (align, 64); + return align; +} + static unsigned int arm_autovectorize_vector_sizes (void) { --- doc/tm.texi (revision 187425) +++ doc/tm.texi (local) @@ -1099,6 +1099,12 @@ make it all fit in fewer cache lines. If the value of this macro has a type, it should be an unsigned type. @end defmac +@deftypefn {Target Hook} HOST_WIDE_INT TARGET_VECTOR_ALIGNMENT (const_tree @var{type}, HOST_WIDE_INT @var{align}) +This hook can be used to limit the alignment for a vector of type +@var{type}, in order to comply with a platform ABI. The default is to +return @var{align}. +@end deftypefn + @defmac STACK_SLOT_ALIGNMENT (@var{type}, @var{mode}, @var{basic-align}) If defined, a C expression to compute the alignment for stack slot. @var{type} is the data type, @var{mode} is the widest mode available, --- doc/tm.texi.in (revision 187425) +++ doc/tm.texi.in (local) @@ -1087,6 +1087,8 @@ make it all fit in fewer cache lines. If the value of this macro has a type, it should be an unsigned type. @end defmac +@hook TARGET_VECTOR_ALIGNMENT + @defmac STACK_SLOT_ALIGNMENT (@var{type}, @var{mode}, @var{basic-align}) If defined, a C expression to compute the alignment for stack slot. @var{type} is the data type, @var{mode} is the widest mode available, --- target.def (revision 187425) +++ target.def (local) @@ -1615,6 +1615,14 @@ DEFHOOK bool, (enum machine_mode mode), hook_bool_mode_false) +DEFHOOK +(vector_alignment, + This hook can be used to limit the alignment for a vector of type\n\ +@var{type}, in order to comply with a platform ABI. The default is to\n\ +return @var{align}., + HOST_WIDE_INT, (const_tree type, HOST_WIDE_INT align), + default_vector_alignment) + /* True if we should try to use a scalar mode to represent an array, overriding the usual MAX_FIXED_MODE limit. */ DEFHOOK --- targhooks.c (revision 187425) +++ targhooks.c (local) @@ -939,6 +939,13 @@ tree default_mangle_decl_assembler_name return id; } +HOST_WIDE_INT +default_vector_alignment (const_tree type ATTRIBUTE_UNUSED, + HOST_WIDE_INT align) +{ + return align; +} + bool default_builtin_vector_alignment_reachable (const_tree type, bool is_packed) {
Re: [PATCH][C++] Fix PR53605
OK. Jason
Re: [PATCH] Correct cost model for strided loads
On Mon, 2012-06-11 at 11:15 +0200, Richard Guenther wrote: On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: The fix for PR53331 caused a degradation to 187.facerec on powerpc64-unknown-linux-gnu. The following simple patch reverses the degradation without otherwise affecting SPEC cpu2000 or cpu2006. Bootstrapped and regtested on that platform with no new regressions. Ok for trunk? Well, would the real cost not be subparts * scalar_to_vec plus subparts * vec_perm? At least vec_perm isn't the cost for building up a vector from N scalar elements either (it might be close enough if subparts == 2). What's the case with facerec here? Does it have subparts == 2? In this case, subparts == 4 (32-bit floats, 128-bit vec reg). On PowerPC, this requires two merge instructions and a permute instruction to get the four 32-bit quantities into the right place in a 128-bit register. Currently this is modeled as a vec_perm in other parts of the vectorizer per Ira's earlier patches, so I naively changed this to do the same thing. The types of vectorizer instructions aren't documented, and I can't infer much from the i386.c cost model, so I need a little education. What semantics are represented by scalar_to_vec? On PowerPC, we have this mapping of the floating-point registers and vector float registers where they overlap (the low-order half of each of the first 32 vector float regs corresponds to a scalar float reg). So in this case we have four scalar loads that place things in the bottom half of four vector registers, two vector merge instructions that collapse the four registers into two vector registers, and a vector permute that gets things in the right order.(*) I wonder if what we refer to as a merge instruction is similar to scalar_to_vec. If so, then maybe we need something like subparts = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); inside_cost += vect_get_stmt_cost (scalar_load) * ncopies * subparts; inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec) * subparts / 2; inside_cost += ncopies * vect_get_stmt_cost (vec_perm); But then we'd have to change how vec_perm is modeled elsewhere for PowerPC based on Ira's earlier patches. As I said, it's difficult for me to figure out all the intent of cost model decisions that have been made in the past, using current documentation. I really wanted to pessimize this case for say AVX and char elements, thus building up a vector from 32 scalars which certainly does not cost a mere vec_perm. So, maybe special-case the subparts == 2 case and assume vec_perm would match the cost only in that case. (I'm a little confused by this as what you have at the moment is a single scalar_to_vec per copy, which has a cost of 1 on most i386 targets (occasionally 2). The subparts multiplier is only applied to the loads. So changing this to vec_perm seemed to be a no-op for i386.) (*) There are actually a couple more instructions here to convert 64-bit values to 32-bit values, since on PowerPC 32-bit loads are converted to 64-bit values in scalar float registers and they have to be coerced back to 32-bit. Very ugly. The cost model currently doesn't represent this at all, which I'll have to look at fixing at some point in some way that isn't too nasty for the other targets. The cost model for PowerPC seems to need a lot of TLC. Thanks, Bill Thanks, Richard. Thanks, Bill 2012-06-10 Bill Schmidt wschm...@linux.ibm.com * tree-vect-stmts.c (vect_model_load_cost): Change cost model for strided loads. Index: gcc/tree-vect-stmts.c === --- gcc/tree-vect-stmts.c (revision 188341) +++ gcc/tree-vect-stmts.c (working copy) @@ -1031,11 +1031,10 @@ vect_model_load_cost (stmt_vec_info stmt_info, int /* The loads themselves. */ if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) { - /* N scalar loads plus gathering them into a vector. - ??? scalar_to_vec isn't the cost for that. */ + /* N scalar loads plus gathering them into a vector. */ inside_cost += (vect_get_stmt_cost (scalar_load) * ncopies * TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))); - inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec); + inside_cost += ncopies * vect_get_stmt_cost (vec_perm); } else vect_get_load_cost (first_dr, ncopies,
Re: [PATCH] Correct cost model for strided loads
On Mon, 11 Jun 2012, William J. Schmidt wrote: On Mon, 2012-06-11 at 11:15 +0200, Richard Guenther wrote: On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: The fix for PR53331 caused a degradation to 187.facerec on powerpc64-unknown-linux-gnu. The following simple patch reverses the degradation without otherwise affecting SPEC cpu2000 or cpu2006. Bootstrapped and regtested on that platform with no new regressions. Ok for trunk? Well, would the real cost not be subparts * scalar_to_vec plus subparts * vec_perm? At least vec_perm isn't the cost for building up a vector from N scalar elements either (it might be close enough if subparts == 2). What's the case with facerec here? Does it have subparts == 2? In this case, subparts == 4 (32-bit floats, 128-bit vec reg). On PowerPC, this requires two merge instructions and a permute instruction to get the four 32-bit quantities into the right place in a 128-bit register. Currently this is modeled as a vec_perm in other parts of the vectorizer per Ira's earlier patches, so I naively changed this to do the same thing. I see. The types of vectorizer instructions aren't documented, and I can't infer much from the i386.c cost model, so I need a little education. What semantics are represented by scalar_to_vec? It's a vector splat, thus x - { x, x, x, ... }. You can create { x, y, z, ... } by N such splats plus N - 1 permutes (if a permute, as VEC_PERM_EXPR, takes two input vectors). That's by far not the most efficient way to build up such a vector of course (with AVX you could do one splat plus N - 1 inserts for example). The cost is of course dependent on the number of vector elements, so a simple new enum vect_cost_for_stmt kind does not cover it but the target would have to look at the vector type passed and do some reasonable guess. On PowerPC, we have this mapping of the floating-point registers and vector float registers where they overlap (the low-order half of each of the first 32 vector float regs corresponds to a scalar float reg). So in this case we have four scalar loads that place things in the bottom half of four vector registers, two vector merge instructions that collapse the four registers into two vector registers, and a vector permute that gets things in the right order.(*) I wonder if what we refer to as a merge instruction is similar to scalar_to_vec. Looks similar to x86 SSE then. If so, then maybe we need something like subparts = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); inside_cost += vect_get_stmt_cost (scalar_load) * ncopies * subparts; inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec) * subparts / 2; inside_cost += ncopies * vect_get_stmt_cost (vec_perm); But then we'd have to change how vec_perm is modeled elsewhere for PowerPC based on Ira's earlier patches. As I said, it's difficult for me to figure out all the intent of cost model decisions that have been made in the past, using current documentation. Heh, usually the intent was to make the changes simple, not to compute a proper cost. I think we simply need a new scalars_to_vec cost kind. I really wanted to pessimize this case for say AVX and char elements, thus building up a vector from 32 scalars which certainly does not cost a mere vec_perm. So, maybe special-case the subparts == 2 case and assume vec_perm would match the cost only in that case. (I'm a little confused by this as what you have at the moment is a single scalar_to_vec per copy, which has a cost of 1 on most i386 targets (occasionally 2). The subparts multiplier is only applied to the loads. So changing this to vec_perm seemed to be a no-op for i386.) Oh, I somehow read the patch as you were removing the multiplication by TYPE_VECTOR_SUBPARTS. But yes, the cost is way off and I'd wanted to reflect it with N scalar loads plus N splats plus N - 1 permutes originally. You could also model it with N scalar loads plus N inserts (but we don't have a vec_insert cost either). I think adding a scalars_to_vec or vec_init or however we want to call it - basically what the cost of a vector CONSTRUCTOR would be - is best. (*) There are actually a couple more instructions here to convert 64-bit values to 32-bit values, since on PowerPC 32-bit loads are converted to 64-bit values in scalar float registers and they have to be coerced back to 32-bit. Very ugly. The cost model currently doesn't represent this at all, which I'll have to look at fixing at some point in some way that isn't too nasty for the other targets. The cost model for PowerPC seems to need a lot of TLC. Maybe the above would be a possible way to do it. On x86 for example a vector of two doubles is extremely cheap for generic SSE2 to construct, we can directly load into the low/high part, thus when we use it as N scalar loads plus the vector-build the
[PATCH] Fix PR53470
This fixes PR53470 - I for quite some time wanted to remove that conditional clearing of TYPE_CONTEXT from free_lang_data_in_type but failed to do so because we regress. I've debugged it down to the C frontend having sometimes BLOCK as TYPE_CONTEXT for a type. So, simply replace such BLOCK with the nearest non-BLOCK we can get at. Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk. Richard. 2012-06-11 Richard Guenther rguent...@suse.de PR middle-end/53470 * tree.c (free_lang_data_in_type): Do not clear TYPE_CONTEXT but replace it with the first non-BLOCK context. * g++.dg/lto/pr53470_0.C: New testcase. * gcc.dg/lto/pr53470_0.c: Likewise. Index: gcc/tree.c === --- gcc/tree.c (revision 188384) +++ gcc/tree.c (working copy) @@ -4575,11 +4575,17 @@ free_lang_data_in_type (tree type) free_lang_data_in_one_sizepos (TYPE_SIZE (type)); free_lang_data_in_one_sizepos (TYPE_SIZE_UNIT (type)); - if (debug_info_level DINFO_LEVEL_TERSE - || (TYPE_CONTEXT (type) - TREE_CODE (TYPE_CONTEXT (type)) != FUNCTION_DECL - TREE_CODE (TYPE_CONTEXT (type)) != NAMESPACE_DECL)) -TYPE_CONTEXT (type) = NULL_TREE; + if (TYPE_CONTEXT (type) + TREE_CODE (TYPE_CONTEXT (type)) == BLOCK) +{ + tree ctx = TYPE_CONTEXT (type); + do + { + ctx = BLOCK_SUPERCONTEXT (ctx); + } + while (ctx TREE_CODE (ctx) == BLOCK); + TYPE_CONTEXT (type) = ctx; +} } Index: gcc/testsuite/g++.dg/lto/pr53470_0.C === --- gcc/testsuite/g++.dg/lto/pr53470_0.C(revision 0) +++ gcc/testsuite/g++.dg/lto/pr53470_0.C(revision 0) @@ -0,0 +1,26 @@ +// { dg-lto-do link } +// { dg-lto-options { { -g -flto } } } + +class sp_counted_base; +class shared_count { + sp_counted_base *pi_; +public: + templateclass Y shared_count(Y) : pi_() {} + ~shared_count() {} +}; +templateclass T struct shared_ptr { + T element_type; + templateclass Y shared_ptr(Y) : pn(0) {} + shared_count pn; +}; +templateclass class ECGetterBase; +templateclass T struct ExtensionCord { + struct Holder { +ECGetterBaseT *getter_; + }; + ExtensionCord() : holder_(new Holder) {} + + shared_ptrHolder holder_; +}; +ExtensionCordint a; +int main() {} Index: gcc/testsuite/gcc.dg/lto/pr53470_0.c === --- gcc/testsuite/gcc.dg/lto/pr53470_0.c(revision 0) +++ gcc/testsuite/gcc.dg/lto/pr53470_0.c(revision 0) @@ -0,0 +1,9 @@ +/* { dg-lto-do link } */ +/* { dg-lto-options { { -flto } { -flto -g } } } */ + +int main () +{ + { +union A { } v; + } +}
Re: [RFC] Target-specific limits on vector alignment
On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote: The ARM ABI states that vectors larger than 64 bits in size still have 64-bit alignment; never-the-less, the HW supports alignment hints of up to 128-bits in some cases and will trap in a vector has an alignment that less than the hint. GCC currently hard-codes larger vectors to be aligned by the size of the vector, which means that 128-bit vectors are marked as being 128-bit aligned. The ARM ABI unfortunately does not support generating such alignment for parameters passed by value and this can lead to traps at run time. It seems that the best way to solve this problem is to allow the back-end to set an upper limit on the alignment permitted for a vector. I've implemented this as a separate hook, rather than using the existing hooks because there's a strong likelihood of breaking some existing ABIs if I did it another way. There are a couple of tests that will need some re-working before this can be committed to deal with the fall-out of making this change; I'll prepare those changes if this patch is deemed generally acceptable. Hm. Where would you use that target hook? Richard. R. * target.def (TARGET_VECTOR_ALIGNMENT): New hook. * doc/tm.texi.in (TARGET_VECTOR_ALIGNMENT): Likewise. * doc/tm.texi: Regenerate. * targhooks.c (default_vector_alignment): New function. * arm.c (arm_vector_alignment): New function. (TARGET_VECTOR_ALIGNMENT): Define.
RFA: Fix path based disambiguation in RTL oracle
Hi, I haven't yet checked in my patch from last week about not creating alias-set conflicts for stack slot sharing mostly due to the caveat I mentioned. Namely that the RTL disambiguator still uses path-based means (which belong to type-based aliasing) even in the write_dependence tests. That's really wrong, but I couldn't (and I think can't) create a testcase using stack slot sharing that would fail with the current unchanged compiler. Luckily, we don't need stack slot sharing to demonstrate the bug. The testcase below will fail on x86_64 with -O2 -fschedule-insns. I've added it to the torture so it has a chance to fail on other targets too (e.g. on i686 it doesn't because it doesn't want to do the wrong swap of two insns). I've retained the path based disambiguation in the true_dependence variants, where it is okay according to our mem model. There are some cleanup opportunities in checking what the O(N^2) algorithm of nonoverlapping_component_refs_p catches that rtx_refs_may_alias_p doesn't. We know already some cases where MEM_OFFSET_KNOWN_P is set too conservatively for the latter to work on some cases. In any case that would be a follow-up. So, patch fixes testcase and is in regstrap on x86_64-linux. Okay if that passes? Ciao, Michael. * alias.c (nonoverlapping_component_refs_p): Take two rtx arguments. (nonoverlapping_memrefs_p): Don't call it here ... (true_dependence_1): ... but here. testsuite/ * gcc.dg/torture/alias-1.c: New test. Index: alias.c === --- alias.c (revision 188384) +++ alias.c (working copy) @@ -156,7 +156,7 @@ static rtx find_base_value (rtx); static int mems_in_disjoint_alias_sets_p (const_rtx, const_rtx); static int insert_subset_children (splay_tree_node, void*); static alias_set_entry get_alias_set_entry (alias_set_type); -static bool nonoverlapping_component_refs_p (const_tree, const_tree); +static bool nonoverlapping_component_refs_p (const_rtx, const_rtx); static tree decl_for_component_ref (tree); static int write_dependence_p (const_rtx, const_rtx, int); @@ -2184,11 +2184,15 @@ read_dependence (const_rtx mem, const_rt overlap for any pair of objects. */ static bool -nonoverlapping_component_refs_p (const_tree x, const_tree y) +nonoverlapping_component_refs_p (const_rtx rtlx, const_rtx rtly) { + const_tree x = MEM_EXPR (rtlx), y = MEM_EXPR (rtly); const_tree fieldx, fieldy, typex, typey, orig_y; - if (!flag_strict_aliasing) + if (!flag_strict_aliasing + || !x || !y + || TREE_CODE (x) != COMPONENT_REF + || TREE_CODE (y) != COMPONENT_REF) return false; do @@ -2307,13 +2311,6 @@ nonoverlapping_memrefs_p (const_rtx x, c ! MEM_OFFSET_KNOWN_P (y))) return 0; - /* If both are field references, we may be able to determine something. */ - if (TREE_CODE (exprx) == COMPONENT_REF - TREE_CODE (expry) == COMPONENT_REF - nonoverlapping_component_refs_p (exprx, expry)) -return 1; - - /* If the field reference test failed, look at the DECLs involved. */ moffsetx_known_p = MEM_OFFSET_KNOWN_P (x); if (moffsetx_known_p) @@ -2519,6 +2516,9 @@ true_dependence_1 (const_rtx mem, enum m if (nonoverlapping_memrefs_p (mem, x, false)) return 0; + if (nonoverlapping_component_refs_p (mem, x)) +return 0; + return rtx_refs_may_alias_p (x, mem, true); } Index: testsuite/gcc.dg/torture/alias-1.c === --- testsuite/gcc.dg/torture/alias-1.c (revision 0) +++ testsuite/gcc.dg/torture/alias-1.c (revision 0) @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-options -fschedule-insns } */ + +extern void abort (void) __attribute__((noreturn)); + +struct B { int a; int b;}; +struct wrapper { +union setconflict +{ + struct S { char one1; struct B b1; } s; + struct T { struct B b2; char two2; } t; +} a; +}; + +int +main () +{ + int sum = 0; + int i; + struct wrapper w; + struct B *p; + + p = w.a.s.b1; + asm (: =r (p):0 (p)); + p-a = 0; + asm (: =r (p):0 (p)); + sum += p-a; + + p = w.a.t.b2; + asm (: =r (p):0 (p)); + p-b = 1; + asm (: =r (p):0 (p)); + sum += p-b; + + if (sum != 1) +abort(); + return 0; +}
Re: RFA: Fix path based disambiguation in RTL oracle
On Mon, Jun 11, 2012 at 4:19 PM, Michael Matz m...@suse.de wrote: Hi, I haven't yet checked in my patch from last week about not creating alias-set conflicts for stack slot sharing mostly due to the caveat I mentioned. Namely that the RTL disambiguator still uses path-based means (which belong to type-based aliasing) even in the write_dependence tests. That's really wrong, but I couldn't (and I think can't) create a testcase using stack slot sharing that would fail with the current unchanged compiler. Luckily, we don't need stack slot sharing to demonstrate the bug. The testcase below will fail on x86_64 with -O2 -fschedule-insns. I've added it to the torture so it has a chance to fail on other targets too (e.g. on i686 it doesn't because it doesn't want to do the wrong swap of two insns). I've retained the path based disambiguation in the true_dependence variants, where it is okay according to our mem model. There are some cleanup opportunities in checking what the O(N^2) algorithm of nonoverlapping_component_refs_p catches that rtx_refs_may_alias_p doesn't. We know already some cases where MEM_OFFSET_KNOWN_P is set too conservatively for the latter to work on some cases. In any case that would be a follow-up. So, patch fixes testcase and is in regstrap on x86_64-linux. Okay if that passes? Ok. Thanks, Richard. Ciao, Michael. * alias.c (nonoverlapping_component_refs_p): Take two rtx arguments. (nonoverlapping_memrefs_p): Don't call it here ... (true_dependence_1): ... but here. testsuite/ * gcc.dg/torture/alias-1.c: New test. Index: alias.c === --- alias.c (revision 188384) +++ alias.c (working copy) @@ -156,7 +156,7 @@ static rtx find_base_value (rtx); static int mems_in_disjoint_alias_sets_p (const_rtx, const_rtx); static int insert_subset_children (splay_tree_node, void*); static alias_set_entry get_alias_set_entry (alias_set_type); -static bool nonoverlapping_component_refs_p (const_tree, const_tree); +static bool nonoverlapping_component_refs_p (const_rtx, const_rtx); static tree decl_for_component_ref (tree); static int write_dependence_p (const_rtx, const_rtx, int); @@ -2184,11 +2184,15 @@ read_dependence (const_rtx mem, const_rt overlap for any pair of objects. */ static bool -nonoverlapping_component_refs_p (const_tree x, const_tree y) +nonoverlapping_component_refs_p (const_rtx rtlx, const_rtx rtly) { + const_tree x = MEM_EXPR (rtlx), y = MEM_EXPR (rtly); const_tree fieldx, fieldy, typex, typey, orig_y; - if (!flag_strict_aliasing) + if (!flag_strict_aliasing + || !x || !y + || TREE_CODE (x) != COMPONENT_REF + || TREE_CODE (y) != COMPONENT_REF) return false; do @@ -2307,13 +2311,6 @@ nonoverlapping_memrefs_p (const_rtx x, c ! MEM_OFFSET_KNOWN_P (y))) return 0; - /* If both are field references, we may be able to determine something. */ - if (TREE_CODE (exprx) == COMPONENT_REF - TREE_CODE (expry) == COMPONENT_REF - nonoverlapping_component_refs_p (exprx, expry)) - return 1; - - /* If the field reference test failed, look at the DECLs involved. */ moffsetx_known_p = MEM_OFFSET_KNOWN_P (x); if (moffsetx_known_p) @@ -2519,6 +2516,9 @@ true_dependence_1 (const_rtx mem, enum m if (nonoverlapping_memrefs_p (mem, x, false)) return 0; + if (nonoverlapping_component_refs_p (mem, x)) + return 0; + return rtx_refs_may_alias_p (x, mem, true); } Index: testsuite/gcc.dg/torture/alias-1.c === --- testsuite/gcc.dg/torture/alias-1.c (revision 0) +++ testsuite/gcc.dg/torture/alias-1.c (revision 0) @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-options -fschedule-insns } */ + +extern void abort (void) __attribute__((noreturn)); + +struct B { int a; int b;}; +struct wrapper { +union setconflict +{ + struct S { char one1; struct B b1; } s; + struct T { struct B b2; char two2; } t; +} a; +}; + +int +main () +{ + int sum = 0; + int i; + struct wrapper w; + struct B *p; + + p = w.a.s.b1; + asm (: =r (p):0 (p)); + p-a = 0; + asm (: =r (p):0 (p)); + sum += p-a; + + p = w.a.t.b2; + asm (: =r (p):0 (p)); + p-b = 1; + asm (: =r (p):0 (p)); + sum += p-b; + + if (sum != 1) + abort(); + return 0; +}
[PATCH][2/n] alias.c TLC
This makes ao_ref_from_mem less conservative if either MEM_OFFSET or MEM_SIZE is not set. From other alias.c code and set_mem_attributes_minus_bitpos one has to conclude that MEM_EXPR is always conservatively correct (it only can cover a larger memory area) and the MEM_OFFSET / MEM_SIZE pair can refine it. Thus, we can make ao_ref_from_mem less conservative when, for example, faced with variable array accesses which set_mem_attributes_minus_bitpos ends up representing with an unknown MEM_OFFSET. Bootstrap and regtest running on x86_64-unknown-linux-gnu. Richard. 2012-06-11 Richard Guenther rguent...@suse.de * emit-rtl.c (set_mem_attributes_minus_bitpos): Remove dead code. * alias.c (ao_ref_from_mem): MEM_EXPR is conservative, MEM_OFFSET and MEM_SIZE only refines it. Reflect that and be less conservative if either of the latter is not known. Index: gcc/emit-rtl.c === *** gcc/emit-rtl.c (revision 188384) --- gcc/emit-rtl.c (working copy) *** set_mem_attributes_minus_bitpos (rtx ref *** 1838,1852 /* ??? Any reason the field size would be different than the size we got from the type? */ } - - /* If this is an indirect reference, record it. */ - else if (TREE_CODE (t) == MEM_REF) - { - attrs.expr = t; - attrs.offset_known_p = true; - attrs.offset = 0; - apply_bitpos = bitpos; - } } /* If this is an indirect reference, record it. */ --- 1838,1843 Index: gcc/alias.c === *** gcc/alias.c (revision 188384) --- gcc/alias.c (working copy) *** ao_ref_from_mem (ao_ref *ref, const_rtx *** 326,342 ref-ref_alias_set = MEM_ALIAS_SET (mem); ! /* If MEM_OFFSET or MEM_SIZE are unknown we have to punt. ! Keep points-to related information though. */ if (!MEM_OFFSET_KNOWN_P (mem) || !MEM_SIZE_KNOWN_P (mem)) ! { ! ref-ref = NULL_TREE; ! ref-offset = 0; ! ref-size = -1; ! ref-max_size = -1; ! return true; ! } /* If the base decl is a parameter we can have negative MEM_OFFSET in case of promoted subregs on bigendian targets. Trust the MEM_EXPR --- 326,336 ref-ref_alias_set = MEM_ALIAS_SET (mem); ! /* If MEM_OFFSET or MEM_SIZE are unknown what we got from MEM_EXPR ! is conservative, so trust it. */ if (!MEM_OFFSET_KNOWN_P (mem) || !MEM_SIZE_KNOWN_P (mem)) ! return true; /* If the base decl is a parameter we can have negative MEM_OFFSET in case of promoted subregs on bigendian targets. Trust the MEM_EXPR *** ao_ref_from_mem (ao_ref *ref, const_rtx *** 345,350 --- 339,347 (MEM_SIZE (mem) + MEM_OFFSET (mem)) * BITS_PER_UNIT == ref-size) return true; + /* Otherwise continue and refine size and offset we got from analyzing + MEM_EXPR by using MEM_SIZE and MEM_OFFSET. */ + ref-offset += MEM_OFFSET (mem) * BITS_PER_UNIT; ref-size = MEM_SIZE (mem) * BITS_PER_UNIT;
Re: [PATCH] Add vector cost model density heuristic
On Mon, 2012-06-11 at 13:40 +0200, Richard Guenther wrote: On Fri, 8 Jun 2012, William J. Schmidt wrote: This patch adds a heuristic to the vectorizer when estimating the minimum profitable number of iterations. The heuristic is target-dependent, and is currently disabled for all targets except PowerPC. However, the intent is to make it general enough to be useful for other targets that want to opt in. A previous patch addressed some PowerPC SPEC degradations by modifying the vector cost model values for vec_perm and vec_promote_demote. The values were set a little higher than their natural values because the natural values were not sufficient to prevent a poor vectorization choice. However, this is not the right long-term solution, since it can unnecessarily constrain other vectorization choices involving permute instructions. Analysis of the badly vectorized loop (in sphinx3) showed that the problem was overcommitment of vector resources -- too many vector instructions issued without enough non-vector instructions available to cover the delays. The vector cost model assumes that instructions always have a constant cost, and doesn't have a way of judging this kind of density of vector instructions. The present patch adds a heuristic to recognize when a loop is likely to overcommit resources, and adds a small penalty to the inside-loop cost to account for the expected stalls. The heuristic is parameterized with three target-specific values: * Density threshold: The heuristic will apply only when the percentage of inside-loop cost attributable to vectorized instructions exceeds this value. * Size threshold: The heuristic will apply only when the inside-loop cost exceeds this value. * Penalty: The inside-loop cost will be increased by this percentage value when the heuristic applies. Thus only reasonably large loop bodies that are mostly vectorized instructions will be affected. By applying only a small percentage bump to the inside-loop cost, the heuristic will only turn off vectorization for loops that were considered barely profitable to begin with (such as the sphinx3 loop). So the heuristic is quite conservative and should not affect the vast majority of vectorization decisions. Together with the new heuristic, this patch reduces the vec_perm and vec_promote_demote costs for PowerPC to their natural values. I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu and verified that no performance regressions occur on SPEC cpu2006. Is this ok for trunk? Hmm. I don't like this patch or its general idea too much. Instead I'd like us to move more of the cost model detail to the target, giving it a chance to look at the whole loop before deciding on a cost. ISTR posting the overall idea at some point, but let me repeat it here instead of trying to find that e-mail. The basic interface of the cost model should be, in targetm.vectorize /* Tell the target to start cost analysis of a loop or a basic-block (if the loop argument is NULL). Returns an opaque pointer to target-private data. */ void *init_cost (struct loop *loop); /* Add cost for N vectorized-stmt-kind statements in vector_mode. */ void add_stmt_cost (void *data, unsigned n, vectorized-stmt-kind, enum machine_mode vector_mode); /* Tell the target to compute and return the cost of the accumulated statements and free any target-private data. */ unsigned finish_cost (void *data); with eventually slightly different signatures for add_stmt_cost (like pass in the original scalar stmt?). It allows the target, at finish_cost time, to evaluate things like register pressure and resource utilization. OK, I'm trying to understand how you would want this built into the present structure. Taking just the loop case for now: Judging by your suggested API, we would have to call add_stmt_cost () everywhere that we now call stmt_vinfo_set_inside_of_loop_cost (). For now this would be an additional call, not a replacement, though maybe the other goes away eventually. This allows the target to save more data about the vectorized instructions than just an accumulated cost number (order and quantity of various kinds of instructions can be maintained for better modeling). Presumably the call to finish_cost would be done within vect_estimate_min_profitable_iters () to produce the final value of inside_cost for the loop. The default target hook for add_stmt_cost would duplicate what we currently do for calculating the inside_cost of a statement, and the default target hook for finish_cost would just return the sum. I'll have to go hunting where the similar code would fit for SLP in a basic block. If I read you correctly, you don't object to a density heuristic such as the one I implemented here, but you want
Re: [RFC] Target-specific limits on vector alignment
On 11/06/12 15:17, Richard Guenther wrote: On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote: The ARM ABI states that vectors larger than 64 bits in size still have 64-bit alignment; never-the-less, the HW supports alignment hints of up to 128-bits in some cases and will trap in a vector has an alignment that less than the hint. GCC currently hard-codes larger vectors to be aligned by the size of the vector, which means that 128-bit vectors are marked as being 128-bit aligned. The ARM ABI unfortunately does not support generating such alignment for parameters passed by value and this can lead to traps at run time. It seems that the best way to solve this problem is to allow the back-end to set an upper limit on the alignment permitted for a vector. I've implemented this as a separate hook, rather than using the existing hooks because there's a strong likelihood of breaking some existing ABIs if I did it another way. There are a couple of tests that will need some re-working before this can be committed to deal with the fall-out of making this change; I'll prepare those changes if this patch is deemed generally acceptable. Hm. Where would you use that target hook? Doh! It was supposed to be in the patch set... :-( in layout_type(), where the alignment of a vector is forced to the size of the vector. R.--- stor-layout.c (revision 188348) +++ stor-layout.c (local) @@ -2131,9 +2131,11 @@ layout_type (tree type) TYPE_SIZE (type) = int_const_binop (MULT_EXPR, TYPE_SIZE (innertype), bitsize_int (nunits)); - /* Always naturally align vectors. This prevents ABI changes - depending on whether or not native vector modes are supported. */ - TYPE_ALIGN (type) = tree_low_cst (TYPE_SIZE (type), 0); + /* Naturally align vectors, but let the target set an upper + limit. This prevents ABI changes depending on whether or + not native vector modes are supported. */ + TYPE_ALIGN (type) + = targetm.vector_alignment (type, tree_low_cst (TYPE_SIZE (type), 0)); break; }
Re: [PATCH] Add vector cost model density heuristic
On Mon, 11 Jun 2012, William J. Schmidt wrote: On Mon, 2012-06-11 at 13:40 +0200, Richard Guenther wrote: On Fri, 8 Jun 2012, William J. Schmidt wrote: This patch adds a heuristic to the vectorizer when estimating the minimum profitable number of iterations. The heuristic is target-dependent, and is currently disabled for all targets except PowerPC. However, the intent is to make it general enough to be useful for other targets that want to opt in. A previous patch addressed some PowerPC SPEC degradations by modifying the vector cost model values for vec_perm and vec_promote_demote. The values were set a little higher than their natural values because the natural values were not sufficient to prevent a poor vectorization choice. However, this is not the right long-term solution, since it can unnecessarily constrain other vectorization choices involving permute instructions. Analysis of the badly vectorized loop (in sphinx3) showed that the problem was overcommitment of vector resources -- too many vector instructions issued without enough non-vector instructions available to cover the delays. The vector cost model assumes that instructions always have a constant cost, and doesn't have a way of judging this kind of density of vector instructions. The present patch adds a heuristic to recognize when a loop is likely to overcommit resources, and adds a small penalty to the inside-loop cost to account for the expected stalls. The heuristic is parameterized with three target-specific values: * Density threshold: The heuristic will apply only when the percentage of inside-loop cost attributable to vectorized instructions exceeds this value. * Size threshold: The heuristic will apply only when the inside-loop cost exceeds this value. * Penalty: The inside-loop cost will be increased by this percentage value when the heuristic applies. Thus only reasonably large loop bodies that are mostly vectorized instructions will be affected. By applying only a small percentage bump to the inside-loop cost, the heuristic will only turn off vectorization for loops that were considered barely profitable to begin with (such as the sphinx3 loop). So the heuristic is quite conservative and should not affect the vast majority of vectorization decisions. Together with the new heuristic, this patch reduces the vec_perm and vec_promote_demote costs for PowerPC to their natural values. I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu and verified that no performance regressions occur on SPEC cpu2006. Is this ok for trunk? Hmm. I don't like this patch or its general idea too much. Instead I'd like us to move more of the cost model detail to the target, giving it a chance to look at the whole loop before deciding on a cost. ISTR posting the overall idea at some point, but let me repeat it here instead of trying to find that e-mail. The basic interface of the cost model should be, in targetm.vectorize /* Tell the target to start cost analysis of a loop or a basic-block (if the loop argument is NULL). Returns an opaque pointer to target-private data. */ void *init_cost (struct loop *loop); /* Add cost for N vectorized-stmt-kind statements in vector_mode. */ void add_stmt_cost (void *data, unsigned n, vectorized-stmt-kind, enum machine_mode vector_mode); /* Tell the target to compute and return the cost of the accumulated statements and free any target-private data. */ unsigned finish_cost (void *data); with eventually slightly different signatures for add_stmt_cost (like pass in the original scalar stmt?). It allows the target, at finish_cost time, to evaluate things like register pressure and resource utilization. OK, I'm trying to understand how you would want this built into the present structure. Taking just the loop case for now: Judging by your suggested API, we would have to call add_stmt_cost () everywhere that we now call stmt_vinfo_set_inside_of_loop_cost (). For now this would be an additional call, not a replacement, though maybe the other goes away eventually. This allows the target to save more data about the vectorized instructions than just an accumulated cost number (order and quantity of various kinds of instructions can be maintained for better modeling). Presumably the call to finish_cost would be done within vect_estimate_min_profitable_iters () to produce the final value of inside_cost for the loop. Yes. I didn't look in detail at the difference between inner/outer costs though. Maybe that complicates things, maybe not. The default target hook for add_stmt_cost would duplicate what we currently do for calculating the inside_cost of a statement,
Re: [PATCH] Add vector cost model density heuristic
On Mon, 11 Jun 2012, David Edelsohn wrote: On Mon, Jun 11, 2012 at 7:40 AM, Richard Guenther rguent...@suse.de wrote: Hmm. I don't like this patch or its general idea too much. Instead I'd like us to move more of the cost model detail to the target, giving it a chance to look at the whole loop before deciding on a cost. ISTR posting the overall idea at some point, but let me repeat it here instead of trying to find that e-mail. Richard, Allowing the target to see the entire loop and have more control over the cost model is a fine goal, but the current code is using incorrect heuristics. Requiring a complete re-write of the auto-vectorizer cost model should not be a requirement for fixing the current code. If fixing the current code adds more magic hooks at the wrong spot then yes, in stage1 I can make you think about a more proper solution to the issue. Richard.
[AARCH64][PATCH] Fix legitimate address checking for TImode and TFmode
This patch fixes the behaviour of legitimate address checking for TImode and TFmode values. The current implementation does not correctly distinguish between the address modes available for ldp/stp X,X versus ldr/str Q. /Marcus ChangeLog: * config/aarch64/aarch64.c (aarch64_regno_ok_for_index_p): Handle NULL reg_renumber. (aarch64_regno_ok_for_base_p): Likewise. (offset_7bit_signed_scaled_p): New. (offset_9bit_signed_unscaled_p): New. (offset_12bit_unsigned_scaled_p): New. (aarch64_classify_address): Replace pair_p with allow_reg_index_p. Conservative test for valid TImode and TFmode addresses. Use offset_7bit_signed_scaled_p offset_9bit_signed_unscaled_p and offset_12bit_unsigned_scaled_p. Remove explicit TImode and TFmode tests. * config/aarch64/aarch64.md (movti_aarch64): Replace 'm' with 'Ump'. (movtf_aarch64): Replace 'm' with 'Ump', replace 'Utf' with 'm'. * config/aarch64/constraints.md (Utf): Remove. (Ump) Pass strict.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 34d04755900a75376c0afe5b9b4b8b3cd1ac84e2..b985d63c4fd591c7ae17d519a7b858c0264edec7 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -2441,6 +2441,10 @@ aarch64_regno_ok_for_index_p (int regno, { if (!strict_p) return true; + + if (!reg_renumber) + return false; + regno = reg_renumber[regno]; } return GP_REGNUM_P (regno); @@ -2456,6 +2460,10 @@ aarch64_regno_ok_for_base_p (int regno, { if (!strict_p) return true; + + if (!reg_renumber) + return false; + regno = reg_renumber[regno]; } @@ -2639,6 +2647,29 @@ aarch64_classify_index (struct aarch64_a return false; } +static inline bool +offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset) +{ + return (offset = -64 * GET_MODE_SIZE (mode) + offset 64 * GET_MODE_SIZE (mode) + offset % GET_MODE_SIZE (mode) == 0); +} + +static inline bool +offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED, + HOST_WIDE_INT offset) +{ + return offset = -256 offset 256; +} + +static inline bool +offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset) +{ + return (offset = 0 + offset 4096 * GET_MODE_SIZE (mode) + offset % GET_MODE_SIZE (mode) == 0); +} + /* Return true if X is a valid address for machine mode MODE. If it is, fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in effect. OUTER_CODE is PARALLEL for a load/store pair. */ @@ -2650,7 +2681,8 @@ aarch64_classify_address (struct aarch64 { enum rtx_code code = GET_CODE (x); rtx op0, op1; - bool pair_p = (outer_code == PARALLEL || mode == TImode); + bool allow_reg_index_p = +outer_code != PARALLEL GET_MODE_SIZE(mode) != 16; /* Don't support anything other than POST_INC or REG addressing for AdvSIMD. */ @@ -2679,25 +2711,27 @@ aarch64_classify_address (struct aarch64 info-type = ADDRESS_REG_IMM; info-base = op0; info-offset = op1; + + /* TImode and TFmode values are allowed in both pairs of X + registers and individual Q registers. The available + address modes are: + X,X: 7-bit signed scaled offset + Q: 9-bit signed offset + We conservatively require an offset representable in either mode. + */ + if (mode == TImode || mode == TFmode) + return (offset_7bit_signed_scaled_p (mode, offset) + offset_9bit_signed_unscaled_p (mode, offset)); + if (outer_code == PARALLEL) - /* load/store pair: 7-bit signed, scaled offset. */ return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8) - offset = -64 * GET_MODE_SIZE (mode) - offset 64 * GET_MODE_SIZE (mode) - offset % GET_MODE_SIZE (mode) == 0); - else if (mode == TImode) - /* load/store pair of dwords: 7-bit signed, scaled offset. */ - return offset = -512 offset 512 offset % 8 == 0; + offset_7bit_signed_scaled_p (mode, offset)); else - /* load/store single: 9-bit signed, unscaled offset. */ - /* load/store single: 12-bit unsigned, scaled offset. */ - return ((offset = -256 offset 256) - || (offset = 0 - offset 4096 * GET_MODE_SIZE (mode) - offset % GET_MODE_SIZE (mode) == 0)); + return (offset_9bit_signed_unscaled_p (mode, offset) + || offset_12bit_unsigned_scaled_p (mode, offset)); } - if (!pair_p) + if (allow_reg_index_p) { /* Look for base + (scaled/extended) index register. */ if (aarch64_base_register_rtx_p (op0, strict_p) @@ -2737,18 +2771,23 @@ aarch64_classify_address (struct aarch64 HOST_WIDE_INT offset; info-offset = XEXP (XEXP (x, 1), 1); offset = INTVAL (info-offset); + + /* TImode and TFmode values are allowed in both pairs of X + registers and individual Q registers. The available + address modes
[Ada] fix use of PICFLAG for Ada library variants
Hello, We used to have in gcc-interface/Makefile.in: GNATLIBCFLAGS_FOR_C = -W -Wall $(GNATLIBCFLAGS) $(TARGET_LIBGCC2_CFLAGS) ... As part of the libgcc move to toplevel, http://gcc.gnu.org/ml/gcc-patches/2011-08/msg00917.html introduced PIC_FLAG_FOR_TARGET and replaced all the uses of TARGET_LIBGCC2_CFLAGS by that, on the assumption that it was essentially used for gnatlib-shared related targets. That was true for most of the uses indeed, in particular those directly part of the gnatlib-shared targets which now have: gnatlib-shared-default: $(MAKE) $(FLAGS_TO_PASS) \ GNATLIBFLAGS=$(GNATLIBFLAGS) \ ==GNATLIBCFLAGS=$(GNATLIBCFLAGS) $(PICFLAG_FOR_TARGET) \ GNATLIBCFLAGS_FOR_C=$(GNATLIBCFLAGS_FOR_C) \ gnatlib However, we now alo have: GNATLIBCFLAGS_FOR_C = -W -Wall $(GNATLIBCFLAGS) $(PICFLAG_FOR_TARGET) ^ which is used for all the gnatlib related targets in this Makefile, so we end up passing -fpic or alike for the compilation of C files aimed at non shared libs as well. This causes kernel-mode library build failures with --diable-libada on VxWorks because the compiler stops in presence of -fPIC without -mrtp. The extraneaous PICFLAG is visible in libada/Makefile.in as well, with similar consequences in the libada enabled case. The attached patch addresses this with two simple steps: remove PICFLAG_FOR_TARGET from the top definition of GNATLIBCFLAGS_FOR_C, then add it explicitly to the options passed by the gnatlib-shared targets only. Like GNATLIBCFLAGS_FOR_C = -W -Wall $(GNATLIBCFLAGS) gnatlib-shared-default: $(MAKE) $(FLAGS_TO_PASS) \ GNATLIBFLAGS=$(GNATLIBFLAGS) \ ==GNATLIBCFLAGS=$(GNATLIBCFLAGS) $(PICFLAG_FOR_TARGET) \ ==GNATLIBCFLAGS_FOR_C=$(GNATLIBCFLAGS_FOR_C) $(PICFLAG_FOR_TARGET) \ gnatlib This lets the simple VxWorks builds proceed to completion, and bootstrap+regtest fine x86_64-linux. Committing to mainline. Olivier 2012-06-10 Olivier Hainque hain...@adacore.com * Makefile.in (GNATLIBCFLAGS_FOR_C): Remove $(PIC_FLAG_FOR_TARGET). (gnatlib-shared-default): Add $(PIC_FLAG_FOR_TARGET) to GNATLIBCFLAGS_FOR_C passed to gnatlib. (gnatlib-shared-win32): Likewise. (gnatlib-shared-darwin): Likewise. (gnatlib-shared-dual-win32): Pass PIC_FLAG_FOR_TARGET to gnatlib-shared-win32. libada/ * Makefile.in (GNATLIBCFLAGS_FOR_C): Remove $(PICFLAG). ada-picflag.diff Description: Binary data
Re: [PATCH] Add vector cost model density heuristic
On Mon, Jun 11, 2012 at 10:48 AM, Richard Guenther rguent...@suse.de wrote: On Mon, 11 Jun 2012, David Edelsohn wrote: On Mon, Jun 11, 2012 at 7:40 AM, Richard Guenther rguent...@suse.de wrote: Hmm. I don't like this patch or its general idea too much. Instead I'd like us to move more of the cost model detail to the target, giving it a chance to look at the whole loop before deciding on a cost. ISTR posting the overall idea at some point, but let me repeat it here instead of trying to find that e-mail. Richard, Allowing the target to see the entire loop and have more control over the cost model is a fine goal, but the current code is using incorrect heuristics. Requiring a complete re-write of the auto-vectorizer cost model should not be a requirement for fixing the current code. If fixing the current code adds more magic hooks at the wrong spot then yes, in stage1 I can make you think about a more proper solution to the issue. First, these are not magic hooks. Second, I suggest that you need to rephrase I can make you and re-send your reply. Thanks, David
Re: [RFC] Target-specific limits on vector alignment
On Mon, Jun 11, 2012 at 4:38 PM, Richard Earnshaw rearn...@arm.com wrote: On 11/06/12 15:17, Richard Guenther wrote: On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote: The ARM ABI states that vectors larger than 64 bits in size still have 64-bit alignment; never-the-less, the HW supports alignment hints of up to 128-bits in some cases and will trap in a vector has an alignment that less than the hint. GCC currently hard-codes larger vectors to be aligned by the size of the vector, which means that 128-bit vectors are marked as being 128-bit aligned. The ARM ABI unfortunately does not support generating such alignment for parameters passed by value and this can lead to traps at run time. It seems that the best way to solve this problem is to allow the back-end to set an upper limit on the alignment permitted for a vector. I've implemented this as a separate hook, rather than using the existing hooks because there's a strong likelihood of breaking some existing ABIs if I did it another way. There are a couple of tests that will need some re-working before this can be committed to deal with the fall-out of making this change; I'll prepare those changes if this patch is deemed generally acceptable. Hm. Where would you use that target hook? Doh! It was supposed to be in the patch set... :-( in layout_type(), where the alignment of a vector is forced to the size of the vector. Ok. + /* Naturally align vectors, but let the target set an upper + limit. This prevents ABI changes depending on whether or + not native vector modes are supported. */ + TYPE_ALIGN (type) + = targetm.vector_alignment (type, tree_low_cst (TYPE_SIZE (type), 0)); The type argument or the size argument looks redundant. Note that we still can have such vector properly aligned, thus the vectorizer would need to use build_aligned_type also if it knows the type is aligned, not only when it thinks it is misaligned. You basically change the alignment of the default vector type. Richard. R.
[PATCH, GCC][AArch64] Remove support for __float128
Hi, This patch removes __float128 support in the AArch64 port. Thanks Sofiane - gcc/ChangeLog: 2012-06-11 Sofiane Naci sofiane.n...@arm.com [AArch64] Remove __float128 support. * config/aarch64/aarch64.c (aarch64_mangle_type): Remove function. (aarch64_init_builtins): Remove __float128 support. (aarch64_expand_builtin): Likewise. (TARGET_MANGLE_TYPE): Remove definition. * config/aarch64/aarch64.h (enum aarch64_builtins): Update. * doc/extend.texi: Remove __float128 documentation. gcc/testsuite/ChangeLog: 2012-06-11 Sofiane Naci sofiane.n...@arm.com [AArch64] Remove __float128 support. * gcc.dg/const-float128-ped.c: Remove AArch64 from list of targets. * gcc.dg/const-float128.c: Likewise. * gcc.dg/torture/fp-int-convert-float128-timode.c: Likewise. * gcc.dg/torture/fp-int-convert-float128.c: Likewise. aarch64-remove-float128.patch Description: Binary data
Re: [PATCH] Add vector cost model density heuristic
On Mon, 11 Jun 2012, David Edelsohn wrote: On Mon, Jun 11, 2012 at 10:48 AM, Richard Guenther rguent...@suse.de wrote: On Mon, 11 Jun 2012, David Edelsohn wrote: On Mon, Jun 11, 2012 at 7:40 AM, Richard Guenther rguent...@suse.de wrote: Hmm. I don't like this patch or its general idea too much. Instead I'd like us to move more of the cost model detail to the target, giving it a chance to look at the whole loop before deciding on a cost. ISTR posting the overall idea at some point, but let me repeat it here instead of trying to find that e-mail. Richard, Allowing the target to see the entire loop and have more control over the cost model is a fine goal, but the current code is using incorrect heuristics. Requiring a complete re-write of the auto-vectorizer cost model should not be a requirement for fixing the current code. If fixing the current code adds more magic hooks at the wrong spot then yes, in stage1 I can make you think about a more proper solution to the issue. First, these are not magic hooks. Well, they are at least magic numbers and heuristics that apply generally and not only to the single issue in sphinx. And in fact how it works for sphinx _is_ magic. Second, I suggest that you need to rephrase I can make you and re-send your reply. Sorry for my bad english. Consider it meaning that I'd rather have you think about a more proper solution. That's what patch review is about after all, no? Sometimes a complete re-write (which gets more difficult which each of the patches enhancing the not ideal current state) is the best thing to do. Richard.
Re: [PATCH] Add vector cost model density heuristic
On Mon, 11 Jun 2012, Richard Guenther wrote: On Mon, 11 Jun 2012, William J. Schmidt wrote: On Mon, 2012-06-11 at 13:40 +0200, Richard Guenther wrote: On Fri, 8 Jun 2012, William J. Schmidt wrote: This patch adds a heuristic to the vectorizer when estimating the minimum profitable number of iterations. The heuristic is target-dependent, and is currently disabled for all targets except PowerPC. However, the intent is to make it general enough to be useful for other targets that want to opt in. A previous patch addressed some PowerPC SPEC degradations by modifying the vector cost model values for vec_perm and vec_promote_demote. The values were set a little higher than their natural values because the natural values were not sufficient to prevent a poor vectorization choice. However, this is not the right long-term solution, since it can unnecessarily constrain other vectorization choices involving permute instructions. Analysis of the badly vectorized loop (in sphinx3) showed that the problem was overcommitment of vector resources -- too many vector instructions issued without enough non-vector instructions available to cover the delays. The vector cost model assumes that instructions always have a constant cost, and doesn't have a way of judging this kind of density of vector instructions. The present patch adds a heuristic to recognize when a loop is likely to overcommit resources, and adds a small penalty to the inside-loop cost to account for the expected stalls. The heuristic is parameterized with three target-specific values: * Density threshold: The heuristic will apply only when the percentage of inside-loop cost attributable to vectorized instructions exceeds this value. * Size threshold: The heuristic will apply only when the inside-loop cost exceeds this value. * Penalty: The inside-loop cost will be increased by this percentage value when the heuristic applies. Thus only reasonably large loop bodies that are mostly vectorized instructions will be affected. By applying only a small percentage bump to the inside-loop cost, the heuristic will only turn off vectorization for loops that were considered barely profitable to begin with (such as the sphinx3 loop). So the heuristic is quite conservative and should not affect the vast majority of vectorization decisions. Together with the new heuristic, this patch reduces the vec_perm and vec_promote_demote costs for PowerPC to their natural values. I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu and verified that no performance regressions occur on SPEC cpu2006. Is this ok for trunk? Hmm. I don't like this patch or its general idea too much. Instead I'd like us to move more of the cost model detail to the target, giving it a chance to look at the whole loop before deciding on a cost. ISTR posting the overall idea at some point, but let me repeat it here instead of trying to find that e-mail. The basic interface of the cost model should be, in targetm.vectorize /* Tell the target to start cost analysis of a loop or a basic-block (if the loop argument is NULL). Returns an opaque pointer to target-private data. */ void *init_cost (struct loop *loop); /* Add cost for N vectorized-stmt-kind statements in vector_mode. */ void add_stmt_cost (void *data, unsigned n, vectorized-stmt-kind, enum machine_mode vector_mode); /* Tell the target to compute and return the cost of the accumulated statements and free any target-private data. */ unsigned finish_cost (void *data); with eventually slightly different signatures for add_stmt_cost (like pass in the original scalar stmt?). It allows the target, at finish_cost time, to evaluate things like register pressure and resource utilization. OK, I'm trying to understand how you would want this built into the present structure. Taking just the loop case for now: Judging by your suggested API, we would have to call add_stmt_cost () everywhere that we now call stmt_vinfo_set_inside_of_loop_cost (). For now this would be an additional call, not a replacement, though maybe the other goes away eventually. This allows the target to save more data about the vectorized instructions than just an accumulated cost number (order and quantity of various kinds of instructions can be maintained for better modeling). Presumably the call to finish_cost would be done within vect_estimate_min_profitable_iters () to produce the final value of inside_cost for the loop. Yes. I didn't look in detail at the difference between inner/outer costs though. Maybe that complicates
Re: [PATCH] Correct cost model for strided loads
On Mon, 2012-06-11 at 16:10 +0200, Richard Guenther wrote: On Mon, 11 Jun 2012, William J. Schmidt wrote: On Mon, 2012-06-11 at 11:15 +0200, Richard Guenther wrote: On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: The fix for PR53331 caused a degradation to 187.facerec on powerpc64-unknown-linux-gnu. The following simple patch reverses the degradation without otherwise affecting SPEC cpu2000 or cpu2006. Bootstrapped and regtested on that platform with no new regressions. Ok for trunk? Well, would the real cost not be subparts * scalar_to_vec plus subparts * vec_perm? At least vec_perm isn't the cost for building up a vector from N scalar elements either (it might be close enough if subparts == 2). What's the case with facerec here? Does it have subparts == 2? In this case, subparts == 4 (32-bit floats, 128-bit vec reg). On PowerPC, this requires two merge instructions and a permute instruction to get the four 32-bit quantities into the right place in a 128-bit register. Currently this is modeled as a vec_perm in other parts of the vectorizer per Ira's earlier patches, so I naively changed this to do the same thing. I see. The types of vectorizer instructions aren't documented, and I can't infer much from the i386.c cost model, so I need a little education. What semantics are represented by scalar_to_vec? It's a vector splat, thus x - { x, x, x, ... }. You can create { x, y, z, ... } by N such splats plus N - 1 permutes (if a permute, as VEC_PERM_EXPR, takes two input vectors). That's by far not the most efficient way to build up such a vector of course (with AVX you could do one splat plus N - 1 inserts for example). The cost is of course dependent on the number of vector elements, so a simple new enum vect_cost_for_stmt kind does not cover it but the target would have to look at the vector type passed and do some reasonable guess. Ah, splat! Yes, that's lingo I understand. I see the intent now. On PowerPC, we have this mapping of the floating-point registers and vector float registers where they overlap (the low-order half of each of the first 32 vector float regs corresponds to a scalar float reg). So in this case we have four scalar loads that place things in the bottom half of four vector registers, two vector merge instructions that collapse the four registers into two vector registers, and a vector permute that gets things in the right order.(*) I wonder if what we refer to as a merge instruction is similar to scalar_to_vec. Looks similar to x86 SSE then. If so, then maybe we need something like subparts = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); inside_cost += vect_get_stmt_cost (scalar_load) * ncopies * subparts; inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec) * subparts / 2; inside_cost += ncopies * vect_get_stmt_cost (vec_perm); But then we'd have to change how vec_perm is modeled elsewhere for PowerPC based on Ira's earlier patches. As I said, it's difficult for me to figure out all the intent of cost model decisions that have been made in the past, using current documentation. Heh, usually the intent was to make the changes simple, not to compute a proper cost. I think we simply need a new scalars_to_vec cost kind. That works. Maybe vec_construct gets the point across a little better? I think we need to use the full builtin_vectorization_cost interface instead of vect_get_stmt_cost here, so the targets can parameterize on type. Then we can just do one cost calculation for vec_construct that covers the full costs of getting the vector in order after the loads. I really wanted to pessimize this case for say AVX and char elements, thus building up a vector from 32 scalars which certainly does not cost a mere vec_perm. So, maybe special-case the subparts == 2 case and assume vec_perm would match the cost only in that case. (I'm a little confused by this as what you have at the moment is a single scalar_to_vec per copy, which has a cost of 1 on most i386 targets (occasionally 2). The subparts multiplier is only applied to the loads. So changing this to vec_perm seemed to be a no-op for i386.) Oh, I somehow read the patch as you were removing the multiplication by TYPE_VECTOR_SUBPARTS. But yes, the cost is way off and I'd wanted to reflect it with N scalar loads plus N splats plus N - 1 permutes originally. You could also model it with N scalar loads plus N inserts (but we don't have a vec_insert cost either). I think adding a scalars_to_vec or vec_init or however we want to call it - basically what the cost of a vector CONSTRUCTOR would be - is best. (*) There are actually a couple more instructions here to convert 64-bit values to 32-bit values, since on PowerPC 32-bit loads are
Re: [PATCH] Add vector cost model density heuristic
On Mon, Jun 11, 2012 at 10:55 AM, Richard Guenther rguent...@suse.de wrote: Well, they are at least magic numbers and heuristics that apply generally and not only to the single issue in sphinx. And in fact how it works for sphinx _is_ magic. Second, I suggest that you need to rephrase I can make you and re-send your reply. Sorry for my bad english. Consider it meaning that I'd rather have you think about a more proper solution. That's what patch review is about after all, no? Sometimes a complete re-write (which gets more difficult which each of the patches enhancing the not ideal current state) is the best thing to do. Richard, The values of the heuristics may be magic, but Bill believes the heuristics are testing the important characteristics. The heuristics themselves are controlled by hooks, so the target can set the correct values for their own requirements. The concern is that a general cost infrastructure is too general. And, based on history, all ports simply will copy the boilerplate from the first implementation. It also may cause more problems because the target has relatively little information to be able to judge heuristics at that point in the middle-end. If the targets start to get too cute or too complicated, it may cause more problems or more confusion about why more complicated heuristics are not effective and not producing the expected results. I worry about creating another machine dependent reorg catch-all pass. Maybe an incremental pre- and/or post- cost hook would be more effective. I will let Bill comment. Thanks, David
Re: [PATCH] Add vector cost model density heuristic
On Mon, 2012-06-11 at 16:58 +0200, Richard Guenther wrote: On Mon, 11 Jun 2012, Richard Guenther wrote: On Mon, 11 Jun 2012, William J. Schmidt wrote: On Mon, 2012-06-11 at 13:40 +0200, Richard Guenther wrote: On Fri, 8 Jun 2012, William J. Schmidt wrote: This patch adds a heuristic to the vectorizer when estimating the minimum profitable number of iterations. The heuristic is target-dependent, and is currently disabled for all targets except PowerPC. However, the intent is to make it general enough to be useful for other targets that want to opt in. A previous patch addressed some PowerPC SPEC degradations by modifying the vector cost model values for vec_perm and vec_promote_demote. The values were set a little higher than their natural values because the natural values were not sufficient to prevent a poor vectorization choice. However, this is not the right long-term solution, since it can unnecessarily constrain other vectorization choices involving permute instructions. Analysis of the badly vectorized loop (in sphinx3) showed that the problem was overcommitment of vector resources -- too many vector instructions issued without enough non-vector instructions available to cover the delays. The vector cost model assumes that instructions always have a constant cost, and doesn't have a way of judging this kind of density of vector instructions. The present patch adds a heuristic to recognize when a loop is likely to overcommit resources, and adds a small penalty to the inside-loop cost to account for the expected stalls. The heuristic is parameterized with three target-specific values: * Density threshold: The heuristic will apply only when the percentage of inside-loop cost attributable to vectorized instructions exceeds this value. * Size threshold: The heuristic will apply only when the inside-loop cost exceeds this value. * Penalty: The inside-loop cost will be increased by this percentage value when the heuristic applies. Thus only reasonably large loop bodies that are mostly vectorized instructions will be affected. By applying only a small percentage bump to the inside-loop cost, the heuristic will only turn off vectorization for loops that were considered barely profitable to begin with (such as the sphinx3 loop). So the heuristic is quite conservative and should not affect the vast majority of vectorization decisions. Together with the new heuristic, this patch reduces the vec_perm and vec_promote_demote costs for PowerPC to their natural values. I've regstrapped this with no regressions on powerpc64-unknown-linux-gnu and verified that no performance regressions occur on SPEC cpu2006. Is this ok for trunk? Hmm. I don't like this patch or its general idea too much. Instead I'd like us to move more of the cost model detail to the target, giving it a chance to look at the whole loop before deciding on a cost. ISTR posting the overall idea at some point, but let me repeat it here instead of trying to find that e-mail. The basic interface of the cost model should be, in targetm.vectorize /* Tell the target to start cost analysis of a loop or a basic-block (if the loop argument is NULL). Returns an opaque pointer to target-private data. */ void *init_cost (struct loop *loop); /* Add cost for N vectorized-stmt-kind statements in vector_mode. */ void add_stmt_cost (void *data, unsigned n, vectorized-stmt-kind, enum machine_mode vector_mode); /* Tell the target to compute and return the cost of the accumulated statements and free any target-private data. */ unsigned finish_cost (void *data); with eventually slightly different signatures for add_stmt_cost (like pass in the original scalar stmt?). It allows the target, at finish_cost time, to evaluate things like register pressure and resource utilization. OK, I'm trying to understand how you would want this built into the present structure. Taking just the loop case for now: Judging by your suggested API, we would have to call add_stmt_cost () everywhere that we now call stmt_vinfo_set_inside_of_loop_cost (). For now this would be an additional call, not a replacement, though maybe the other goes away eventually. This allows the target to save more data about the vectorized instructions than just an accumulated cost number (order and quantity of various kinds of instructions can be maintained for better modeling). Presumably the call to finish_cost would be
Re: [RFC] Target-specific limits on vector alignment
On 11/06/12 15:53, Richard Guenther wrote: On Mon, Jun 11, 2012 at 4:38 PM, Richard Earnshaw rearn...@arm.com wrote: On 11/06/12 15:17, Richard Guenther wrote: On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote: The ARM ABI states that vectors larger than 64 bits in size still have 64-bit alignment; never-the-less, the HW supports alignment hints of up to 128-bits in some cases and will trap in a vector has an alignment that less than the hint. GCC currently hard-codes larger vectors to be aligned by the size of the vector, which means that 128-bit vectors are marked as being 128-bit aligned. The ARM ABI unfortunately does not support generating such alignment for parameters passed by value and this can lead to traps at run time. It seems that the best way to solve this problem is to allow the back-end to set an upper limit on the alignment permitted for a vector. I've implemented this as a separate hook, rather than using the existing hooks because there's a strong likelihood of breaking some existing ABIs if I did it another way. There are a couple of tests that will need some re-working before this can be committed to deal with the fall-out of making this change; I'll prepare those changes if this patch is deemed generally acceptable. Hm. Where would you use that target hook? Doh! It was supposed to be in the patch set... :-( in layout_type(), where the alignment of a vector is forced to the size of the vector. Ok. + /* Naturally align vectors, but let the target set an upper +limit. This prevents ABI changes depending on whether or +not native vector modes are supported. */ + TYPE_ALIGN (type) + = targetm.vector_alignment (type, tree_low_cst (TYPE_SIZE (type), 0)); The type argument or the size argument looks redundant. Technically, yes, we could get rid of tree_low_cst (TYPE_SIZE (type) and calculate it inside the alignment function if it was needed. However, it seemed likely that most targets would need that number one way or another, such that passing it would be helpful. Note that we still can have such vector properly aligned, thus the vectorizer would need to use build_aligned_type also if it knows the type is aligned, not only when it thinks it is misaligned. You basically change the alignment of the default vector type. I'm not sure I follow... R.
Re: [PATCH] Add vector cost model density heuristic
On Mon, 2012-06-11 at 11:09 -0400, David Edelsohn wrote: On Mon, Jun 11, 2012 at 10:55 AM, Richard Guenther rguent...@suse.de wrote: Well, they are at least magic numbers and heuristics that apply generally and not only to the single issue in sphinx. And in fact how it works for sphinx _is_ magic. Second, I suggest that you need to rephrase I can make you and re-send your reply. Sorry for my bad english. Consider it meaning that I'd rather have you think about a more proper solution. That's what patch review is about after all, no? Sometimes a complete re-write (which gets more difficult which each of the patches enhancing the not ideal current state) is the best thing to do. Richard, The values of the heuristics may be magic, but Bill believes the heuristics are testing the important characteristics. The heuristics themselves are controlled by hooks, so the target can set the correct values for their own requirements. The concern is that a general cost infrastructure is too general. And, based on history, all ports simply will copy the boilerplate from the first implementation. It also may cause more problems because the target has relatively little information to be able to judge heuristics at that point in the middle-end. If the targets start to get too cute or too complicated, it may cause more problems or more confusion about why more complicated heuristics are not effective and not producing the expected results. I worry about creating another machine dependent reorg catch-all pass. Maybe an incremental pre- and/or post- cost hook would be more effective. I will let Bill comment. Thanks David, I can see both sides of this, and it's hard to judge the future from where I stand. My belief is that the number of heuristics targets will implement will be fairly limited, since judgments about cycle-level costs are not accurately predictable during the middle end. All we can do is come up with a few things that seem to make sense. Doing too much in the back end seems impractical. The interesting question to me is whether cost model heuristics are general enough to be reusable. What I saw in this case was what I considered to be a somewhat target-neutral problem: overwhelming those assets of the processor that implement vectorization. It seemed reasonable to provide hooks for others to use the idea if they encounter similar issues. If reusing the heuristic is useful, then having to copy the logic from one target to another isn't the best approach. If nobody else will ever use it, then embedding it in the back end is reasonable. Unfortunately my crystal ball has been on the fritz for several decades, so I can't tell you for sure which is right... Richard, my biggest question is whether you think other targets are likely to take advantage of a more general back-end interface, or whether this will end up just being a PowerPC wart. If you know of ways this will be useful for i386, that would be helpful to know. Perhaps this requires your crystal ball as well; not sure how well yours works... If we look at just this one issue in isolation, then changing all the code in the vectorizer that calculates inside/outside loop costs and moving it to targetm seems more invasive than adding the few hooks. But if this will really be a useful feature for the community as a whole I am certainly willing to tackle it. Thanks, Bill Thanks, David
Re: [C++] Reject variably modified types in operator new
On 06/01/2012 02:09 PM, Florian Weimer wrote: On 06/01/2012 11:00 AM, Florian Weimer wrote: I'll try to warn about this case and make the transformation to the proper operator new[] call. Here's the version. I've added a warning for the ill-formed code. The only remaining glitch is in g++.dg/cpp0x/regress/debug-debug7.C, specifically (b is not a constant): int (*x)[b] = new int[a][b]; // { dg-error not usable } The new warning I've added fires on this line, too. How can I check for the pending error and suppress the warning? As discussed in the other thread with Jason and Gaby, I have changed cp/parser.c to accept non-constant values and produce the error in build_new_1. This is arguably the right place because this check must be performed after template instantiation. (I will submit a follow-up patch for the remaining type check in the parser once this is accepted.) testsuite/g++.dg/cpp0x/regress/debug-debug7.C remains problematic. The diagnostic at parse time was definitely better, but I see no way to keep that. I have reworded the diagnostics because the code no longer has access to the original syntax, so it cannot tell whether the operator new call involved a type-id in parens. Bootstrapped and tested on x86_64-linux-gnu, with no new regressions. -- Florian Weimer / Red Hat Product Security Team 2012-06-11 Florian Weimer fwei...@redhat.com * init.c (build_new_1): Warn about (T[N]) for variable N, and reject T[M][N]. * parser.c (cp_parser_direct_new_declarator): Accept non-constant expressions. Handled now in build_new_1. 2012-06-11 Florian Weimer fwei...@redhat.com * g++.dg/init/new35.C: New. * g++.dg/init/new36.C: New. * g++.dg/ext/vla5.C: New warning. * g++.dg/ext/vla8.C: New warning. * g++.dg/cpp0x/regress/debug-debug7.C: Update diagnostics. Index: cp/init.c === --- cp/init.c (revision 188384) +++ cp/init.c (working copy) @@ -2175,6 +2175,7 @@ tree pointer_type; tree non_const_pointer_type; tree outer_nelts = NULL_TREE; + bool outer_nelts_from_type = false; tree alloc_call, alloc_expr; /* The address returned by the call to operator new. This node is a VAR_DECL and is therefore reusable. */ @@ -2209,10 +2210,14 @@ } else if (TREE_CODE (type) == ARRAY_TYPE) { + /* Transforms new (T[N]) to new T[N]. The former is a GNU + extension. (This also covers new T where T is a VLA + typedef.) */ array_p = true; nelts = array_type_nelts_top (type); outer_nelts = nelts; type = TREE_TYPE (type); + outer_nelts_from_type = true; } /* If our base type is an array, then make sure we know how many elements @@ -2220,11 +2225,40 @@ for (elt_type = type; TREE_CODE (elt_type) == ARRAY_TYPE; elt_type = TREE_TYPE (elt_type)) -nelts = cp_build_binary_op (input_location, -MULT_EXPR, nelts, -array_type_nelts_top (elt_type), -complain); +{ + tree inner_nelts = array_type_nelts_top (elt_type); + tree inner_nelts_cst = maybe_constant_value (inner_nelts); + if (!TREE_CONSTANT (inner_nelts_cst)) + { + if (complain tf_error) + error_at (EXPR_LOC_OR_HERE (inner_nelts), + array size in operator new must be constant); + nelts = error_mark_node; + } + if (nelts != error_mark_node) + nelts = cp_build_binary_op (input_location, +MULT_EXPR, nelts, +inner_nelts_cst, +complain); +} + if (variably_modified_type_p (elt_type, NULL_TREE) (complain tf_error)) +{ + error (variably modified type not allowed in operator new); + return error_mark_node; +} + + if (nelts == error_mark_node) +return error_mark_node; + + /* Warn if we performed the (T[N]) to T[N] transformation and N is + variable. */ + if (outer_nelts_from_type + !TREE_CONSTANT (maybe_constant_value (outer_nelts)) + (complain tf_warning_or_error)) +pedwarn(EXPR_LOC_OR_HERE (outer_nelts), OPT_Wvla, + ISO C++ does not support variable-length array types); + if (TREE_CODE (elt_type) == VOID_TYPE) { if (complain tf_error) Index: cp/parser.c === --- cp/parser.c (revision 188384) +++ cp/parser.c (working copy) @@ -6845,41 +6845,34 @@ while (true) { tree expression; + cp_token *token; /* Look for the opening `['. */ cp_parser_require (parser, CPP_OPEN_SQUARE, RT_OPEN_SQUARE); - /* The first expression is not required to be constant. */ - if (!declarator) + + token = cp_lexer_peek_token (parser-lexer); + expression = cp_parser_expression (parser, /*cast_p=*/false, NULL); + /* The standard requires that the expression have integral + type. DR 74 adds enumeration types. We believe that the + real intent is that these expressions be handled like the + expression in a `switch' condition, which also
Re: [PATCH] don't presume undelegitimized UNSPEC_TLS SYMBOL_REF is a decl
ping? On Wed, Jun 6, 2012 at 2:04 PM, Roland McGrath mcgra...@google.com wrote: cf this change: 2010-11-19 Jakub Jelinek ja...@redhat.com PR target/45870 * dwarf2out.c (const_ok_for_output_1): Don't complain about non-delegitimized TLS UNSPECs. This case hit me where the rtx was: (unspec:SI [ (symbol_ref:SI (*.LANCHOR0) [flags 0x1aa]) (const_int 4 [0x4]) ] UNSPEC_TLS) Note: 1. The UNSPEC has two operands, not one. 2. The SYMBOL_REF does not correspond to any decl. This corresponds to this ARM code: ldr r3, .L10+4 ... .L10: something else .word .LANCHOR0(tpoff) ... .section .tdata,awT,%progbits .align 4 .LANCHOR0 = . + 0 .type tdata1, %object .size tdata1, 4 tdata1: .word 1 The only way I know to reproduce this is using a variant ARM target that I'm still developing and is not yet ready to be submitted, so I don't have a proper test case to offer. But I think the principle of the following change is fairly sound. What do you think? (Recall that I am not a GCC committer, so if you like the change, please commit it for me.) Thanks, Roland 2012-06-06 Roland McGrath mcgra...@google.com * dwarf2out.c (const_ok_for_output_1): Detect a TLS UNSPEC using SYMBOL_REF_TLS_MODEL rather than DECL_THREAD_LOCAL_P, in case it's not a VAR_DECL. Also don't limit it to UNSPECs with exactly one operand. diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c index 6e4ab76..bc68205 100644 --- a/gcc/dwarf2out.c +++ b/gcc/dwarf2out.c @@ -10129,12 +10129,12 @@ const_ok_for_output_1 (rtx *rtlp, void *data ATTRIBUTE_UNUSED) we can't express it in the debug info. */ #ifdef ENABLE_CHECKING /* Don't complain about TLS UNSPECs, those are just too hard to - delegitimize. */ - if (XVECLEN (rtl, 0) != 1 + delegitimize. Note this could be a non-decl SYMBOL_REF such as + one in a constant pool entry, so testing SYMBOL_REF_TLS_MODEL + rather than DECL_THREAD_LOCAL_P is not just an optimization. */ + if (XVECLEN (rtl, 0) == 0 || GET_CODE (XVECEXP (rtl, 0, 0)) != SYMBOL_REF - || SYMBOL_REF_DECL (XVECEXP (rtl, 0, 0)) == NULL - || TREE_CODE (SYMBOL_REF_DECL (XVECEXP (rtl, 0, 0))) != VAR_DECL - || !DECL_THREAD_LOCAL_P (SYMBOL_REF_DECL (XVECEXP (rtl, 0, 0 + || SYMBOL_REF_TLS_MODEL (XVECEXP (rtl, 0, 0)) == TLS_MODEL_NONE) inform (current_function_decl ? DECL_SOURCE_LOCATION (current_function_decl) : UNKNOWN_LOCATION,
Re: RFA: Alternative iterator implementation
Richard Sandiford wrote: As discussed in the context of the AARCH64 submission, this patch rewrites the iterator handling in read-rtl.c so that we record iterator positions using an on-the-side VEC rather than placeholder modes and codes. We then substitute in-place for each sequence of iterator values and take a deep copy of the result. We do any string substitutions during the copy. The patch also generalises the current use of attributes for rtx modes ((zero_extend:WIDER_MODE ...), etc.) so that the same kind of thing can be done with future iterator types, including the int iterators. Not sure whether that'll be useful or not, but it made the patch easier to write. Tested by making sure that the insn-*.c output didn't change for x86_64-linux-gnu, except that we now do a better job of retaining #line information (not seen as a good thing by everbody, I realise). Also made sure that things like insn-output.c are still generated in the blink of an eye. Bootstrapped regression-tested on x86_64-linux-gnu and i686-pc-linux-gnu. OK to install? Richard gcc/ * read-rtl.c (mapping): Remove index field. Add current_value field. Define heap vectors. (iterator_group): Fix long line. Remove num_builtins field and uses_iterator fields. Make apply_iterator take a void * parameter. (iterator_use, atttribute_use): New structures. (iterator_traverse_data, BELLWETHER_CODE, bellwether_codes): Delete. (current_iterators, iterator_uses, attribute_uses): New variables. (uses_mode_iterator_p, uses_code_iterator_p): Delete. (apply_mode_iterator, apply_code_iterator): Take a void * parameter. (map_attr_string, apply_iterator_to_string): Remove iterator and value parameters. Look through all current iterator values for a matching attribute. (mode_attr_index, apply_mode_maps): Delete. (apply_iterator_to_rtx): Replace with... (copy_rtx_for_iterators): ...this new function. (uses_iterator_p, apply_iterator_traverse): Delete. (apply_attribute_uses, add_current_iterators, apply_iterators): New functions. (add_mapping): Remove index field. Set current_value field. (initialize_iterators): Don't set num_builtins and uses_iterator_p fields. (find_iterator): Delete. (record_iterator_use, record_attribute_use): New functions. (record_potential_iterator_use): New function. (check_code_iterator): Remove handling of bellwether codes. (read_rtx): Remove mode maps. Truncate iterator and attribute uses. (read_rtx_code, read_nested_rtx, read_rtx_variadic): Remove mode_maps parameter. Use the first code iterator value instead of the bellwether_codes array. Use record_potential_iterator_use for modes. Index: gcc/read-rtl.c === --- gcc/read-rtl.c 2012-06-03 08:58:32.251211521 +0100 +++ gcc/read-rtl.c 2012-06-03 09:20:47.633208254 +0100 @@ -41,7 +41,7 @@ struct map_value { }; /* Maps an iterator or attribute name to a list of (integer, string) pairs. - The integers are mode or code values; the strings are either C conditions + The integers are iterator values; the strings are either C conditions or attribute values. */ struct mapping { /* The name of the iterator or attribute. */ @@ -50,82 +50,80 @@ struct mapping { /* The group (modes or codes) to which the iterator or attribute belongs. */ struct iterator_group *group; - /* Gives a unique number to the attribute or iterator. Numbers are - allocated consecutively, starting at 0. */ - int index; - /* The list of (integer, string) pairs. */ struct map_value *values; + + /* For iterators, records the current value of the iterator. */ + struct map_value *current_value; }; -/* A structure for abstracting the common parts of code and mode iterators. */ +/* Vector definitions for the above. */ +typedef struct mapping *mapping_ptr; +DEF_VEC_P (mapping_ptr); +DEF_VEC_ALLOC_P (mapping_ptr, heap); + +/* A structure for abstracting the common parts of iterators. */ struct iterator_group { - /* Tables of mapping structures, one for attributes and one for iterators. */ + /* Tables of mapping structures, one for attributes and one for + iterators. */ htab_t attrs, iterators; - /* The number of real modes or codes (and by extension, the first - number available for use as an iterator placeholder). */ - int num_builtins; - - /* Treat the given string as the name of a standard mode or code and + /* Treat the given string as the name of a standard mode, etc., and return its integer value. */ int (*find_builtin) (const char *); - /* Return true if the given rtx uses the given mode or code. */ - bool (*uses_iterator_p) (rtx, int); + /* Make the given pointer use the given iterator value. */ + void
Re: [PATCH] Hoist adjacent pointer loads
On Mon, 2012-06-11 at 14:59 +0200, Richard Guenther wrote: On Mon, 11 Jun 2012, William J. Schmidt wrote: On Mon, 2012-06-11 at 13:28 +0200, Richard Guenther wrote: On Mon, Jun 4, 2012 at 3:45 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: Hi Richard, Here's a revision of the hoist-adjacent-loads patch. I'm sorry for the delay since the last revision, but my performance testing has been blocked waiting for a fix to PR53487. I ended up applying a test version of the patch to 4.7 and ran performance numbers with that instead, with no degradations. In addition to addressing your comments, this patch contains one bug fix where local_mem_dependence was called on the wrong blocks after swapping def1 and def2. Bootstrapped with no regressions on powerpc64-unknown-linux-gnu. Is this version ok for trunk? I won't commit it until I can do final testing on trunk in conjunction with a fix for PR53487. Thanks, Bill 2012-06-04 Bill Schmidt wschm...@linux.vnet.ibm.com * opts.c: Add -fhoist_adjacent_loads to -O2 and above. * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward declaration. (hoist_adjacent_loads, gate_hoist_loads): New forward declarations. (tree_ssa_phiopt): Call gate_hoist_loads. (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call. (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call hoist_adjacent_loads. (local_mem_dependence): New function. (hoist_adjacent_loads): Likewise. (gate_hoist_loads): Likewise. * common.opt (fhoist-adjacent-loads): New switch. * Makefile.in (tree-ssa-phiopt.o): Added dependencies. * params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param. Index: gcc/opts.c === --- gcc/opts.c (revision 187805) +++ gcc/opts.c (working copy) @@ -489,6 +489,7 @@ static const struct default_options default_option { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 }, { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 }, +{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 }, /* -O3 optimizations. */ { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 }, Index: gcc/tree-ssa-phiopt.c === --- gcc/tree-ssa-phiopt.c (revision 187805) +++ gcc/tree-ssa-phiopt.c (working copy) @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3. If not see #include cfgloop.h #include tree-data-ref.h #include tree-pretty-print.h +#include gimple-pretty-print.h +#include insn-config.h +#include expr.h +#include optabs.h +#ifndef HAVE_conditional_move +#define HAVE_conditional_move (0) +#endif + static unsigned int tree_ssa_phiopt (void); -static unsigned int tree_ssa_phiopt_worker (bool); +static unsigned int tree_ssa_phiopt_worker (bool, bool); static bool conditional_replacement (basic_block, basic_block, edge, edge, gimple, tree, tree); static int value_replacement (basic_block, basic_block, @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block); static struct pointer_set_t * get_non_trapping (void); static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree); +static void hoist_adjacent_loads (basic_block, basic_block, + basic_block, basic_block); +static bool gate_hoist_loads (void); /* This pass tries to replaces an if-then-else block with an assignment. We have four kinds of transformations. Some of these @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_ bb2: x = PHI x' (bb0), ...; - A similar transformation is done for MAX_EXPR. */ + A similar transformation is done for MAX_EXPR. + + This pass also performs a fifth transformation of a slightly different + flavor. + + Adjacent Load Hoisting + -- + + This transformation replaces + + bb0: + if (...) goto bb2; else goto bb1; + bb1: + x1 = (expr).field1; + goto bb3; + bb2: + x2 = (expr).field2; + bb3: + # x = PHI x1, x2; + + with + + bb0: + x1 = (expr).field1; + x2 = (expr).field2; + if (...) goto bb2; else goto bb1; + bb1: + goto
Re: [PATCH] Hoist adjacent pointer loads
On Mon, 2012-06-11 at 12:11 -0500, William J. Schmidt wrote: I found this parameter that seems to correspond to well-predicted conditional jumps: /* When branch is predicted to be taken with probability lower than this threshold (in percent), then it is considered well predictable. */ DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME, predictable-branch-outcome, Maximal estimated outcome of branch considered predictable, 2, 0, 50) ...which has an interface predictable_edge_p () in predict.c, so that's what I'll use. Thanks, Bill
Re: [PATCH 1/2] SH epilogue unwind, dwarf2 pass changes
On 2012-06-11 01:34, Chung-Lin Tang wrote: Ping? On 2012/6/1 06:24 PM, Chung-Lin Tang wrote: On 12/5/23 1:46 AM, Richard Henderson wrote: On 05/18/12 03:48, Chung-Lin Tang wrote: @@ -2401,6 +2401,7 @@ scan_trace (dw_trace_info *trace) { /* Propagate across fallthru edges. */ dwarf2out_flush_queued_reg_saves (); +def_cfa_1 (this_cfa); maybe_record_trace_start (insn, NULL); break; } @@ -2455,6 +2456,18 @@ scan_trace (dw_trace_info *trace) cur_cfa = this_cfa; continue; } +else + { +/* If ELT is a annulled branch-taken instruction (i.e. executed + only when branch is not taken), the args_size and CFA should + not change through the jump. */ +create_trace_edges (control); + +/* Update and continue with the trace. */ +add_cfi_insn = insn; +scan_insn_after (elt); +continue; + } I think the def_cfa_1 is misplaced. It should be immediately before that last continue. That mirrors the sort of flow you get via the other paths through the loop. Or possibly moved before the dwarf2out_flush_queued_reg_saves () call in the patch? (in the save_point_p () break case) Note I'm only saying this based on overall ordering of those two routine calls in the loop. Attached is a testcase (adapted from libgomp) that, with the SH epilogue unwind patch applied alone, produces the ICE I'm seeing (-O2 -funwind-tables). This dwarf2 patch, with any of the three new def_cfa_1() call sites seems to solve it, though you might want to comment on which call site seems correct I've committed the following, after a --enable-languages=c,c++ --target=sh-elf build and check-gcc with your patch 2/2 applied. I eyeballed the resulting debug info and it looked correct. Given the test case was extracted from libgomp I didn't commit that, trusting that the real testing will continue to happen during a build. r~ * dwarf2cfi.c (scan_trace): Handle annulled branch-taken delay slots. diff --git a/gcc/dwarf2cfi.c b/gcc/dwarf2cfi.c index bf2d802f..3edb6e1 100644 --- a/gcc/dwarf2cfi.c +++ b/gcc/dwarf2cfi.c @@ -2429,18 +2429,20 @@ scan_trace (dw_trace_info *trace) elt = XVECEXP (pat, 0, 1); - /* If ELT is an instruction from target of an annulled branch, -the effects are for the target only and so the args_size -and CFA along the current path shouldn't change. */ if (INSN_FROM_TARGET_P (elt)) { HOST_WIDE_INT restore_args_size; cfi_vec save_row_reg_save; + /* If ELT is an instruction from target of an annulled +branch, the effects are for the target only and so +the args_size and CFA along the current path +shouldn't change. */ add_cfi_insn = NULL; restore_args_size = cur_trace-end_true_args_size; cur_cfa = cur_row-cfa; - save_row_reg_save = VEC_copy (dw_cfi_ref, gc, cur_row-reg_save); + save_row_reg_save + = VEC_copy (dw_cfi_ref, gc, cur_row-reg_save); scan_insn_after (elt); @@ -2453,8 +2455,20 @@ scan_trace (dw_trace_info *trace) cur_row-cfa = this_cfa; cur_row-reg_save = save_row_reg_save; cur_cfa = this_cfa; - continue; } + else + { + /* If ELT is a annulled branch-taken instruction (i.e. +executed only when branch is not taken), the args_size +and CFA should not change through the jump. */ + create_trace_edges (control); + + /* Update and continue with the trace. */ + add_cfi_insn = insn; + scan_insn_after (elt); + def_cfa_1 (this_cfa); + } + continue; } /* The insns in the delay slot should all be considered to happen
Re: [PATCH] Add vector cost model density heuristic
On Mon, Jun 11, 2012 at 5:38 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: On Mon, 2012-06-11 at 11:09 -0400, David Edelsohn wrote: On Mon, Jun 11, 2012 at 10:55 AM, Richard Guenther rguent...@suse.de wrote: Well, they are at least magic numbers and heuristics that apply generally and not only to the single issue in sphinx. And in fact how it works for sphinx _is_ magic. Second, I suggest that you need to rephrase I can make you and re-send your reply. Sorry for my bad english. Consider it meaning that I'd rather have you think about a more proper solution. That's what patch review is about after all, no? Sometimes a complete re-write (which gets more difficult which each of the patches enhancing the not ideal current state) is the best thing to do. Richard, The values of the heuristics may be magic, but Bill believes the heuristics are testing the important characteristics. The heuristics themselves are controlled by hooks, so the target can set the correct values for their own requirements. The concern is that a general cost infrastructure is too general. And, based on history, all ports simply will copy the boilerplate from the first implementation. It also may cause more problems because the target has relatively little information to be able to judge heuristics at that point in the middle-end. If the targets start to get too cute or too complicated, it may cause more problems or more confusion about why more complicated heuristics are not effective and not producing the expected results. I worry about creating another machine dependent reorg catch-all pass. Maybe an incremental pre- and/or post- cost hook would be more effective. I will let Bill comment. Thanks David, I can see both sides of this, and it's hard to judge the future from where I stand. My belief is that the number of heuristics targets will implement will be fairly limited, since judgments about cycle-level costs are not accurately predictable during the middle end. All we can do is come up with a few things that seem to make sense. Doing too much in the back end seems impractical. The interesting question to me is whether cost model heuristics are general enough to be reusable. What I saw in this case was what I considered to be a somewhat target-neutral problem: overwhelming those assets of the processor that implement vectorization. It seemed reasonable to provide hooks for others to use the idea if they encounter similar issues. If reusing the heuristic is useful, then having to copy the logic from one target to another isn't the best approach. If nobody else will ever use it, then embedding it in the back end is reasonable. Unfortunately my crystal ball has been on the fritz for several decades, so I can't tell you for sure which is right... Richard, my biggest question is whether you think other targets are likely to take advantage of a more general back-end interface, or whether this will end up just being a PowerPC wart. If you know of ways this will be useful for i386, that would be helpful to know. Perhaps this requires your crystal ball as well; not sure how well yours works... If we look at just this one issue in isolation, then changing all the code in the vectorizer that calculates inside/outside loop costs and moving it to targetm seems more invasive than adding the few hooks. But if this will really be a useful feature for the community as a whole I am certainly willing to tackle it. Well. At the moment the cost model is purely local in that the target receives no context about the cost it has to return for (part of) a vectorized operation. In fact the vectorizer already creates multiple instructions (and thus calls to the cost target hook) for a single scalar statement in some cases. There are two ways to increase the context the target can look at - invent more kinds of statements we pass to the cost model hook, and thus arrive at a point where for each group of stmts the vectorizer emits and that possibly has a more efficient target implementation we have a different statement kind (only for the cost model, we continue to generate multiple GIMPLE stmts). The other way is to give the target the ability to see context, thus relation of the (sub-)statements the vectorizer asks the target to compute costs for. I've run into the situation where there was missing context on x86, too (but I forgot the specific case), and that's where I came up with the general idea. Of course both work, but for your kind of heuristic only looking at context works. In the scheme I suggested the fini_cost hook would be able to do the counting (as would a simple counter - I don't believe that this would not work - you do counting, too, after all, simply make the fini hook able to reject the vectorization). If powerpc is issue-limited then you have the additional issue that you have no idea of the amount of address calculations
Re: [RFC] Target-specific limits on vector alignment
On Mon, Jun 11, 2012 at 5:25 PM, Richard Earnshaw rearn...@arm.com wrote: On 11/06/12 15:53, Richard Guenther wrote: On Mon, Jun 11, 2012 at 4:38 PM, Richard Earnshaw rearn...@arm.com wrote: On 11/06/12 15:17, Richard Guenther wrote: On Mon, Jun 11, 2012 at 3:16 PM, Richard Earnshaw rearn...@arm.com wrote: The ARM ABI states that vectors larger than 64 bits in size still have 64-bit alignment; never-the-less, the HW supports alignment hints of up to 128-bits in some cases and will trap in a vector has an alignment that less than the hint. GCC currently hard-codes larger vectors to be aligned by the size of the vector, which means that 128-bit vectors are marked as being 128-bit aligned. The ARM ABI unfortunately does not support generating such alignment for parameters passed by value and this can lead to traps at run time. It seems that the best way to solve this problem is to allow the back-end to set an upper limit on the alignment permitted for a vector. I've implemented this as a separate hook, rather than using the existing hooks because there's a strong likelihood of breaking some existing ABIs if I did it another way. There are a couple of tests that will need some re-working before this can be committed to deal with the fall-out of making this change; I'll prepare those changes if this patch is deemed generally acceptable. Hm. Where would you use that target hook? Doh! It was supposed to be in the patch set... :-( in layout_type(), where the alignment of a vector is forced to the size of the vector. Ok. + /* Naturally align vectors, but let the target set an upper + limit. This prevents ABI changes depending on whether or + not native vector modes are supported. */ + TYPE_ALIGN (type) + = targetm.vector_alignment (type, tree_low_cst (TYPE_SIZE (type), 0)); The type argument or the size argument looks redundant. Technically, yes, we could get rid of tree_low_cst (TYPE_SIZE (type) and calculate it inside the alignment function if it was needed. However, it seemed likely that most targets would need that number one way or another, such that passing it would be helpful. Well, you don't need it in stor-layout and targets might think the value may be completely unrelated to the type ... Note that we still can have such vector properly aligned, thus the vectorizer would need to use build_aligned_type also if it knows the type is aligned, not only when it thinks it is misaligned. You basically change the alignment of the default vector type. I'm not sure I follow... I say that a large vector may be still aligned, so the vectorizer when creating vector memory references has to use a non-default aligned vector type when the vector is aligned. It won't do that at the moment. Richard. R.
[PATCH][Cilkplus] Patch to make internal struct runtme independent
Hello Everyone, This patch is for the Cilkplus branch affecting both C and C++ compilers. This patch will make the cilkrts_stack_frame struct independent of the runtime. Thanking You, Yours Sincerely, Balaji V. Iyer. Index: gcc/cilk.c === --- gcc/cilk.c (revision 188393) +++ gcc/cilk.c (working copy) @@ -165,7 +165,7 @@ TYPE_ALIGN (frame) = PREFERRED_STACK_BOUNDARY; TREE_ADDRESSABLE (frame) = 1; /* XXX Is this right? */ - finish_builtin_struct (frame, __cilkrts_stack_frame, fields, NULL_TREE); + finish_builtin_struct (frame, __cilkrts_stack_frame_GCC, fields, NULL_TREE); /* XXX C++ only SET_CLASS_TYPE_P(record, 1); */ /* XXX C++ only xref_basetypes (record, NULL_TREE); */ cilk_frame_type_decl = frame; Index: gcc/ChangeLog.cilk === --- gcc/ChangeLog.cilk (revision 188393) +++ gcc/ChangeLog.cilk (working copy) @@ -1,3 +1,8 @@ +2012-06-11 Balaji V. Iyer balaji.v.i...@intel.com + + * cilk.c (cilk_init_builtins): Made the stack frame structure runtime + independent. + 2012-06-05 Balaji V. Iyer balaji.v.i...@intel.com * cilk-spawn.c (compute_loop_var): Removed an unwanted assert.
Re: [PATCH] Correct cost model for strided loads
On Mon, Jun 11, 2012 at 5:01 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: On Mon, 2012-06-11 at 16:10 +0200, Richard Guenther wrote: On Mon, 11 Jun 2012, William J. Schmidt wrote: On Mon, 2012-06-11 at 11:15 +0200, Richard Guenther wrote: On Sun, Jun 10, 2012 at 5:58 PM, William J. Schmidt wschm...@linux.vnet.ibm.com wrote: The fix for PR53331 caused a degradation to 187.facerec on powerpc64-unknown-linux-gnu. The following simple patch reverses the degradation without otherwise affecting SPEC cpu2000 or cpu2006. Bootstrapped and regtested on that platform with no new regressions. Ok for trunk? Well, would the real cost not be subparts * scalar_to_vec plus subparts * vec_perm? At least vec_perm isn't the cost for building up a vector from N scalar elements either (it might be close enough if subparts == 2). What's the case with facerec here? Does it have subparts == 2? In this case, subparts == 4 (32-bit floats, 128-bit vec reg). On PowerPC, this requires two merge instructions and a permute instruction to get the four 32-bit quantities into the right place in a 128-bit register. Currently this is modeled as a vec_perm in other parts of the vectorizer per Ira's earlier patches, so I naively changed this to do the same thing. I see. The types of vectorizer instructions aren't documented, and I can't infer much from the i386.c cost model, so I need a little education. What semantics are represented by scalar_to_vec? It's a vector splat, thus x - { x, x, x, ... }. You can create { x, y, z, ... } by N such splats plus N - 1 permutes (if a permute, as VEC_PERM_EXPR, takes two input vectors). That's by far not the most efficient way to build up such a vector of course (with AVX you could do one splat plus N - 1 inserts for example). The cost is of course dependent on the number of vector elements, so a simple new enum vect_cost_for_stmt kind does not cover it but the target would have to look at the vector type passed and do some reasonable guess. Ah, splat! Yes, that's lingo I understand. I see the intent now. On PowerPC, we have this mapping of the floating-point registers and vector float registers where they overlap (the low-order half of each of the first 32 vector float regs corresponds to a scalar float reg). So in this case we have four scalar loads that place things in the bottom half of four vector registers, two vector merge instructions that collapse the four registers into two vector registers, and a vector permute that gets things in the right order.(*) I wonder if what we refer to as a merge instruction is similar to scalar_to_vec. Looks similar to x86 SSE then. If so, then maybe we need something like subparts = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); inside_cost += vect_get_stmt_cost (scalar_load) * ncopies * subparts; inside_cost += ncopies * vect_get_stmt_cost (scalar_to_vec) * subparts / 2; inside_cost += ncopies * vect_get_stmt_cost (vec_perm); But then we'd have to change how vec_perm is modeled elsewhere for PowerPC based on Ira's earlier patches. As I said, it's difficult for me to figure out all the intent of cost model decisions that have been made in the past, using current documentation. Heh, usually the intent was to make the changes simple, not to compute a proper cost. I think we simply need a new scalars_to_vec cost kind. That works. Maybe vec_construct gets the point across a little better? I think we need to use the full builtin_vectorization_cost interface instead of vect_get_stmt_cost here, so the targets can parameterize on type. Then we can just do one cost calculation for vec_construct that covers the full costs of getting the vector in order after the loads. Yes, vec_construct sounds good to me. I really wanted to pessimize this case for say AVX and char elements, thus building up a vector from 32 scalars which certainly does not cost a mere vec_perm. So, maybe special-case the subparts == 2 case and assume vec_perm would match the cost only in that case. (I'm a little confused by this as what you have at the moment is a single scalar_to_vec per copy, which has a cost of 1 on most i386 targets (occasionally 2). The subparts multiplier is only applied to the loads. So changing this to vec_perm seemed to be a no-op for i386.) Oh, I somehow read the patch as you were removing the multiplication by TYPE_VECTOR_SUBPARTS. But yes, the cost is way off and I'd wanted to reflect it with N scalar loads plus N splats plus N - 1 permutes originally. You could also model it with N scalar loads plus N inserts (but we don't have a vec_insert cost either). I think adding a scalars_to_vec or vec_init or however we want to call it - basically what the cost of a vector CONSTRUCTOR would be - is best. (*) There are actually
[PATCH][Cilkplus] Patch to make internal struct runtme independent
Hello Everyone, This patch is for the Cilkplus branch affecting both C and C++ compilers. This patch will make the cilkrts_stack_frame structure independent of the runtime. Thanking You, Yours Sincerely, Balaji V. Iyer. Index: gcc/cilk.c === --- gcc/cilk.c (revision 188393) +++ gcc/cilk.c (working copy) @@ -165,7 +165,7 @@ TYPE_ALIGN (frame) = PREFERRED_STACK_BOUNDARY; TREE_ADDRESSABLE (frame) = 1; /* XXX Is this right? */ - finish_builtin_struct (frame, __cilkrts_stack_frame, fields, NULL_TREE); + finish_builtin_struct (frame, __cilkrts_stack_frame_GCC, fields, NULL_TREE); /* XXX C++ only SET_CLASS_TYPE_P(record, 1); */ /* XXX C++ only xref_basetypes (record, NULL_TREE); */ cilk_frame_type_decl = frame; Index: gcc/ChangeLog.cilk === --- gcc/ChangeLog.cilk (revision 188393) +++ gcc/ChangeLog.cilk (working copy) @@ -1,3 +1,8 @@ +2012-06-11 Balaji V. Iyer balaji.v.i...@intel.com + + * cilk.c (cilk_init_builtins): Made the stack frame structure runtime + independent. + 2012-06-05 Balaji V. Iyer balaji.v.i...@intel.com * cilk-spawn.c (compute_loop_var): Removed an unwanted assert.
Make timevar phases mutually exclusive. (issue6302064)
The intent of the phases was to have a high-level but mutually exclusive accounting of compile time. We want to track compile time in a way that tells us which conceptual phases are taking the most time. That intent is not currently satisfied. This patch restores that intent. Add code to verify that the sum of the phase times is less than the total time, to detect when phases are overlapped. A slight amount of leeway is required due to time jitters. This verification is done as the last step in printing timevars so that any timevar information is not lost. Rename the phases to be clearer about what they measure, so that they are less likely to be modified to be overlapping. The primary example is to change TV_PHASE_GENERATE to TV_PHASE_LATE_ASM, meaning late work on the assembly. This name change avoids confusion n moving the timevar start call after the call to lang_hooks.decls.final_write_globals, which prevents overlapping phases. Each implementation of lang_hooks.decls.final_write_globals, is responsible for starting and stopping its own phases. Each implementation currently has a first phase of TV_PHASE_DEFERRED for front-end work deferred until after parsing is complete. The second phase has been renamed from TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN, to better reflect its use as the main optimization and generation phase. This phase accounts for 70%-80% of compilation time. The third phase is TV_PHASE_DBGINFO, except in cp/decl2.c, where it is TV_PHASE_CHECK_DBGINFO because cc1plus mixes checking in with debug info generation. In langhooks.c, write_global_declarations was using TV_PHASE_CHECK_DBGINFO, but it was doing no checking. So, it now uses TV_PHASE_DBGINFO. The changes to LTO are significant. First, initialization now uses TV_PHASE_SETUP. Reading files now uses TV_PHASE_STREAM_IN. Writing files now uses TV_PHASE_STREAM_OUT. The remaining phase is TV_PHASE_OPT_GEN (formerly TV_PHASE_CGRAPH). Tested on x86_64. Okay for trunk? Index: gcc/ChangeLog 2012-06-11 Lawrence Crowl cr...@google.com * timevar.def (TV_PHASE_GENERATE): Rename to TV_PHASE_LATE_ASM. (TV_PHASE_CGRAPH): Rename to TV_PHASE_OPT_GEN. (TV_PHASE_STREAM_IN): New. (TV_PHASE_STREAM_OUT): New. * timevar.c (validate_phases): New. (timevar_print): Call validate_phases. * c-decl.c (c_write_global_declarations): Rename use of TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN. * langhooks.c (write_global_declarations): Rename use of TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN. Use TV_PHASE_DBGINFO instead of TV_PHASE_CHECK_DBGINFO. * toplev.c (compile_file): Rename use of TV_PHASE_GENERATE to TV_PHASE_LATE_ASM. Move start of TV_PHASE_LATE_ASM to after call to lang_hooks.decls.final_write_globals. Index: gcc/cp/ChangeLog 2012-06-11 Lawrence Crowl cr...@google.com * decl2.c (cp_write_global_declarations): Rename use of TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN. Index: gcc/lto/ChangeLog 2012-06-11 Lawrence Crowl cr...@google.com * lto.c (do_whole_program_analysis): Rename use of TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN. Use new timevar TV_PHASE_STREAM_OUT around the call to lto_wpa_write_files. (lto_main): Rename use of TV_PHASE_CGRAPH to TV_PHASE_OPT_GEN. Move start of TV_PHASE_OPT_GEN to include call to materialize_cgraph. Use TV_PHASE_SETUP for the call to lto_init. Use new timevar TV_PHASE_STREAM_IN around the call to read_cgraph_and_symbols. Index: gcc/toplev.c === --- gcc/toplev.c(revision 188335) +++ gcc/toplev.c(working copy) @@ -558,18 +558,15 @@ compile_file (void) if (flag_syntax_only || flag_wpa) return; - timevar_start (TV_PHASE_GENERATE); - ggc_protect_identifiers = false; /* This must also call finalize_compilation_unit. */ lang_hooks.decls.final_write_globals (); if (seen_error ()) -{ - timevar_stop (TV_PHASE_GENERATE); - return; -} +return; + + timevar_start (TV_PHASE_LATE_ASM); /* Compilation unit is finalized. When producing non-fat LTO object, we are basically finished. */ @@ -670,7 +667,7 @@ compile_file (void) assembly file after this point. */ targetm.asm_out.file_end (); - timevar_stop (TV_PHASE_GENERATE); + timevar_stop (TV_PHASE_LATE_ASM); } /* Print version information to FILE. Index: gcc/cp/decl2.c === --- gcc/cp/decl2.c (revision 188335) +++ gcc/cp/decl2.c (working copy) @@ -4017,11 +4017,11 @@ cp_write_global_declarations (void) candidates = collect_candidates_for_java_method_aliases (); timevar_stop (TV_PHASE_DEFERRED); - timevar_start (TV_PHASE_CGRAPH); + timevar_start (TV_PHASE_OPT_GEN); finalize_compilation_unit (); - timevar_stop (TV_PHASE_CGRAPH); + timevar_stop
[patch]: move stackalign tests that use dg-options
The tests in gcc.dg/torture/stackalign use two sets of torture options: the usual optimization sets used as default for torture tests, and up to four sets of options that are specific to stack alignment. The stack alignment options are passed via an option that is used by the dg-test option dg_extra_tool_flags, which can be overridden by dg-options. This means that the seven tests in that test directory that use dg-option do not use the other alignment options. All seven of these tests are limited to x86 targets. Four of them use -msse2, and four use different values for -mpreferred-stack-boundary (one does both), so it doesn't look as if they are intended to use the four sets of stackalign torture options. This patch moves those seven tests out of the stackalign directory up to the general gcc.dg/torture directory. With them out of the way I'll be able to use clean up the remaining stackalign tests to use torture test support to combine the stack align options with other torture options so they'll show up in test summary lines, eliminating lots of duplicate lines in test summaries. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/torture/stackalign/alloca-2.c: Move to ... * gcc.dg/torture/alloca-2.c: ... here. * gcc.dg/torture/stackalign/alloca-3.c: Move to ... * gcc.dg/torture/alloca-3.c: ... here. * gcc.dg/torture/stackalign/alloca-4.c: Move to ... * gcc.dg/torture/alloca-4.c: ... here. * gcc.dg/torture/stackalign/alloca-5.c: Move to ... * gcc.dg/torture/alloca-5.c: ... here. * gcc.dg/torture/stackalign/alloca-6.c: Move to ... * gcc.dg/torture/alloca-6.c: ... here. * gcc.dg/torture/stackalign/push-1.c: Move to ... * gcc.dg/torture/push-1.c: ... here. * gcc.dg/torture/stackalign/vararg-3.c: Move to ... * gcc.dg/torture/vararg-3.c: ... here.
Re: [PATCH] Fixincludes/VxWorks
On 06/10/2012 02:38 PM, Bruce Korb wrote: On Sun, Jun 10, 2012 at 10:57 AM, Nathan Sidwellnat...@acm.org wrote: On 06/06/12 17:33, rbmj wrote: Hi everyone, This patch series is the result of this [1] thread about fixincludes on VxWorks. It resolves bugs 53457 and 53378, and a few other issues that previously required manual intervention to fix for VxWorks header files. From a vxworks POV these all look ok. Oh, from fixincludes, too. Could somebody please commit this? I don't have access. Thanks, Robert Mason
Re: [Patch] Add AC_ARG_ENABLE for libstdc++-v3
On 06/11/2012 08:02 AM, Paolo Bonzini wrote: Il 11/06/2012 13:59, rbmj ha scritto: On 05/22/2012 04:37 PM, rbmj wrote: This patch adds an AC_ARG_ENABLE option to build/not build libstdc++-v3. I wrote the patch in order to allow the user to override the automatic disable for libstdc++-v3 for certain targets. Ping^2 on http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01525.html Ok. If it's OK, could somebody please commit? Thanks, Robert
Re: [patch]: move stackalign tests that use dg-options
On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson janis_john...@mentor.com wrote: The tests in gcc.dg/torture/stackalign use two sets of torture options: the usual optimization sets used as default for torture tests, and up to four sets of options that are specific to stack alignment. The stack alignment options are passed via an option that is used by the dg-test option dg_extra_tool_flags, which can be overridden by dg-options. This means that the seven tests in that test directory that use dg-option do not use the other alignment options. All seven of these tests are limited to x86 targets. Four of them use -msse2, and four use different values for -mpreferred-stack-boundary (one does both), so it doesn't look as if they are intended to use the four sets of stackalign torture options. This patch moves those seven tests out of the stackalign directory up to the general gcc.dg/torture directory. With them out of the way I'll be able to use clean up the remaining stackalign tests to use torture test support to combine the stack align options with other torture options so they'll show up in test summary lines, eliminating lots of duplicate lines in test summaries. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/torture/stackalign/alloca-2.c: Move to ... * gcc.dg/torture/alloca-2.c: ... here. * gcc.dg/torture/stackalign/alloca-3.c: Move to ... * gcc.dg/torture/alloca-3.c: ... here. * gcc.dg/torture/stackalign/alloca-4.c: Move to ... * gcc.dg/torture/alloca-4.c: ... here. * gcc.dg/torture/stackalign/alloca-5.c: Move to ... * gcc.dg/torture/alloca-5.c: ... here. * gcc.dg/torture/stackalign/alloca-6.c: Move to ... * gcc.dg/torture/alloca-6.c: ... here. * gcc.dg/torture/stackalign/push-1.c: Move to ... * gcc.dg/torture/push-1.c: ... here. * gcc.dg/torture/stackalign/vararg-3.c: Move to ... * gcc.dg/torture/vararg-3.c: ... here. stackalign.exp has gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags if { [check_effective_target_fpic] } then { set pic_additional_flags $additional_flags lappend pic_additional_flags -fpic gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags } It adds PIC tests. Will this change remove PIC tests? -- H.J.
Re: [patch]: move stackalign tests that use dg-options
On 06/11/2012 12:15 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson janis_john...@mentor.com wrote: The tests in gcc.dg/torture/stackalign use two sets of torture options: the usual optimization sets used as default for torture tests, and up to four sets of options that are specific to stack alignment. The stack alignment options are passed via an option that is used by the dg-test option dg_extra_tool_flags, which can be overridden by dg-options. This means that the seven tests in that test directory that use dg-option do not use the other alignment options. All seven of these tests are limited to x86 targets. Four of them use -msse2, and four use different values for -mpreferred-stack-boundary (one does both), so it doesn't look as if they are intended to use the four sets of stackalign torture options. This patch moves those seven tests out of the stackalign directory up to the general gcc.dg/torture directory. With them out of the way I'll be able to use clean up the remaining stackalign tests to use torture test support to combine the stack align options with other torture options so they'll show up in test summary lines, eliminating lots of duplicate lines in test summaries. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/torture/stackalign/alloca-2.c: Move to ... * gcc.dg/torture/alloca-2.c: ... here. * gcc.dg/torture/stackalign/alloca-3.c: Move to ... * gcc.dg/torture/alloca-3.c: ... here. * gcc.dg/torture/stackalign/alloca-4.c: Move to ... * gcc.dg/torture/alloca-4.c: ... here. * gcc.dg/torture/stackalign/alloca-5.c: Move to ... * gcc.dg/torture/alloca-5.c: ... here. * gcc.dg/torture/stackalign/alloca-6.c: Move to ... * gcc.dg/torture/alloca-6.c: ... here. * gcc.dg/torture/stackalign/push-1.c: Move to ... * gcc.dg/torture/push-1.c: ... here. * gcc.dg/torture/stackalign/vararg-3.c: Move to ... * gcc.dg/torture/vararg-3.c: ... here. stackalign.exp has gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags if { [check_effective_target_fpic] } then { set pic_additional_flags $additional_flags lappend pic_additional_flags -fpic gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags } It adds PIC tests. Will this change remove PIC tests? The options for PIC tests are overridden by dg-options. This change will make no difference at all to how these 7 tests are compiled. They are currently compiled four times with exactly the same options. Janis
[committed] ChangeLog formatting fixes
I've just committed the attached patch as rev 188397 which fixes some formatting in the gcc/ChangeLog. Cheers, Oleg Index: gcc/ChangeLog === --- gcc/ChangeLog (revision 188396) +++ gcc/ChangeLog (working copy) @@ -452,13 +452,15 @@ 2012-06-03 Oleg Endo olege...@gcc.gnu.org PR target/53512 - * sh.opt (mfsca, mfsrra): New options. - * sh.md (rsqrtsf2): Use TARGET_FPU_ANY and TARGET_FSRRA condition. + * config/sh/sh.opt (mfsca, mfsrra): New options. + * config/sh/sh.md (rsqrtsf2): Use TARGET_FPU_ANY and TARGET_FSRRA + condition. (fsca): Use TARGET_FPU_ANY and TARGET_FSCA condition. (sinssf2, cossf2): Fold expanders to ... (sincossf3): ... this new expander. Use TARGET_FPU_ANY and TARGET_FSCA condition. - * sh.c (sh_option_override): Handle TARGET_FSRRA and TARGET_FSCA. + * config/sh/sh.c (sh_option_override): Handle TARGET_FSRRA and + TARGET_FSCA. * doc/invoke.texi (SH Options): Add descriptions for -mfsca, -mno-fsca, -mfsrra, -mno-fsrra. @@ -501,7 +503,8 @@ * varasm.c (output_constructor_bitfield): Likewise. * tree-vrp.c (register_edge_assert_for_2): Likewise. * double-int.c (rshift_double, lshift_double): Likewise. - * double-int.h (double_int_fits_in_uhwi_p, double_int, double_int): Likewise. + * double-int.h (double_int_fits_in_uhwi_p, double_int, double_int): + Likewise. * simplify-rtx.c (mode_signbit_p) (simplify_const_unary_operation, simplify_binary_operation_1) (simplify_immed_subreg): Likewise.
Re: [patch]: move stackalign tests that use dg-options
On 06/11/2012 12:35 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:21 PM, Janis Johnson janis_john...@mentor.com wrote: On 06/11/2012 12:15 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson janis_john...@mentor.com wrote: The tests in gcc.dg/torture/stackalign use two sets of torture options: the usual optimization sets used as default for torture tests, and up to four sets of options that are specific to stack alignment. The stack alignment options are passed via an option that is used by the dg-test option dg_extra_tool_flags, which can be overridden by dg-options. This means that the seven tests in that test directory that use dg-option do not use the other alignment options. All seven of these tests are limited to x86 targets. Four of them use -msse2, and four use different values for -mpreferred-stack-boundary (one does both), so it doesn't look as if they are intended to use the four sets of stackalign torture options. This patch moves those seven tests out of the stackalign directory up to the general gcc.dg/torture directory. With them out of the way I'll be able to use clean up the remaining stackalign tests to use torture test support to combine the stack align options with other torture options so they'll show up in test summary lines, eliminating lots of duplicate lines in test summaries. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/torture/stackalign/alloca-2.c: Move to ... * gcc.dg/torture/alloca-2.c: ... here. * gcc.dg/torture/stackalign/alloca-3.c: Move to ... * gcc.dg/torture/alloca-3.c: ... here. * gcc.dg/torture/stackalign/alloca-4.c: Move to ... * gcc.dg/torture/alloca-4.c: ... here. * gcc.dg/torture/stackalign/alloca-5.c: Move to ... * gcc.dg/torture/alloca-5.c: ... here. * gcc.dg/torture/stackalign/alloca-6.c: Move to ... * gcc.dg/torture/alloca-6.c: ... here. * gcc.dg/torture/stackalign/push-1.c: Move to ... * gcc.dg/torture/push-1.c: ... here. * gcc.dg/torture/stackalign/vararg-3.c: Move to ... * gcc.dg/torture/vararg-3.c: ... here. stackalign.exp has gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags if { [check_effective_target_fpic] } then { set pic_additional_flags $additional_flags lappend pic_additional_flags -fpic gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags } It adds PIC tests. Will this change remove PIC tests? The options for PIC tests are overridden by dg-options. This change will make no difference at all to how these 7 tests are compiled. They are currently compiled four times with exactly the same options. That doesn't match what I see on trunk as of yesterday: Executing on host: /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/ /export/gnu/import/git/gcc/gcc/testsuite/gcc.dg/torture/stackalign/comp-goto-1.c -fno-diagnostics-show-caret -O0 -mstackrealign -mpreferred-stack-boundary=5 -mno-mmx -lm -o ./comp-goto-1.exe (timeout = 300) Not only stackalign.exp adds -fpic, it also adds -mforce-drap, -mstackrealign, -mpreferred-stack-boundary=5 and -mno-mmx. Your example uses comp-goto-1.c which uses the four sets of stack alignment options as intended. That isn't the case for the 7 tests that use dg-options; try one of those. I plan to modify stackalign.exp so that the stackalign options will be treated as torture options and be reported in test summaries to make each line in a test summary unique, with the goal of running the tests with the same sets of options that are used now. Janis
Re: [patch]: move stackalign tests that use dg-options
On Mon, Jun 11, 2012 at 12:41 PM, Janis Johnson janis_john...@mentor.com wrote: On 06/11/2012 12:35 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:21 PM, Janis Johnson janis_john...@mentor.com wrote: On 06/11/2012 12:15 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson janis_john...@mentor.com wrote: The tests in gcc.dg/torture/stackalign use two sets of torture options: the usual optimization sets used as default for torture tests, and up to four sets of options that are specific to stack alignment. The stack alignment options are passed via an option that is used by the dg-test option dg_extra_tool_flags, which can be overridden by dg-options. This means that the seven tests in that test directory that use dg-option do not use the other alignment options. All seven of these tests are limited to x86 targets. Four of them use -msse2, and four use different values for -mpreferred-stack-boundary (one does both), so it doesn't look as if they are intended to use the four sets of stackalign torture options. This patch moves those seven tests out of the stackalign directory up to the general gcc.dg/torture directory. With them out of the way I'll be able to use clean up the remaining stackalign tests to use torture test support to combine the stack align options with other torture options so they'll show up in test summary lines, eliminating lots of duplicate lines in test summaries. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/torture/stackalign/alloca-2.c: Move to ... * gcc.dg/torture/alloca-2.c: ... here. * gcc.dg/torture/stackalign/alloca-3.c: Move to ... * gcc.dg/torture/alloca-3.c: ... here. * gcc.dg/torture/stackalign/alloca-4.c: Move to ... * gcc.dg/torture/alloca-4.c: ... here. * gcc.dg/torture/stackalign/alloca-5.c: Move to ... * gcc.dg/torture/alloca-5.c: ... here. * gcc.dg/torture/stackalign/alloca-6.c: Move to ... * gcc.dg/torture/alloca-6.c: ... here. * gcc.dg/torture/stackalign/push-1.c: Move to ... * gcc.dg/torture/push-1.c: ... here. * gcc.dg/torture/stackalign/vararg-3.c: Move to ... * gcc.dg/torture/vararg-3.c: ... here. stackalign.exp has gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags if { [check_effective_target_fpic] } then { set pic_additional_flags $additional_flags lappend pic_additional_flags -fpic gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags } It adds PIC tests. Will this change remove PIC tests? The options for PIC tests are overridden by dg-options. This change will make no difference at all to how these 7 tests are compiled. They are currently compiled four times with exactly the same options. That doesn't match what I see on trunk as of yesterday: Executing on host: /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/ /export/gnu/import/git/gcc/gcc/testsuite/gcc.dg/torture/stackalign/comp-goto-1.c -fno-diagnostics-show-caret -O0 -mstackrealign -mpreferred-stack-boundary=5 -mno-mmx -lm -o ./comp-goto-1.exe (timeout = 300) Not only stackalign.exp adds -fpic, it also adds -mforce-drap, -mstackrealign, -mpreferred-stack-boundary=5 and -mno-mmx. Your example uses comp-goto-1.c which uses the four sets of stack alignment options as intended. That isn't the case for the 7 tests that use dg-options; try one of those. I plan to modify stackalign.exp so that the stackalign options will be treated as torture options and be reported in test summaries to make each line in a test summary unique, with the goal of running the tests with the same sets of options that are used now. Janis Can we use dg-add-options to properly add those options? Thanks. -- H.J.
Re: [patch]: move stackalign tests that use dg-options
On 06/11/2012 12:48 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:41 PM, Janis Johnson janis_john...@mentor.com wrote: On 06/11/2012 12:35 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:21 PM, Janis Johnson janis_john...@mentor.com wrote: On 06/11/2012 12:15 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson janis_john...@mentor.com wrote: The tests in gcc.dg/torture/stackalign use two sets of torture options: the usual optimization sets used as default for torture tests, and up to four sets of options that are specific to stack alignment. The stack alignment options are passed via an option that is used by the dg-test option dg_extra_tool_flags, which can be overridden by dg-options. This means that the seven tests in that test directory that use dg-option do not use the other alignment options. All seven of these tests are limited to x86 targets. Four of them use -msse2, and four use different values for -mpreferred-stack-boundary (one does both), so it doesn't look as if they are intended to use the four sets of stackalign torture options. This patch moves those seven tests out of the stackalign directory up to the general gcc.dg/torture directory. With them out of the way I'll be able to use clean up the remaining stackalign tests to use torture test support to combine the stack align options with other torture options so they'll show up in test summary lines, eliminating lots of duplicate lines in test summaries. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/torture/stackalign/alloca-2.c: Move to ... * gcc.dg/torture/alloca-2.c: ... here. * gcc.dg/torture/stackalign/alloca-3.c: Move to ... * gcc.dg/torture/alloca-3.c: ... here. * gcc.dg/torture/stackalign/alloca-4.c: Move to ... * gcc.dg/torture/alloca-4.c: ... here. * gcc.dg/torture/stackalign/alloca-5.c: Move to ... * gcc.dg/torture/alloca-5.c: ... here. * gcc.dg/torture/stackalign/alloca-6.c: Move to ... * gcc.dg/torture/alloca-6.c: ... here. * gcc.dg/torture/stackalign/push-1.c: Move to ... * gcc.dg/torture/push-1.c: ... here. * gcc.dg/torture/stackalign/vararg-3.c: Move to ... * gcc.dg/torture/vararg-3.c: ... here. stackalign.exp has gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags if { [check_effective_target_fpic] } then { set pic_additional_flags $additional_flags lappend pic_additional_flags -fpic gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags } It adds PIC tests. Will this change remove PIC tests? The options for PIC tests are overridden by dg-options. This change will make no difference at all to how these 7 tests are compiled. They are currently compiled four times with exactly the same options. That doesn't match what I see on trunk as of yesterday: Executing on host: /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/ /export/gnu/import/git/gcc/gcc/testsuite/gcc.dg/torture/stackalign/comp-goto-1.c -fno-diagnostics-show-caret -O0 -mstackrealign -mpreferred-stack-boundary=5 -mno-mmx -lm -o ./comp-goto-1.exe (timeout = 300) Not only stackalign.exp adds -fpic, it also adds -mforce-drap, -mstackrealign, -mpreferred-stack-boundary=5 and -mno-mmx. Your example uses comp-goto-1.c which uses the four sets of stack alignment options as intended. That isn't the case for the 7 tests that use dg-options; try one of those. I plan to modify stackalign.exp so that the stackalign options will be treated as torture options and be reported in test summaries to make each line in a test summary unique, with the goal of running the tests with the same sets of options that are used now. Janis Can we use dg-add-options to properly add those options? Thanks. I'll look into that. Consider this patch dropped for now. Janis
Re: [patch]: move stackalign tests that use dg-options
On Mon, Jun 11, 2012 at 12:57 PM, Janis Johnson janis_john...@mentor.com wrote: On 06/11/2012 12:48 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:41 PM, Janis Johnson janis_john...@mentor.com wrote: On 06/11/2012 12:35 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:21 PM, Janis Johnson janis_john...@mentor.com wrote: On 06/11/2012 12:15 PM, H.J. Lu wrote: On Mon, Jun 11, 2012 at 12:00 PM, Janis Johnson janis_john...@mentor.com wrote: The tests in gcc.dg/torture/stackalign use two sets of torture options: the usual optimization sets used as default for torture tests, and up to four sets of options that are specific to stack alignment. The stack alignment options are passed via an option that is used by the dg-test option dg_extra_tool_flags, which can be overridden by dg-options. This means that the seven tests in that test directory that use dg-option do not use the other alignment options. All seven of these tests are limited to x86 targets. Four of them use -msse2, and four use different values for -mpreferred-stack-boundary (one does both), so it doesn't look as if they are intended to use the four sets of stackalign torture options. This patch moves those seven tests out of the stackalign directory up to the general gcc.dg/torture directory. With them out of the way I'll be able to use clean up the remaining stackalign tests to use torture test support to combine the stack align options with other torture options so they'll show up in test summary lines, eliminating lots of duplicate lines in test summaries. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/torture/stackalign/alloca-2.c: Move to ... * gcc.dg/torture/alloca-2.c: ... here. * gcc.dg/torture/stackalign/alloca-3.c: Move to ... * gcc.dg/torture/alloca-3.c: ... here. * gcc.dg/torture/stackalign/alloca-4.c: Move to ... * gcc.dg/torture/alloca-4.c: ... here. * gcc.dg/torture/stackalign/alloca-5.c: Move to ... * gcc.dg/torture/alloca-5.c: ... here. * gcc.dg/torture/stackalign/alloca-6.c: Move to ... * gcc.dg/torture/alloca-6.c: ... here. * gcc.dg/torture/stackalign/push-1.c: Move to ... * gcc.dg/torture/push-1.c: ... here. * gcc.dg/torture/stackalign/vararg-3.c: Move to ... * gcc.dg/torture/vararg-3.c: ... here. stackalign.exp has gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $additional_flags if { [check_effective_target_fpic] } then { set pic_additional_flags $additional_flags lappend pic_additional_flags -fpic gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] $pic_additional_flags } It adds PIC tests. Will this change remove PIC tests? The options for PIC tests are overridden by dg-options. This change will make no difference at all to how these 7 tests are compiled. They are currently compiled four times with exactly the same options. That doesn't match what I see on trunk as of yesterday: Executing on host: /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/ /export/gnu/import/git/gcc/gcc/testsuite/gcc.dg/torture/stackalign/comp-goto-1.c -fno-diagnostics-show-caret -O0 -mstackrealign -mpreferred-stack-boundary=5 -mno-mmx -lm -o ./comp-goto-1.exe (timeout = 300) Not only stackalign.exp adds -fpic, it also adds -mforce-drap, -mstackrealign, -mpreferred-stack-boundary=5 and -mno-mmx. Your example uses comp-goto-1.c which uses the four sets of stack alignment options as intended. That isn't the case for the 7 tests that use dg-options; try one of those. I plan to modify stackalign.exp so that the stackalign options will be treated as torture options and be reported in test summaries to make each line in a test summary unique, with the goal of running the tests with the same sets of options that are used now. Janis Can we use dg-add-options to properly add those options? Thanks. I'll look into that. Consider this patch dropped for now. Thanks. -- H.J.
Re: [PATCH] Hoist adjacent loads
OK, once more with feeling... :) This patch differs from the previous one in two respects: It disables the optimization when either the then or else edge is well-predicted; and it now uses the existing l1-cache-line-size parameter instead of a new one (with updated commentary). Bootstraps and tests with no new regressions on powerpc64-unknown-linux-gnu. One last performance run is underway, but I don't expect any surprises since both changes are more conservative. The original benchmark issue is still resolved. Is this version ok for trunk? Thanks, Bill 2012-06-11 Bill Schmidt wschm...@linux.vnet.ibm.com * opts.c: Add -fhoist-adjacent-loads to -O2 and above. * tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward declaration. (hoist_adjacent_loads, gate_hoist_loads): New forward declarations. (tree_ssa_phiopt): Call gate_hoist_loads. (tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call. (tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call hoist_adjacent_loads. (local_mem_dependence): New function. (hoist_adjacent_loads): Likewise. (gate_hoist_loads): Likewise. * common.opt (fhoist-adjacent-loads): New switch. * Makefile.in (tree-ssa-phiopt.o): Added dependencies. Index: gcc/opts.c === --- gcc/opts.c (revision 188390) +++ gcc/opts.c (working copy) @@ -489,6 +489,7 @@ static const struct default_options default_option { OPT_LEVELS_2_PLUS, OPT_falign_functions, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 }, { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 }, +{ OPT_LEVELS_2_PLUS, OPT_fhoist_adjacent_loads, NULL, 1 }, /* -O3 optimizations. */ { OPT_LEVELS_3_PLUS, OPT_ftree_loop_distribute_patterns, NULL, 1 }, Index: gcc/tree-ssa-phiopt.c === --- gcc/tree-ssa-phiopt.c (revision 188390) +++ gcc/tree-ssa-phiopt.c (working copy) @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3. If not see #include cfgloop.h #include tree-data-ref.h #include tree-pretty-print.h +#include gimple-pretty-print.h +#include insn-config.h +#include expr.h +#include optabs.h +#ifndef HAVE_conditional_move +#define HAVE_conditional_move (0) +#endif + static unsigned int tree_ssa_phiopt (void); -static unsigned int tree_ssa_phiopt_worker (bool); +static unsigned int tree_ssa_phiopt_worker (bool, bool); static bool conditional_replacement (basic_block, basic_block, edge, edge, gimple, tree, tree); static int value_replacement (basic_block, basic_block, @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block); static struct pointer_set_t * get_non_trapping (void); static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree); +static void hoist_adjacent_loads (basic_block, basic_block, + basic_block, basic_block); +static bool gate_hoist_loads (void); /* This pass tries to replaces an if-then-else block with an assignment. We have four kinds of transformations. Some of these @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_ bb2: x = PHI x' (bb0), ...; - A similar transformation is done for MAX_EXPR. */ + A similar transformation is done for MAX_EXPR. + + This pass also performs a fifth transformation of a slightly different + flavor. + + Adjacent Load Hoisting + -- + + This transformation replaces + + bb0: + if (...) goto bb2; else goto bb1; + bb1: + x1 = (expr).field1; + goto bb3; + bb2: + x2 = (expr).field2; + bb3: + # x = PHI x1, x2; + + with + + bb0: + x1 = (expr).field1; + x2 = (expr).field2; + if (...) goto bb2; else goto bb1; + bb1: + goto bb3; + bb2: + bb3: + # x = PHI x1, x2; + + The purpose of this transformation is to enable generation of conditional + move instructions such as Intel CMOVE or PowerPC ISEL. Because one of + the loads is speculative, the transformation is restricted to very + specific cases to avoid introducing a page fault. We are looking for + the common idiom: + + if (...) + x = y-left; + else + x = y-right; + + where left and right are typically adjacent pointers in a tree structure. */ + static unsigned int tree_ssa_phiopt (void) { - return tree_ssa_phiopt_worker (false); + return tree_ssa_phiopt_worker (false, gate_hoist_loads ()); } /* This pass tries to transform conditional stores into unconditional @@ -190,7 +245,7 @@ tree_ssa_phiopt (void) static unsigned int tree_ssa_cs_elim (void) { - return tree_ssa_phiopt_worker (true); +
Re: [PATCH] don't presume undelegitimized UNSPEC_TLS SYMBOL_REF is a decl
On 2012-06-06 14:04, Roland McGrath wrote: 2012-06-06 Roland McGrath mcgra...@google.com * dwarf2out.c (const_ok_for_output_1): Detect a TLS UNSPEC using SYMBOL_REF_TLS_MODEL rather than DECL_THREAD_LOCAL_P, in case it's not a VAR_DECL. Also don't limit it to UNSPECs with exactly one operand. Applied. r~
Re: [PATCH] don't presume undelegitimized UNSPEC_TLS SYMBOL_REF is a decl
On Mon, Jun 11, 2012 at 1:42 PM, Richard Henderson r...@redhat.com wrote: Applied. Thanks!
Re: [PATCH] libgcc: If __GLIBC__ is defined, don't refer to pthread_cancel.
On 2012-06-08 16:23, Roland McGrath wrote: 2012-06-08 Roland McGrath mcgra...@google.com * gthr-posix.h [neither FreeBSD nor Solaris] (__gthread_active_p): If __GLIBC__ is defined, refer to __pthread_key_create instead of pthread_cancel. Applied. r~
Re: [RFC PR48941 / 51980] Rewrite arm_neon.h to use __builtin_shuffle
On 2012-06-11 14:12, Ramana Radhakrishnan wrote: I could generate the masks with appropriate casts to unsigned variants instead.That doesn't seem to make a difference either. No, that shouldn't make any difference. r~
Re: [PATCH] libgcc: If __GLIBC__ is defined, don't refer to pthread_cancel.
On Mon, Jun 11, 2012 at 2:16 PM, Richard Henderson r...@redhat.com wrote: Applied. Thanks!
Re: [RFA:] doc: TARGET_LEGITIMIZE_ADDRESS needs to be defined for native TLS
On 2012-05-15 17:24, Hans-Peter Nilsson wrote: gcc: * doc/tm.texi.in (Addressing Modes) TARGET_LEGITIMIZE_ADDRESS: Mention that this hook needs to be defined for native TLS. * doc/tm.texi: Regenerate. Ok. r~
Re: [PATCH] Fix ICE with asm goto and shrink wrapping (PR rtl-optimization/53589)
On 2012-06-06 23:53, Jakub Jelinek wrote: 2012-06-07 Jakub Jelinek ja...@redhat.com PR rtl-optimization/53589 * cfgrtl.c (force_nonfallthru_and_redirect): Do asm_goto_edge discovery even when e-dest != target. If any LABEL_REF points to e-dest label, redirect it to target's label. * gcc.dg/torture/pr53589.c: New test. Ok. r~
[SH] PR 50749 - Auto-inc-dec does not find subsequent contiguous mem accesses
Hello, I'd like to add the attached SH test cases to verify the auto-inc-dec pass operation on the SH target. Checked by invoking the individual test cases manually with make check-gcc on sh-sim. Some of the tests pass, some of them don't because of the auto-inc-dec issues mentioned in the PR. Cheers, Oleg testsuite/ChangeLog: PR target/50749 * gcc.target/sh/pr50749-sf-postinc-2.c: New. * gcc.target/sh/pr50749-sf-postinc-4.c: New. * gcc.target/sh/pr50749-qihisi-postinc-2.c: New. * gcc.target/sh/pr50749-qihisi-postinc-4.c: New. * gcc.target/sh/pr50749-sf-predec-2.c: New. * gcc.target/sh/pr50749-sf-predec-4.c: New. * gcc.target/sh/pr50749-qihisi-predec-1.c: New. * gcc.target/sh/pr50749-qihisi-predec-3.c: New. * gcc.target/sh/pr50749-sf-postinc-1.c: New. * gcc.target/sh/pr50749-sf-postinc-3.c: New. * gcc.target/sh/pr50749-qihisi-postinc-1.c: New. * gcc.target/sh/pr50749-qihisi-postinc-3.c: New. * gcc.target/sh/pr50749-sf-predec-1.c: New. * gcc.target/sh/pr50749-sf-predec-3.c: New. * gcc.target/sh/pr50749-qihisi-predec-2.c: New. * gcc.target/sh/pr50749-qihisi-predec-4.c: New. Index: gcc/testsuite/gcc.target/sh/pr50749-qihisi-predec-4.c === --- gcc/testsuite/gcc.target/sh/pr50749-qihisi-predec-4.c (revision 0) +++ gcc/testsuite/gcc.target/sh/pr50749-qihisi-predec-4.c (revision 0) @@ -0,0 +1,43 @@ +/* Verify that pre-decrement addressing is generated inside a loop. */ +/* { dg-do compile { target sh*-*-* } } */ +/* { dg-options -O2 } */ +/* { dg-final { scan-assembler-times mov.b\tr\[0-9]\+,@-r\[0-9]\+ 3 } } */ +/* { dg-final { scan-assembler-times mov.w\tr\[0-9]\+,@-r\[0-9]\+ 3 } } */ +/* { dg-final { scan-assembler-times mov.l\tr\[0-9]\+,@-r\[0-9]\+ 3 } } */ + +char* +test_func_00 (char* p, int c, int x) +{ + do + { +*--p = (char)x; +*--p = (char)x; +*--p = (char)x; + } while (--c); + return p; +} + +short* +test_func_01 (short* p, int c, int x) +{ + do + { +*--p = (short)x; +*--p = (short)x; +*--p = (short)x; + } while (--c); + return p; +} + +int* +test_func_02 (int* p, int c, int x) +{ + do + { +*--p = x; +*--p = x; +*--p = x; + } while (--c); + return p; +} + Index: gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-1.c === --- gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-1.c (revision 0) +++ gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-1.c (revision 0) @@ -0,0 +1,15 @@ +/* Verify that post-increment addressing is generated. */ +/* { dg-do compile { target sh*-*-* } } */ +/* { dg-options -O2 } */ +/* { dg-skip-if { sh*-*-* } { -m1 -m2* -m4al *nofpu -m4-340* -m4-400* -m4-500* -m5* } { } } */ +/* { dg-final { scan-assembler-times fmov.s\t@r\[0-9]\+\\+,fr\[0-9]\+ 1 } } */ + +float* +test_func_00 (float* p, float* x) +{ + float r = 0; + r += *p++; + *x = r; + return p; +} + Index: gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-3.c === --- gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-3.c (revision 0) +++ gcc/testsuite/gcc.target/sh/pr50749-sf-postinc-3.c (revision 0) @@ -0,0 +1,17 @@ +/* Verify that post-increment addressing is generated inside a loop. */ +/* { dg-do compile { target sh*-*-* } } */ +/* { dg-options -O2 } */ +/* { dg-skip-if { sh*-*-* } { -m1 -m2* -m4al *nofpu -m4-340* -m4-400* -m4-500* -m5* } { } } */ +/* { dg-final { scan-assembler-times fmov.s\t@r\[0-9]\+\\+,fr\[0-9]\+ 1 } } */ + +float +test_func_00 (float* p, int c) +{ + float r = 0; + do + { +r += *p++; + } while (--c); + return r; +} + Index: gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-1.c === --- gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-1.c (revision 0) +++ gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-1.c (revision 0) @@ -0,0 +1,34 @@ +/* Verify that post-increment addressing is generated. */ +/* { dg-do compile { target sh*-*-* } } */ +/* { dg-options -O2 } */ +/* { dg-final { scan-assembler-times mov.b\t@r\[0-9]\+\\+,r\[0-9]\+ 1 } } */ +/* { dg-final { scan-assembler-times mov.w\t@r\[0-9]\+\\+,r\[0-9]\+ 1 } } */ +/* { dg-final { scan-assembler-times mov.l\t@r\[0-9]\+\\+,r\[0-9]\+ 1 } } */ + +char* +test_func_00 (char* p, int* x) +{ + int r = 0; + r += *p++; + *x = r; + return p; +} + +short* +test_func_01 (short* p, int* x) +{ + int r = 0; + r += *p++; + *x = r; + return p; +} + +int* +test_func_02 (int* p, int* x) +{ + int r = 0; + r += *p++; + *x = r; + return p; +} + Index: gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-3.c === --- gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-3.c (revision 0) +++ gcc/testsuite/gcc.target/sh/pr50749-qihisi-postinc-3.c (revision 0)
Re: [RFC] Target-specific limits on vector alignment
From: Richard Earnshaw rearn...@arm.com Date: Mon, 11 Jun 2012 15:16:50 +0200 The ARM ABI states that vectors larger than 64 bits in size still have 64-bit alignment; never-the-less, the HW supports alignment hints of up to 128-bits in some cases and will trap in a vector has an alignment that less than the hint. GCC currently hard-codes larger vectors to be aligned by the size of the vector, which means that 128-bit vectors are marked as being 128-bit aligned. The ARM ABI unfortunately does not support generating such alignment for parameters passed by value and this can lead to traps at run time. It seems that the best way to solve this problem is to allow the back-end to set an upper limit on the alignment permitted for a vector. Don't you mean lowering the limit on the alignment required and guaranteed for a vector? (Or else it sounds like you want to force attribute-aligned to refuse higher values.) Wouldn't it be better to make the */*modes.def ADJUST_ALIGNMENT macro actually work? Does it work for you? The epiphany port tries to do it that way (optionally, not the default AFAICT), but I bet it's actually unsuccessful. I've implemented this as a separate hook, rather than using the existing hooks because there's a strong likelihood of breaking some existing ABIs if I did it another way. There are a couple of tests that will need some re-working before this can be committed to deal with the fall-out of making this change; I'll prepare those changes if this patch is deemed generally acceptable. It'd be nice to have non-naturally-aligned vectors working in general. I have a MIPS COP2 128-bit-SIMD implementation for 32-bit MIPS, which means 64-bit stack-alignment and memory loads and stores are in 64-bit parts. So, I've chosen to not require any larger alignment to keep ABI compatibility (vectors aren't passed in registers; core use is intra-function anyway). This mostly works, but there's pushback from gcc (at least in 4.3): trying to adjust vector-type-alignment by e.g. ADJUST_ALIGNMENT (V4SI, 8); in mips-modes.def has no effect; I had to do it in mips_builtin_vector_type, which might be brittle or just working by chance. Then there's the tree-vectorizer, which for some optimizations wants to assume natural alignment when doing floor alignment. I couldn't define a usable vec_realign_load_mode (undocumented) as it assumes that the low vector-size bits of an address will exactly correspond to misalignment when masking off and applying to a vector. And, you may have to conditionalize some vectorizer tests with the effective-target unaligned_stack. Just preparing you for the fall-out. :] brgds, H-P
Re: constant that doesn't fit in 32bits in alpha.c
On 2012-06-10 02:18, Jay K wrote: gcc-4.7.0/gcc/config/alpha/alpha.c word1 = expand_and (DImode, word1, GEN_INT (0x0ffffff0), NULL); That big constant isn't portable since it doesn't fit in 32bits. 1) append LL or 2) break it up into an expression, like ((HOST_WIDE_INT)0x0fff) 8) | 0x0fff0 or such. Addressed like so. I couldn't think of any nice way to define this such that it could be generic, so I left it local to this bit o code. r~ * lib/target-supports.exp (check_effective_target_sync_long_long_runtime): Use check_effective_target_lp64 instead of check_effective_target_powerpc64 for powerpc targets. diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 568f6b1..c937484 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -3796,7 +3796,7 @@ proc check_effective_target_sync_long_long_runtime { } { [check_effective_target_lp64] [check_effective_target_ultrasparc_hw]) } { return 1 -} elseif { [check_effective_target_powerpc64] } { +} elseif { [istarget powerpc*-*-*] [check_effective_target_lp64] } { return 1 } else { return 0
Re: constant that doesn't fit in 32bits in alpha.c
Bah. Wrong patch. r~ * config/alpha/alpha.c (alpha_trampoline_init): Split large constants. diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c index 6d15bf7..3dda9fb 100644 --- a/gcc/config/alpha/alpha.c +++ b/gcc/config/alpha/alpha.c @@ -5451,6 +5451,8 @@ alpha_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) chain_value = convert_memory_address (Pmode, chain_value); #endif +#define HWI_HEX2(X,Y) (((HOST_WIDE_INT)0x ## X ## u) | 0x ## Y ## u) + if (TARGET_ABI_OPEN_VMS) { const char *fnname; @@ -5468,7 +5470,8 @@ alpha_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) the function's procedure descriptor with certain fields zeroed IAW the VMS calling standard. This is stored in the first quadword. */ word1 = force_reg (DImode, gen_const_mem (DImode, fnaddr)); - word1 = expand_and (DImode, word1, GEN_INT (0x0ffffff0), NULL); + word1 = expand_and (DImode, word1, + GEN_INT (HWI_HEX2(0fff,fff0)), NULL); } else { @@ -5479,10 +5482,12 @@ alpha_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) nop We don't bother setting the HINT field of the jump; the nop is merely there for padding. */ - word1 = GEN_INT (0xa77b0010a43b0018); - word2 = GEN_INT (0x47ff041f6bfb); + word1 = GEN_INT (HWI_HEX2 (a77b0010,a43b0018)); + word2 = GEN_INT (HWI_HEX2 (47ff041f,6bfb)); } +#undef HWI_HEX2 + /* Store the first two words, as computed above. */ mem = adjust_address (m_tramp, DImode, 0); emit_move_insn (mem, word1);
Re: libgo patch committed: Update to Go 1.0.1 release
On May 4, 2012, at 8:01 AM, Ian Lance Taylor wrote: This patch updates libgo to the Go 1.0.1 release. This is a relatively small collection of bug fixes, with no API changes. So, it is an important part of the test case for libgo/go/mime/multipart/testdata/nested-mime to have ^Ms in the file? If not, could you strip them out? Thanks. If so, the testcase might need to marked as binary to keep svn and or git from stripping the ^Ms out.
RE: constant that doesn't fit in 32bits in alpha.c
Thank you. I like it. May I have another? book2:gcc jay$ grep -i epoch vms* vmsdbgout.c:/* Difference in seconds between the VMS Epoch and the Unix Epoch */ vmsdbgout.c:static const long long vms_epoch_offset = 3506716800ll; vmsdbgout.c:#define VMS_EPOCH_OFFSET 350671680 vmsdbgout.c: + VMS_EPOCH_OFFSET; :) - Jay Date: Mon, 11 Jun 2012 16:06:03 -0700 From: r...@redhat.com To: jay.kr...@cornell.edu CC: gcc-patches@gcc.gnu.org Subject: Re: constant that doesn't fit in 32bits in alpha.c Bah. Wrong patch. r~
Re: constant that doesn't fit in 32bits in alpha.c
On Jun 11, 2012, at 4:06 PM, Richard Henderson wrote: Bah. Wrong patch. r~ z.txt Hum, I'm trying to see how this patch works... I feel like there is something I'm missing, like a shift?
Re: [SH] PR 50749 - Auto-inc-dec does not find subsequent contiguous mem accesses
Oleg Endo oleg.e...@t-online.de wrote: Some of the tests pass, some of them don't because of the auto-inc-dec issues mentioned in the PR. I thought that the tests which are known to fail are marked with XFAIL, though I'm not sure the usual way for the failing test cases. Regards, kaz
Re: constant that doesn't fit in 32bits in alpha.c
On 2012-06-11 16:23, Mike Stump wrote: On Jun 11, 2012, at 4:06 PM, Richard Henderson wrote: Bah. Wrong patch. r~ z.txt Hum, I'm trying to see how this patch works... I feel like there is something I'm missing, like a shift? Double-bah. That's what I get for changing the patch at the last minute. r~ * config/alpha/alpha.c (HWI_HEX2): Add missing shift. diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c index 3dda9fb..2177288 100644 --- a/gcc/config/alpha/alpha.c +++ b/gcc/config/alpha/alpha.c @@ -5451,7 +5451,7 @@ alpha_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) chain_value = convert_memory_address (Pmode, chain_value); #endif -#define HWI_HEX2(X,Y) (((HOST_WIDE_INT)0x ## X ## u) | 0x ## Y ## u) +#define HWI_HEX2(X,Y) (((HOST_WIDE_INT)0x ## X ## u 32) | 0x ## Y ## u) if (TARGET_ABI_OPEN_VMS) {
Re: constant that doesn't fit in 32bits in alpha.c
On 2012-06-11 16:23, Jay K wrote: Thank you. I like it. May I have another? book2:gcc jay$ grep -i epoch vms* vmsdbgout.c:/* Difference in seconds between the VMS Epoch and the Unix Epoch */ vmsdbgout.c:static const long long vms_epoch_offset = 3506716800ll; vmsdbgout.c:#define VMS_EPOCH_OFFSET 350671680 vmsdbgout.c:+ VMS_EPOCH_OFFSET; In this case, long long is explicitly referenced in the surrounding code, so I feel the safest change is just to add LL here. I've cross-compiled to alpha-dec-vms just to make sure, but I can't actually test this. r~ * vmsdbgout.c (VMS_EPOCH_OFFSET): Add LL suffix. diff --git a/gcc/vmsdbgout.c b/gcc/vmsdbgout.c index 9689653..eedf1bd 100644 --- a/gcc/vmsdbgout.c +++ b/gcc/vmsdbgout.c @@ -1676,7 +1676,7 @@ to_vms_file_spec (char *filespec) } #else -#define VMS_EPOCH_OFFSET 350671680 +#define VMS_EPOCH_OFFSET 350671680LL #define VMS_GRANULARITY_FACTOR 1000 #endif
Re: [SH] PR 50749 - Auto-inc-dec does not find subsequent contiguous mem accesses
On Tue, 12 Jun 2012, Kaz Kojima wrote: Oleg Endo oleg.e...@t-online.de wrote: Some of the tests pass, some of them don't because of the auto-inc-dec issues mentioned in the PR. I thought that the tests which are known to fail are marked with XFAIL, Yes, with a clear reference to the PR at the xfail. The closest thing to a written rule for this seems to be http://gcc.gnu.org/wiki/HowToPrepareATestcase. brgds, H-P
Re: MIPS testsuite patch for --with-synci configurations
On 06/11/2012 12:30 PM, Steve Ellcey wrote: This patch is to fix MIPS failures that occcur when configuring GCC with the -with-synci option. In this case GCC will generate the synci instruction by default if on a MIPS architecture that supports it . When on an architecture that does not support synci, GCC will not generate it by default but it does generate a warning saying that it is not doing so and that warning is causing tests that explicitly specify a MIPS architecture that does not support synci but do not explicitly turn off synci to fail with an extra, unexpected warning. I initially looked at changing GCC to remove the warning but that did not look workable, see http://gcc.gnu.org/ml/gcc/2012-06/msg00100.html for more details. This patch addes the -mno-synci flag to MIPS tests that specify an architecture that does not support synci, thus getting rid of the warning message and making the tests pass. Tested with the mips-linux-gnu and mips-sde-elf targets both with and without --with-synci on the GCC configuration. OK to checkin? I wonder if it would make more sense to modify the testsuite driver to take care of this. It seems like the set of files with the -mno-synci annotation could easily become different than the set that requires it. David Daney Steve Ellcey sell...@mips.com 2012-06-11 Steve Ellceysell...@mips.com * gcc.target/mips/call-saved-1.c: Add -mno-synci flag. * gcc.target/mips/call-saved-2.c: Ditto. * gcc.target/mips/call-saved-3.c: Ditto. * gcc.target/mips/clear-cache-2.c: Ditto. * gcc.target/mips/ext-8.c: Ditto. * gcc.target/mips/extend-2.c: Ditto. * gcc.target/mips/fix-r4000-1.c: Ditto. * gcc.target/mips/fix-r4000-10.c: Ditto. * gcc.target/mips/fix-r4000-11.c: Ditto. * gcc.target/mips/fix-r4000-12.c: Ditto. * gcc.target/mips/fix-r4000-2.c: Ditto. * gcc.target/mips/fix-r4000-3.c: Ditto. * gcc.target/mips/fix-r4000-4.c: Ditto. * gcc.target/mips/fix-r4000-5.c: Ditto. * gcc.target/mips/fix-r4000-6.c: Ditto. * gcc.target/mips/fix-r4000-7.c: Ditto. * gcc.target/mips/fix-r4000-8.c: Ditto. * gcc.target/mips/fix-r4000-9.c: Ditto. * gcc.target/mips/fix-vr4130-1.c: Ditto. * gcc.target/mips/fix-vr4130-2.c: Ditto. * gcc.target/mips/fix-vr4130-3.c: Ditto. * gcc.target/mips/fix-vr4130-4.c: Ditto. * gcc.target/mips/fpr-moves-1.c: Ditto. * gcc.target/mips/fpr-moves-2.c: Ditto. * gcc.target/mips/loongson-muldiv-1.c: Ditto. * gcc.target/mips/loongson-muldiv-2.c: Ditto. * gcc.target/mips/loongson-shift-count-truncated-1.c: Ditto. * gcc.target/mips/loongson-simd.c: Ditto. * gcc.target/mips/loongson3a-muldiv-1.c: Ditto. * gcc.target/mips/loongson3a-muldiv-2.c: Ditto. * gcc.target/mips/madd-1.c: Ditto. * gcc.target/mips/madd-2.c: Ditto. * gcc.target/mips/madd-5.c: Ditto. * gcc.target/mips/madd-6.c: Ditto. * gcc.target/mips/madd-7.c: Ditto. * gcc.target/mips/madd-8.c: Ditto. * gcc.target/mips/maddu-1.c: Ditto. * gcc.target/mips/maddu-2.c: Ditto. * gcc.target/mips/msub-1.c: Ditto. * gcc.target/mips/msub-2.c: Ditto. * gcc.target/mips/msub-5.c: Ditto. * gcc.target/mips/msub-6.c: Ditto. * gcc.target/mips/msub-7.c: Ditto. * gcc.target/mips/msub-8.c: Ditto. * gcc.target/mips/msubu-1.c: Ditto. * gcc.target/mips/msubu-2.c: Ditto. * gcc.target/mips/nmadd-1.c: Ditto. * gcc.target/mips/nmadd-2.c: Ditto. * gcc.target/mips/nmadd-3.c: Ditto. * gcc.target/mips/no-smartmips-ror-1.c: Ditto. * gcc.target/mips/pr34831.c: Ditto. * gcc.target/mips/r10k-cache-barrier-10.c: Ditto. * gcc.target/mips/r3900-mult.c: Ditto. * gcc.target/mips/rsqrt-1.c: Ditto. * gcc.target/mips/rsqrt-2.c: Ditto. * gcc.target/mips/rsqrt-3.c: Ditto. * gcc.target/mips/rsqrt-4.c: Ditto. * gcc.target/mips/sb1-1.c: Ditto. * gcc.target/mips/vr-mult-1.c: Ditto. * gcc.target/mips/vr-mult-2.c: Ditto. [...]
Re: [SH] PR 50749 - Auto-inc-dec does not find subsequent contiguous mem accesses
Hans-Peter Nilsson h...@bitrange.com wrote: On Tue, 12 Jun 2012, Kaz Kojima wrote: Oleg Endo oleg.e...@t-online.de wrote: Some of the tests pass, some of them don't because of the auto-inc-dec issues mentioned in the PR. I thought that the tests which are known to fail are marked with XFAIL, Yes, with a clear reference to the PR at the xfail. The closest thing to a written rule for this seems to be http://gcc.gnu.org/wiki/HowToPrepareATestcase. Thanks for clarification! Oleg, the patch is OK with that change. Regards, kaz
Re: [PATCH 3/3] rs6000: Rewrite sync patterns for atomic; expand early.
On Sat, Jun 9, 2012 at 10:40 AM, Richard Henderson r...@twiddle.net wrote: Nope. I do see the obvious mistake in the atomic_load pattern though: The mode iterator should have been INT1 not INT. Did you want to commit the fix for the iterator? ... and for extra credit we ought to implement DImode atomic load/store via the FPU, like we do for i386. Yes, however the PPC embedded community used to complain about FP loads and stores generated by the movdi pattern in 32 bit mode. Why is this function using FPRs when it does not contain any floating point operations?. This is for a processor with an FPU, but the developer does not want extraneous FPR usage that will force a lazy FP save in the kernel. In other words, one can use FPRs if the function already explicitly uses FP operations in user code, but the compiler should not introduce it. This seems to require scanning each function for floating point usage before allowing FPRs for DImode load/store operations. I like your suggestion, but the PowerPC developer community does not uniformly appreciate that behavior. Thanks, David
[testsuite] gcc.dg/cpp: make message check lines unique in test summary
This test modifies dg-message test directives by adding comments that will be added to lines in the test summary to eliminate non-unique lines for checks of messages for the same line of source code in a test. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? Janis 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/cpp/include2a.c: Add comments to checks for multiple messages reported for one line of source code. * gcc.dg/cpp/pr30786.c: Likewise. * gcc.dg/cpp/pr28709.c: Likewise. * gcc.dg/cpp/missing-header-MD.c: Likewise. * gcc.dg/cpp/macspace2.c: Likewise. * gcc.dg/cpp/missing-header-1.c: Likewise. * gcc.dg/cpp/missing-header-MMD.c: Likewise. * gcc.dg/cpp/missing-sysheader-MD.c: Likewise. * gcc.dg/cpp/missing-sysheader-MMD.c: Likewise. Index: gcc.dg/cpp/include2a.c === --- gcc.dg/cpp/include2a.c (revision 188249) +++ gcc.dg/cpp/include2a.c (working copy) @@ -11,6 +11,6 @@ /* These error is No such file or directory, just once. However, this message is locale-dependent, so don't test for it. */ -/* { dg-error silly { target *-*-* } 0 } */ -/* { dg-error missing { target *-*-* } 0 } */ -/* { dg-message terminated { target *-*-* } 0 } */ +/* { dg-error silly silly { target *-*-* } 0 } */ +/* { dg-error missing missing { target *-*-* } 0 } */ +/* { dg-message terminated terminated { target *-*-* } 0 } */ Index: gcc.dg/cpp/pr30786.c === --- gcc.dg/cpp/pr30786.c(revision 188249) +++ gcc.dg/cpp/pr30786.c(working copy) @@ -1,8 +1,8 @@ /* PR preprocessor/30786 - _Pragma at end of file should not ICE */ /* { dg-do compile } */ -/* { dg-error parenthesized { target *-*-* } 9 } */ -/* { dg-error expected { target *-*-* } 9 } */ +/* { dg-error parenthesized parenthesized { target *-*-* } 9 } */ +/* { dg-error expected expected { target *-*-* } 9 } */ int x; Index: gcc.dg/cpp/pr28709.c === --- gcc.dg/cpp/pr28709.c(revision 188249) +++ gcc.dg/cpp/pr28709.c(working copy) @@ -6,5 +6,5 @@ #define foo - ## foo; -/* { dg-error expected identifier.*'-' { target *-*-* } 8 } */ -/* { dg-error pasting { target *-*-* } 8 } */ +/* { dg-error expected identifier.*'-' expected { target *-*-* } 8 } */ +/* { dg-error pasting pasting { target *-*-* } 8 } */ Index: gcc.dg/cpp/missing-header-MD.c === --- gcc.dg/cpp/missing-header-MD.c (revision 188249) +++ gcc.dg/cpp/missing-header-MD.c (working copy) @@ -3,8 +3,8 @@ /* { dg-options -MD } */ #include nonexistent.h -/* { dg-message nonexistent.h { target *-*-* } 0 } */ -/* { dg-message terminated { target *-*-* } 0 } */ +/* { dg-message nonexistent.h nonexistent.h { target *-*-* } 0 } */ +/* { dg-message terminated terminated { target *-*-* } 0 } */ /* This declaration should not receive any diagnostic. */ foo bar; Index: gcc.dg/cpp/macspace2.c === --- gcc.dg/cpp/macspace2.c (revision 188249) +++ gcc.dg/cpp/macspace2.c (working copy) @@ -59,5 +59,5 @@ #define agabc/* { dg-error requires whitespace } */ int dummy; -/* { dg-error missing terminating { target *-*-* } 6 } */ -/* { dg-error missing terminating { target *-*-* } 10 } */ +/* { dg-error missing terminating missing-terminating { target *-*-* } 6 } */ +/* { dg-error missing terminating missing-terminating { target *-*-* } 10 } */ Index: gcc.dg/cpp/missing-header-1.c === --- gcc.dg/cpp/missing-header-1.c (revision 188249) +++ gcc.dg/cpp/missing-header-1.c (working copy) @@ -3,8 +3,8 @@ /* { dg-options } */ #include nonexistent.h -/* { dg-message nonexistent.h { target *-*-* } 0 } */ -/* { dg-message terminated { target *-*-* } 0 } */ +/* { dg-message nonexistent.h nonexistent.h { target *-*-* } 0 } */ +/* { dg-message terminated terminated { target *-*-* } 0 } */ /* This declaration should not receive any diagnostic. */ foo bar; Index: gcc.dg/cpp/missing-header-MMD.c === --- gcc.dg/cpp/missing-header-MMD.c (revision 188249) +++ gcc.dg/cpp/missing-header-MMD.c (working copy) @@ -3,8 +3,8 @@ /* { dg-options -MMD } */ #include nonexistent.h -/* { dg-message nonexistent.h { target *-*-* } 0 } */ -/* { dg-message terminated { target *-*-* } 0 } */ +/* { dg-message nonexistent.h nonexistent.h { target *-*-* } 0 } */ +/* { dg-message terminated terminated { target *-*-* } 0 } */ /* This declaration should not receive any diagnostic. */ foo bar; Index: gcc.dg/cpp/missing-sysheader-MD.c
[testsuite] c-c++-common: make message check lines unique in test summary
This test modifies dg-error and dg-warning test directives by adding comments that will be added to lines in the test summary to eliminate non-unique lines for checks of messages for the same line of source code in a test. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? Janis 2012-06-11 Janis Johnson jani...@codesourcery.com * c-c++-common/raw-string-3.c: Add comments to checks for multiple messages reported for for one line of source code. * c-c++-common/raw-string-5.c: Likewise. * c-c++-common/raw-string-4.c: Likewise. * c-c++-common/raw-string-6.c: Likewise. * c-c++-common/pr2.c: Likewise. Index: c-c++-common/raw-string-3.c === --- c-c++-common/raw-string-3.c (revision 188249) +++ c-c++-common/raw-string-3.c (working copy) @@ -4,27 +4,27 @@ // { dg-options { target c } } // { dg-options -std=c++98 { target c++ } } -const void *s0 = R(a); // { dg-error was not declared|undeclared } -// { dg-error expected ',' or ';' { target c } 7 } -const void *s1 = uR(a); // { dg-error was not declared|undeclared } -// { dg-error expected ',' or ';' { target c } 9 } -const void *s2 = UR(a); // { dg-error was not declared|undeclared } -// { dg-error expected ',' or ';' { target c } 11 } -const void *s3 = u8R(a); // { dg-error was not declared|undeclared } -// { dg-error expected ',' or ';' { target c } 13 } -const void *s4 = LR(a); // { dg-error was not declared|undeclared } -// { dg-error expected ',' or ';' { target c } 15 } +const void *s0 = R(a); // { dg-error was not declared|undeclared undeclared } +// { dg-error expected ',' or ';' expected { target c } 7 } +const void *s1 = uR(a); // { dg-error was not declared|undeclared undeclared } +// { dg-error expected ',' or ';' expected { target c } 9 } +const void *s2 = UR(a); // { dg-error was not declared|undeclared undeclared } +// { dg-error expected ',' or ';' expected { target c } 11 } +const void *s3 = u8R(a); // { dg-error was not declared|undeclared undeclared } +// { dg-error expected ',' or ';' expected { target c } 13 } +const void *s4 = LR(a); // { dg-error was not declared|undeclared undeclared } +// { dg-error expected ',' or ';' expected { target c } 15 } -const int i0 = R'a'; // { dg-error was not declared { target c++ } } -// { dg-error expected ',' or ';' { target c } 18 } -const int i1 = uR'a';// { dg-error was not declared { target c++ } } -// { dg-error expected ',' or ';' { target c } 20 } -const int i2 = UR'a';// { dg-error was not declared { target c++ } } -// { dg-error expected ',' or ';' { target c } 22 } -const int i3 = u8R'a'; // { dg-error was not declared { target c++ } } -// { dg-error expected ',' or ';' { target c } 24 } -const int i4 = LR'a';// { dg-error was not declared { target c++ } } -// { dg-error expected ',' or ';' { target c } 26 } +const int i0 = R'a'; // { dg-error was not declared undeclared { target c++ } } +// { dg-error expected ',' or ';' expected { target c } 18 } +const int i1 = uR'a';// { dg-error was not declared undeclared { target c++ } } +// { dg-error expected ',' or ';' expected { target c } 20 } +const int i2 = UR'a';// { dg-error was not declared undeclared { target c++ } } +// { dg-error expected ',' or ';' expected { target c } 22 } +const int i3 = u8R'a'; // { dg-error was not declared undeclared { target c++ } } +// { dg-error expected ',' or ';' expected { target c } 24 } +const int i4 = LR'a';// { dg-error was not declared undeclared { target c++ } } +// { dg-error expected ',' or ';' expected { target c } 26 } #define R a #define uR b Index: c-c++-common/raw-string-5.c === --- c-c++-common/raw-string-5.c (revision 188249) +++ c-c++-common/raw-string-5.c (working copy) @@ -3,25 +3,25 @@ // { dg-options -std=c++0x { target c++ } } const void *s0 = R0123456789abcdefg()0123456789abcdefg; - // { dg-error raw string delimiter longer { target *-*-* } 5 } - // { dg-error stray { target *-*-* } 5 } + // { dg-error raw string delimiter longer longer { target *-*-* } 5 } + // { dg-error stray stray { target *-*-* } 5 } const void *s1 = R () ; - // { dg-error invalid character { target *-*-* } 8 } -
[testsuite] gcc.dg: make message check lines unique in test summary
This test modifies dg-error, dg-warning, dg-message, and dg-bogus test directives by adding comments that will be added to lines in the test summary to eliminate non-unique lines for checks of messages for the same line of source code in a test. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? Janis 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.dg/20031223-1.c: Add comments to check for multiple messages reported for one line of source code. * gcc.dg/Wconversion-integer.c: Likewise. * gcc.dg/Wfatal-2.c: Likewise. * gcc.dg/Wfatal.c: Likewise. * gcc.dg/Wobjsize-1.c: Likewise. * gcc.dg/c99-vla-jump-1.c: Likewise. * gcc.dg/c99-vla-jump-2.c: Likewise. * gcc.dg/c99-vla-jump-3.c: Likewise. * gcc.dg/c99-vla-jump-4.c: Likewise. * gcc.dg/c99-vla-jump-5.c: Likewise. * gcc.dg/decl-9.c: Likewise. * gcc.dg/declspec-10.c: Likewise. * gcc.dg/declspec-18.c: Likewise. * gcc.dg/mtune.c: Likewise. * gcc.dg/parser-pr28152-2.c: Likewise. * gcc.dg/parser-pr28152.c: Likewise. * gcc.dg/pr14475.c: Likewise. * gcc.dg/pr27953.c: Likewise. * gcc.dg/pr28322-3.c: Likewise. * gcc.dg/pr30457.c: Likewise. * gcc.dg/pr30551-2.c: Likewise. * gcc.dg/pr30551-3.c: Likewise. * gcc.dg/pr30551-4.c: Likewise. * gcc.dg/pr30551-5.c: Likewise. * gcc.dg/pr30551-6.c: Likewise. * gcc.dg/pr30551.c: Likewise. * gcc.dg/pr45461.c: Likewise. * gcc.dg/pr48552-1.c: Likewise. * gcc.dg/pr48552-2.c: Likewise. * gcc.dg/redecl-1.c: Likewise. * gcc.dg/transparent-union-3.c: Likewise. * gcc.dg/utf-dflt.c: Likewise. * gcc.dg/utf-dflt2.c: Likewise. * gcc.dg/vla-8.c: Likewise. * gcc.dg/vla-init-1.c: Likewise. * gcc.dg/wtr-int-type-1.c: Likewise. Index: gcc.dg/20031223-1.c === --- gcc.dg/20031223-1.c (revision 188393) +++ gcc.dg/20031223-1.c (working copy) @@ -7,7 +7,7 @@ void f () { - l: int; /* { dg-error a label can only be part of a statement and a declaration is not a statement } */ - /* { dg-warning useless type name in empty declaration { target *-*-* } 10 } */ - /* { dg-error label at end of compound statement { target *-*-* } 10 } */ + l: int; /* { dg-error a label can only be part of a statement and a declaration is not a statement not stmt } */ + /* { dg-warning useless type name in empty declaration type name { target *-*-* } 10 } */ + /* { dg-error label at end of compound statement label { target *-*-* } 10 } */ } Index: gcc.dg/Wconversion-integer.c === --- gcc.dg/Wconversion-integer.c(revision 188393) +++ gcc.dg/Wconversion-integer.c(working copy) @@ -40,10 +40,10 @@ fuc ('A'); uc = 'A'; - uc = x ? 1U : -1; /* { dg-warning conversion } */ - /* { dg-warning negative integer implicitly converted to unsigned type { target *-*-* } 43 } */ - uc = x ? SCHAR_MIN : 1U; /* { dg-warning conversion } */ - /* { dg-warning negative integer implicitly converted to unsigned type { target *-*-* } 45 } */ + uc = x ? 1U : -1; /* { dg-warning conversion conversion } */ + /* { dg-warning negative integer implicitly converted to unsigned type implicit { target *-*-* } 43 } */ + uc = x ? SCHAR_MIN : 1U; /* { dg-warning conversion conversion } */ + /* { dg-warning negative integer implicitly converted to unsigned type implicit { target *-*-* } 45 } */ uc = x ? 1 : -1; /* { dg-warning negative integer implicitly converted to unsigned type } */ uc = x ? SCHAR_MIN : 1; /* { dg-warning negative integer implicitly converted to unsigned type } */ ui = x ? 1U : -1; /* { dg-warning negative integer implicitly converted to unsigned type } */ Index: gcc.dg/Wfatal-2.c === --- gcc.dg/Wfatal-2.c (revision 188393) +++ gcc.dg/Wfatal-2.c (working copy) @@ -5,5 +5,5 @@ int i = INT_MAX + 1; /* { dg-error integer overflow in expression } */ int k = 1 / 0; int j = INT_MIN - 1; -/* { dg-message being treated as errors { target *-*-* } 0 } */ -/* { dg-message terminated due to -Wfatal-errors { target *-*-* } 0 } */ +/* { dg-message being treated as errors treated as errors { target *-*-* } 0 } */ +/* { dg-message terminated due to -Wfatal-errors terminated { target *-*-* } 0 } */ Index: gcc.dg/Wfatal.c === --- gcc.dg/Wfatal.c (revision 188393) +++ gcc.dg/Wfatal.c (working copy) @@ -5,8 +5,8 @@ int i = INT_MAX + 1; /* { dg-warning integer overflow in expression } */ int k = 1 / 0; /* { dg-error division by zero } */ int j = INT_MIN - 1; -/* { dg-message some warnings being treated as errors {target *-*-*} 0 } */ -/* { dg-message terminated
[testsuite] gcc.c-torture/compile: make msg check lines unique in test summary
This test modifies dg-message test directives by adding comments that will be added to lines in the test summary to eliminate non-unique lines for checks of messages for the same line of source code in a test. Tested on i686-pc-linux-gnu and arm-none-eabi. OK for mainline? Janis 2012-06-11 Janis Johnson jani...@codesourcery.com * gcc.c-torture/compile/sync-1.c: Add comments to checks for multiple messages reported for one line of source code. Index: gcc.c-torture/compile/sync-1.c === --- gcc.c-torture/compile/sync-1.c (revision 188249) +++ gcc.c-torture/compile/sync-1.c (working copy) @@ -1,5 +1,5 @@ -/* { dg-message note: '__sync_fetch_and_nand' changed semantics in GCC 4.4 { target *-*-* } 0 } */ -/* { dg-message note: '__sync_nand_and_fetch' changed semantics in GCC 4.4 { target *-*-* } 0 } */ +/* { dg-message note: '__sync_fetch_and_nand' changed semantics in GCC 4.4 fetch_and_nand { target *-*-* } 0 } */ +/* { dg-message note: '__sync_nand_and_fetch' changed semantics in GCC 4.4 nand_and_fetch { target *-*-* } 0 } */ /* { dg-options -ffat-lto-objects } */ /* Validate that each of the __sync builtins compiles. This won't