Re: [PATCH][AArch64] Change aarch64 vector cost to match vectorizer

2015-08-04 Thread James Greenhalgh
On Tue, Aug 04, 2015 at 11:06:11AM +0100, Pawel Kupidura wrote:
 Hi,
 
 I'm sorry about the issues with formatting; it should be fixed now.
 Here's a corrected version with a diff against current trunk.

Hi Pawel,

I'm still having trouble getting this patch to apply; I'm not sure whether
it is the format=flowed in your mail headers, the quoted-printable
encoding, or something else. Certainly when I open your emails I see:

if (where == vect_body && stmt_info && stmt_in_inner_loop_p
(stmt_info))

The content of the patch is OK to commit, but it would be good to
have a copy on list that can be easily applied.

Thanks,
James

 diff --git a/gcc/ChangeLog b/gcc/ChangeLog
 index fdc4a7e..d1c6663 100644
 --- a/gcc/ChangeLog
 +++ b/gcc/ChangeLog
 @@ -1,3 +1,7 @@
 +2015-08-04  Pawel Kupidura  <pawel.kupid...@arm.com>
 + * config/aarch64/aarch64.c: Change inner loop statement cost
 + to be consistent with other targets.
 +
   2015-08-03  Abe Skolnik  <a.skol...@samsung.com>
 
   * tree-if-conv.c: Fix various typos in comments.
 diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
 index 2b1ae36..173a385 100644
 --- a/gcc/config/aarch64/aarch64.c
 +++ b/gcc/config/aarch64/aarch64.c
 @@ -7086,15 +7086,9 @@ aarch64_add_stmt_cost (void *data, int count, 
 enum vect_cost_for_stmt kind,
 
 /* Statements in an inner loop relative to the loop being
vectorized are weighted more heavily.  The value here is
 -  a function (linear for now) of the loop nest level.  */
 +  arbitrary and could potentially be improved with analysis.  */
 if (where == vect_body && stmt_info && stmt_in_inner_loop_p
 (stmt_info))
 - {
 -   loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
 -   struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
 -   unsigned nest_level = loop_depth (loop);
 -
 -   count *= nest_level;
 - }
 + count *= 50; /*  FIXME  */
 
 retval = (unsigned) (count * stmt_cost);
 cost[where] += retval;
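
To make concrete what the hook is weighting (an illustrative source-level
sketch, not the vectorizer's internal representation): when the outer loop
of a nest is vectorized, each inner-loop statement executes many times per
outer iteration, so its count is scaled; previously by nest level, now by
a flat 50.

/* Outer-loop vectorization candidate: the inner-loop statement
   "s += b[i * m + j]" runs m times per outer iteration, so
   aarch64_add_stmt_cost multiplies its count.  */
void
reduce_rows (float *a, const float *b, int n, int m)
{
  for (int i = 0; i < n; i++)		/* loop being vectorized */
    {
      float s = 0.0f;
      for (int j = 0; j < m; j++)	/* inner loop: weighted more heavily */
	s += b[i * m + j];
      a[i] = s;
    }
}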


Re: [PATCH 9/15][AArch64] vld{2,3,4}{,_lane,_dup}, vcombine, vcreate

2015-08-04 Thread Alan Lawrence

James Greenhalgh wrote:

On Tue, Jul 28, 2015 at 12:25:55PM +0100, Alan Lawrence wrote:

gcc/ChangeLog:

* config/aarch64/aarch64.c (aarch64_split_simd_combine): Add V4HFmode.
* config/aarch64/aarch64-builtins.c (VAR13, VAR14): New.
(aarch64_scalar_builtin_types, aarch64_init_simd_builtin_scalar_types):
Add __builtin_aarch64_simd_hf.
* config/aarch64/arm_neon.h (float16x4x2_t, float16x8x2_t,
float16x4x3_t, float16x8x3_t, float16x4x4_t, float16x8x4_t,
vcombine_f16, vst2_lane_f16, vst2q_lane_f16, vst3_lane_f16,
vst3q_lane_f16, vst4_lane_f16, vst4q_lane_f16, vld2_f16, vld2q_f16,
vld3_f16, vld3q_f16, vld4_f16, vld4q_f16, vld2_dup_f16, vld2q_dup_f16,
vld3_dup_f16, vld3q_dup_f16, vld4_dup_f16, vld4q_dup_f16,
vld2_lane_f16, vld2q_lane_f16, vld3_lane_f16, vld3q_lane_f16,
vld4_lane_f16, vld4q_lane_f16, vst2_f16, vst2q_f16, vst3_f16,
vst3q_f16, vst4_f16, vst4q_f16, vcreate_f16): New.

* config/aarch64/iterators.md (VALLDIF, Vtype, Vetype, Vbtype,
V_cmp_result, v_cmp_result): Add cases for V4HF and V8HF.
(VDC, Vdbl): Add V4HF.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vldN_1.c: Add float16x4_t and float16x8_t cases.
* gcc.target/aarch64/vldN_dup_1.c: Likewise.
* gcc.target/aarch64/vldN_lane_1.c: Likewise.


Hi Alan,

The arm_neon.h portion of this patch does not apply after Charles' recent
changes. Could you please rebase and resubmit the patch for review?

Thanks,
James


Ah, indeed, thanks. Here's a rebased version, using Charles' new versions of 
__(LD|ST)[234]_LANE_FUNC. I'll follow with a patch adding corresponding 
lane_f16_indices tests in a separate email.


(Changelog as before)

Bootstrapped + check-gcc on aarch64-none-linux-gnu.
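
For reference, a small usage sketch of the structured float16 loads this
patch introduces (hypothetical example; it assumes a toolchain with this
series applied, since float16x4x2_t and vld2_f16 are new here):

#include <arm_neon.h>

/* De-interleave eight half-precision values into two four-lane halves
   using the new vld2_f16/vst1_f16 intrinsics.  */
void
split_pairs_f16 (float16_t *in, float16_t *even, float16_t *odd)
{
  float16x4x2_t pair = vld2_f16 (in);	/* even lanes -> val[0], odd -> val[1] */
  vst1_f16 (even, pair.val[0]);
  vst1_f16 (odd, pair.val[1]);
}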
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 800f6e1ffcd358aa22ceecbc460bc1dcac4acd9e..2394efdb483e1128d2990852871ab4abfed8bdfc 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -61,6 +61,7 @@
 
 #define v8qi_UP  V8QImode
 #define v4hi_UP  V4HImode
+#define v4hf_UP  V4HFmode
 #define v2si_UP  V2SImode
 #define v2sf_UP  V2SFmode
 #define v1df_UP  V1DFmode
@@ -68,6 +69,7 @@
 #define df_UP    DFmode
 #define v16qi_UP V16QImode
 #define v8hi_UP  V8HImode
+#define v8hf_UP  V8HFmode
 #define v4si_UP  V4SImode
 #define v4sf_UP  V4SFmode
 #define v2di_UP  V2DImode
@@ -520,6 +522,8 @@ aarch64_simd_builtin_std_type (enum machine_mode mode,
   return aarch64_simd_intCI_type_node;
 case XImode:
   return aarch64_simd_intXI_type_node;
+case HFmode:
+  return aarch64_fp16_type_node;
 case SFmode:
   return float_type_node;
 case DFmode:
@@ -604,6 +608,8 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Poly64x2_t].eltype = aarch64_simd_types[Poly64_t].itype;
 
   /* Continue with standard types.  */
+  aarch64_simd_types[Float16x4_t].eltype = aarch64_fp16_type_node;
+  aarch64_simd_types[Float16x8_t].eltype = aarch64_fp16_type_node;
   aarch64_simd_types[Float32x2_t].eltype = float_type_node;
   aarch64_simd_types[Float32x4_t].eltype = float_type_node;
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index bb54e56ce63c040dbfe69e2249e642d2c43fd0af..ea219b72ff9ac406c2439cda002617e710b2966c 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -44,6 +44,8 @@
   ENTRY (Poly16x8_t, V8HI, poly, 12)
   ENTRY (Poly64x1_t, DI, poly, 12)
   ENTRY (Poly64x2_t, V2DI, poly, 12)
+  ENTRY (Float16x4_t, V4HF, none, 13)
+  ENTRY (Float16x8_t, V8HF, none, 13)
   ENTRY (Float32x2_t, V2SF, none, 13)
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d0f298a1f075f51d4d47c6f364860dd1d0a545e0..39ff34e16d8bb79bcd44a4f40d214963996968af 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -367,11 +367,11 @@
   VAR1 (UNOP, float_extend_lo_, 0, v2df)
   VAR1 (UNOP, float_truncate_lo_, 0, v2sf)
 
-  /* Implemented by aarch64_ld1<VALL:mode>.  */
-  BUILTIN_VALL (LOAD1, ld1, 0)
+  /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
+  BUILTIN_VALL_F16 (LOAD1, ld1, 0)
 
-  /* Implemented by aarch64_st1<VALL:mode>.  */
-  BUILTIN_VALL (STORE1, st1, 0)
+  /* Implemented by aarch64_st1<VALL_F16:mode>.  */
+  BUILTIN_VALL_F16 (STORE1, st1, 0)
 
   /* Implemented by fma<mode>4.  */
   BUILTIN_VDQF (TERNOP, fma, 4)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 97774181fab11b846d40c3981e2d1f9ea4891337..cab712d7d18dc8a9bebf2b25608b5b4490a07b45 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ 

Re: revised and updated new-if-converter patch… [PATCH] fix PR46029: reimplement if conversion of loads and stores

2015-08-04 Thread Richard Biener
On Tue, Aug 4, 2015 at 7:05 AM, Jeff Law l...@redhat.com wrote:
 On 07/17/2015 01:57 PM, Abe wrote:

 Dear all,

 Relative to the previous submission of this same patch, the below
 corrects some minor spacing and/or indentation issues, applies
 misc. other formatting fixes, and makes the disabled vectorization tests
 be disabled via xfail rather than by adding spaces to
 deliberately cause the relevant scanned-for text to not be found by
 DejaGNU, so as to prevent the DejaGNU line from being interpreted.

 The below is also based on a Git checkout that was rebased to the latest
 upstream check-in from today,
 so it should merge cleanly with trunk as of today.

 Regards,

 Abe








 2015-06-12  Sebastian Pop  <s@samsung.com>
  Abe Skolnik  <a.skol...@samsung.com>

  PR tree-optimization/46029
  * tree-data-ref.c (struct data_ref_loc_d): Moved...
  (get_references_in_stmt): Exported.
  * tree-data-ref.h (struct data_ref_loc_d): ... here.
  (get_references_in_stmt): Declared.

  * tree-if-conv.c (struct ifc_dr): Removed.
  (IFC_DR): Removed.
  (DR_WRITTEN_AT_LEAST_ONCE): Removed.
  (DR_RW_UNCONDITIONALLY): Removed.
  (memrefs_read_or_written_unconditionally): Removed.
  (write_memrefs_written_at_least_once): Removed.
  (ifcvt_could_trap_p): Does not take refs parameter anymore.
  (ifcvt_memrefs_wont_trap): Removed.
  (has_non_addressable_refs): New.
  (if_convertible_gimple_assign_stmt_p): Call has_non_addressable_refs.
  Removed use of refs.
  (if_convertible_stmt_p): Removed use of refs.
  (if_convertible_gimple_assign_stmt_p): Same.
  (if_convertible_loop_p_1): Removed use of refs.  Remove initialization
  of dr->aux, DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY.
  (insert_address_of): New.
  (create_scratchpad): New.
  (create_indirect_cond_expr): New.
  (predicate_mem_writes): Call create_indirect_cond_expr.  Take an extra
  parameter for scratch_pad.
  (combine_blocks): Same.
  (tree_if_conversion): Same.

  testsuite/
  * g++.dg/tree-ssa/ifc-pr46029.C: New.
  * gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel.
  * gcc.dg/tree-ssa/ifc-8.c: New.
  * gcc.dg/tree-ssa/ifc-9.c: New.
  * gcc.dg/tree-ssa/ifc-10.c: New.
  * gcc.dg/tree-ssa/ifc-11.c: New.
  * gcc.dg/tree-ssa/ifc-12.c: New.
  * gcc.dg/vect/if-cvt-stores-vect-ifcvt-18.c: Disabled.
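
In source-level terms (names here are illustrative; the pass itself works
on GIMPLE), the scratchpad scheme introduced by this series rewrites a
conditional store into an unconditional store through a selected address:

/* Before if-conversion: a conditional store.  */
void
before (float *a, const int *c, int n)
{
  for (int i = 0; i < n; i++)
    if (c[i])
      a[i] = 2.0f;
}

/* After (conceptually): every iteration stores, but non-selected
   iterations write to a scratchpad, making the loop body branchless
   and therefore vectorizable.  */
void
after (float *a, const int *c, int n)
{
  float scratch;				/* create_scratchpad */
  for (int i = 0; i < n; i++)
    {
      float *p = c[i] ? &a[i] : &scratch;	/* create_indirect_cond_expr */
      *p = 2.0f;
    }
}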



 diff --git a/gcc/common.opt b/gcc/common.opt
 index 6b2ccbc..49f6b9f 100644
 --- a/gcc/common.opt
 +++ b/gcc/common.opt
 @@ -1413,7 +1413,7 @@ Common Report Var(flag_tree_loop_if_convert)
 Init(-1) Optimization
   Convert conditional jumps in innermost loops to branchless equivalents

   ftree-loop-if-convert-stores
 -Common Report Var(flag_tree_loop_if_convert_stores) Optimization
 +Common Report Var(flag_tree_loop_if_convert_stores) Init(-1) Optimization
   Also if-convert conditional jumps containing memory writes

   ; -finhibit-size-directive inhibits output of .size for ELF.

 I don't see this change mentioned anywhere in the ChangeLog.  That seems to
 be a relatively common situation.  I called out some of those issues, but
 didn't try to catch them all.  Please make sure all changes are reflected in
 the ChangeLog.




 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-10.c
 b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-10.c
 index f392fbe..775fcd5 100644
 --- a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-10.c
 +++ b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-10.c

 This change isn't mentioned in the ChangeLog.


  diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c

 b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
 index 875d2d3..fc69ca2 100644
 --- a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
 +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c
 @@ -1,5 +1,5 @@
   /* { dg-do compile } */
 -/* { dg-options "-c -O2 -ftree-vectorize -fdump-tree-ifcvt-stats" { target *-*-* } } */
 +/* { dg-options "-c -O2 -ftree-vectorize -ftree-loop-if-convert-stores -fdump-tree-ifcvt-stats" { target *-*-* } } */

 ISTM this really should be two tests, one with this code as-is, another that
 exactly matches the ffmpeg kernel.




 diff --git a/gcc/testsuite/gcc.dg/vect/vect-mask-loadstore-1.c
 b/gcc/testsuite/gcc.dg/vect/vect-mask-loadstore-1.c
 index 11e9533..cbd3378 100644
 --- a/gcc/testsuite/gcc.dg/vect/vect-mask-loadstore-1.c
 +++ b/gcc/testsuite/gcc.dg/vect/vect-mask-loadstore-1.c

 I don't see this mentioned in the ChangeLog.  It also doesn't look like you
 actually disabled the test.  Obviously this will need to be addressed before
 your patch could go in.

 diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
 b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
 index 180b490..aedc66a 100644
 --- a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
 +++ b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c

 Not mentioned in the ChangeLog.   xfail needs to be fixed.

 Similarly for the others where you added xfails.


 diff --git 

Use gcc/coretypes.h:enum offload_abi in mkoffloads (was: [PATCH 1/4] Add mkoffload for Intel MIC)

2015-08-04 Thread Thomas Schwinge
Hi!

On Thu, 8 Jan 2015 07:02:19 -0800, H.J. Lu hjl.to...@gmail.com wrote:
 On Thu, Jan 8, 2015 at 6:59 AM, Thomas Schwinge tho...@codesourcery.com 
 wrote:
  On Mon, 22 Dec 2014 12:28:20 +0100, Jakub Jelinek ja...@redhat.com wrote:
  On Mon, Dec 22, 2014 at 12:25:32PM +0100, Thomas Schwinge wrote:
   On Wed, 22 Oct 2014 22:57:01 +0400, Ilya Verbin iver...@gmail.com 
   wrote:
--- /dev/null
+++ b/gcc/config/i386/intelmic-mkoffload.c
@@ -0,0 +1,541 @@
+/* Offload image generation tool for Intel MIC devices.
  
+/* Shows if we should compile binaries for i386 instead of x86-64.  */
+bool target_ilp32 = false;

Once the following refactoring to use gcc/coretypes.h:enum offload_abi in
mkoffloads gets approved...

 Should we also handle x32?

..., that should be easier to do.  OK for trunk, once testing
succeeds?

commit de4d7cbcf979edc095a48dff5b38d12846bdab6f
Author: Thomas Schwinge tho...@codesourcery.com
Date:   Tue Aug 4 13:12:36 2015 +0200

Use gcc/coretypes.h:enum offload_abi in mkoffloads
---
 gcc/config/i386/intelmic-mkoffload.c |   90 +++---
 gcc/config/nvptx/mkoffload.c |   56 +++--
 2 files changed, 101 insertions(+), 45 deletions(-)

diff --git gcc/config/i386/intelmic-mkoffload.c 
gcc/config/i386/intelmic-mkoffload.c
index ca15868..ffa6d01 100644
--- gcc/config/i386/intelmic-mkoffload.c
+++ gcc/config/i386/intelmic-mkoffload.c
@@ -42,8 +42,7 @@ int num_temps = 0;
 const int MAX_NUM_TEMPS = 10;
 const char *temp_files[MAX_NUM_TEMPS];
 
-/* Shows if we should compile binaries for i386 instead of x86-64.  */
-bool target_ilp32 = false;
+enum offload_abi offload_abi = OFFLOAD_ABI_UNSET;
 
 /* Delete tempfiles and exit function.  */
 void
@@ -200,10 +199,17 @@ out:
 static void
 compile_for_target (struct obstack *argv_obstack)
 {
-  if (target_ilp32)
-    obstack_ptr_grow (argv_obstack, "-m32");
-  else
-    obstack_ptr_grow (argv_obstack, "-m64");
+  switch (offload_abi)
+    {
+    case OFFLOAD_ABI_LP64:
+      obstack_ptr_grow (argv_obstack, "-m64");
+      break;
+    case OFFLOAD_ABI_ILP32:
+      obstack_ptr_grow (argv_obstack, "-m32");
+      break;
+    default:
+      abort ();
+    }
   obstack_ptr_grow (argv_obstack, NULL);
   char **argv = XOBFINISH (argv_obstack, char **);
 
@@ -379,10 +385,17 @@ generate_host_descr_file (const char *host_compiler)
   new_argv[new_argc++] = "-c";
   new_argv[new_argc++] = "-fPIC";
   new_argv[new_argc++] = "-shared";
-  if (target_ilp32)
-    new_argv[new_argc++] = "-m32";
-  else
-    new_argv[new_argc++] = "-m64";
+  switch (offload_abi)
+    {
+    case OFFLOAD_ABI_LP64:
+      new_argv[new_argc++] = "-m64";
+      break;
+    case OFFLOAD_ABI_ILP32:
+      new_argv[new_argc++] = "-m32";
+      break;
+    default:
+      abort ();
+    }
   new_argv[new_argc++] = src_filename;
   new_argv[new_argc++] = "-o";
   new_argv[new_argc++] = obj_filename;
@@ -442,10 +455,17 @@ prepare_target_image (const char *target_compiler, int 
argc, char **argv)
   objcopy_argv[3] = "-I";
   objcopy_argv[4] = "binary";
   objcopy_argv[5] = "-O";
-  if (target_ilp32)
-    objcopy_argv[6] = "elf32-i386";
-  else
-    objcopy_argv[6] = "elf64-x86-64";
+  switch (offload_abi)
+    {
+    case OFFLOAD_ABI_LP64:
+      objcopy_argv[6] = "elf64-x86-64";
+      break;
+    case OFFLOAD_ABI_ILP32:
+      objcopy_argv[6] = "elf32-i386";
+      break;
+    default:
+      abort ();
+    }
   objcopy_argv[7] = target_so_filename;
   objcopy_argv[8] = "--rename-section";
   objcopy_argv[9] = rename_section_opt;
@@ -517,17 +537,22 @@ main (int argc, char **argv)
  passed with @file.  Expand them into argv before processing.  */
   expandargv (argc, argv);
 
-  /* Find out whether we should compile binaries for i386 or x86-64.  */
-  for (int i = argc - 1; i > 0; i--)
-    if (strncmp (argv[i], "-foffload-abi=", sizeof ("-foffload-abi=") - 1) == 0)
-      {
-	if (strstr (argv[i], "ilp32"))
-	  target_ilp32 = true;
-	else if (!strstr (argv[i], "lp64"))
-	  fatal_error (input_location,
-		       "unrecognizable argument of option -foffload-abi");
-	break;
-      }
+  /* Scan the argument vector.  */
+  for (int i = 1; i < argc; i++)
+    {
+#define STR "-foffload-abi="
+      if (strncmp (argv[i], STR, strlen (STR)) == 0)
+	{
+	  if (strcmp (argv[i] + strlen (STR), "lp64") == 0)
+	    offload_abi = OFFLOAD_ABI_LP64;
+	  else if (strcmp (argv[i] + strlen (STR), "ilp32") == 0)
+	    offload_abi = OFFLOAD_ABI_ILP32;
+	  else
+	    fatal_error (input_location,
+			 "unrecognizable argument of option " STR);
+	}
+#undef STR
+    }
 
   const char *target_so_filename
 = prepare_target_image (target_compiler, argc, argv);
@@ -540,10 +565,17 @@ main (int argc, char **argv)
   const char *new_argv[9];
   new_argv[new_argc++] = "ld";
   new_argv[new_argc++] = "-m";
-  if (target_ilp32)
-    new_argv[new_argc++] = "elf_i386";
-  else
-    new_argv[new_argc++] = "elf_x86_64";

Re: [RFC] [Patch]: Try and vectorize with shift for mult expr with power 2 integer constant.

2015-08-04 Thread Richard Biener
On Tue, Aug 4, 2015 at 10:52 AM, Kumar, Venkataramanan
venkataramanan.ku...@amd.com wrote:
 Hi Jeff,

 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Jeff Law
 Sent: Monday, August 03, 2015 11:42 PM
 To: Kumar, Venkataramanan; Jakub Jelinek
 Cc: Richard Biener (richard.guent...@gmail.com); gcc-patches@gcc.gnu.org
 Subject: Re: [RFC] [Patch]: Try and vectorize with shift for mult expr with
 power 2 integer constant.

 On 08/02/2015 05:03 AM, Kumar, Venkataramanan wrote:
  Hi Jakub,
 
  Thank you for reviewing the patch.
 
  I have incorporated your comments in the attached patch.
 Note Jakub is on PTO for the next 3 weeks.

  Thank you for this information.



 
 
 
  vectorize_mults_via_shift.diff.txt
 
 
  diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
  b/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
 Jakub would probably like more testcases :-)

 The most obvious thing to test would be other shift factors.

 A negative test to verify we don't try to turn a multiply by non-constant or
 multiply by a constant that is not a power of 2 into shifts.

 I have added a negative test in the attached patch.



 [ Would it make sense, for example, to turn a multiply by 3 into a shift-add
 sequence?  As Jakub said, choose_mult_variant can be your friend. ]

 Yes, I will do that in a follow-up patch.

 The new change log becomes

 gcc/ChangeLog
 2015-08-04  Venkataramanan Kumar  <venkataramanan.ku...@amd.com>
  * tree-vect-patterns.c (vect_recog_mult_pattern): New function for
  vectorizing multiplication patterns.
  * tree-vectorizer.h: Adjust the number of patterns.

 gcc/testsuite/ChangeLog
 2015-08-04  Venkataramanan Kumar  <venkataramanan.ku...@amd.com>
  * gcc.dg/vect/vect-mult-pattern-1.c: New.
  * gcc.dg/vect/vect-mult-pattern-2.c: New.

 Bootstrapped and reg tested on aarch64-unknown-linux-gnu.

 Ok for trunk ?

+  if (TREE_CODE (oprnd0) != SSA_NAME
+  || TREE_CODE (oprnd1) != INTEGER_CST
+  || TREE_CODE (itype) != INTEGER_TYPE

INTEGRAL_TYPE_P (itype)

+  optab = optab_for_tree_code (LSHIFT_EXPR, vectype, optab_vector);
+  if (!optab
+  || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
+   return NULL;
+

indent of the return stmt looks wrong

+  /* Handle constant operands that are postive or negative powers of 2.  */
+  if ( wi::exact_log2 (oprnd1) != -1  ||
+   wi::exact_log2 (wi::neg (oprnd1)) != -1)

no space after (, || goes to the next line.

+{
+  tree shift;
+
+  if (wi::exact_log2 (oprnd1) != -1)

please cache wi::exact_log2

in fact the first if () looks redundant if you simply put an else return NULL
after an else if (wi::exact_log2 (wi::neg (oprnd1)) != -1)

Note that the issue with INT_MIN is that wi::neg (INT_MIN) is INT_MIN
again, but it seems that wi::exact_log2 returns -1 in that case so you
are fine (and in fact not handling this case).

Thanks,
Richard.
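
For concreteness, the loops the new pattern targets (a sketch; the
transform itself is applied to the vectorizer's pattern statements):

/* Multiply by a positive power of two: becomes a left shift.  */
void f (int *a, int n) { for (int i = 0; i < n; i++) a[i] *= 8; }  /* a[i] << 3 */

/* Multiply by a negative power of two: shift, then negate.  */
void g (int *a, int n) { for (int i = 0; i < n; i++) a[i] *= -8; } /* -(a[i] << 3) */

/* Not transformed: a non-constant multiplier (likewise a constant that
   is not a power of two, or INT_MIN, for which wi::exact_log2 gives -1).  */
void h (int *a, int n, int k) { for (int i = 0; i < n; i++) a[i] *= k; }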




   @@ -2147,6 +2152,140 @@ vect_recog_vector_vector_shift_pattern
  (vec<gimple *> *stmts,
  return pattern_stmt;
}
 
  +/* Detect multiplication by constant which are postive or negatives
  +of power 2,
 s/postive/positive/


 Jeff

 Regards,
 Venkat.



Re: [PATCH 8/15][AArch64] Add support for float16x{4,8}_t vectors/builtins

2015-08-04 Thread Alan Lawrence

James Greenhalgh wrote:

-;; All modes.
+;; All vector modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF 
V2DF])
 
-;; All vector modes and DI.

+;; All vector modes, including HF modes on which we cannot operate


The wording here is a bit off, we can operate on them - for a limited set
of operations (and you are missing a full stop). How
about something like:

  All vector modes suitable for moving, loading and storing.


+(define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+   V4HF V8HF V2SF V4SF V2DF])
+
+;; All vector modes barring F16, plus DI.


barring HF modes for consistency with the above comment.


 (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF 
V2DF DI])
 
+;; All vector modes and DI.

+(define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+ V4HF V8HF V2SF V4SF V2DF DI])
+
 ;; All vector modes and DI and DF.


Except HF modes.


Here's a new version, updating the comments much as you suggest, dropping the 
unrelated testsuite changes (already pushed), and adding VRL2/3/4 iterator 
values only for V4HF.


Bootstrapped + check-gcc on aarch64-none-linux-gnu.

gcc/ChangeLog:

* config/aarch64/aarch64.c (aarch64_vector_mode_supported_p): Support
V4HFmode and V8HFmode.
(aarch64_split_simd_move): Add case for V8HFmode.
* config/aarch64/aarch64-builtins.c (v4hf_UP, v8hf_UP): Define.
(aarch64_simd_builtin_std_type): Handle HFmode.
(aarch64_init_simd_builtin_types): Include Float16x4_t and Float16x8_t.

* config/aarch64/aarch64-simd.md (mov<mode>, aarch64_get_lane<mode>,
aarch64_ld1<VALL:mode>, aarch64_st1<VALL:mode>): Use VALL_F16 iterator.
(aarch64_be_ld1<mode>, aarch64_be_st1<mode>): Use VALLDI_F16 iterator.

* config/aarch64/aarch64-simd-builtin-types.def: Add Float16x4_t,
Float16x8_t.

* config/aarch64/aarch64-simd-builtins.def (ld1, st1): Use VALL_F16.
* config/aarch64/arm_neon.h (float16x4_t, float16x8_t, float16_t):
New typedefs.
(vget_lane_f16, vgetq_lane_f16, vset_lane_f16, vsetq_lane_f16,
vld1_f16, vld1q_f16, vst1_f16, vst1q_f16, vst1_lane_f16,
vst1q_lane_f16): New.
* config/aarch64/iterators.md (VD, VQ, VQ_NO2E): Add vectors of HFmode.
(VALLDI_F16, VALL_F16): New.
(Vmtype, VEL, VCONQ, VHALF, V_TWO_ELEM, V_THREE_ELEM, V_FOUR_ELEM, q):
Add cases for V4HF and V8HF.
(VDBL, VRL2, VRL3, VRL4): Add V4HF case.

gcc/testsuite/ChangeLog:

* g++.dg/abi/mangle-neon-aarch64.C: Add cases for float16x4_t and
float16x8_t.
* gcc.target/aarch64/vset_lane_1.c: Likewise.
* gcc.target/aarch64/vld1-vst1_1.c: Likewise.
* gcc.target/aarch64/vld1_lane.c: Likewise.

commit 49cb53a94a44fcda845c3f6ef11e88f9be458aad
Author: Alan Lawrence alan.lawre...@arm.com
Date:   Tue Dec 2 13:08:15 2014 +

AArch64 2/N: Vector/__builtin basics: define+support types, movs, test ABI.

Patterns, builtins, intrinsics for {ld1,st1}{,_lane},v{g,s}et_lane. Tests: vld1-vst1_1, vset_lane_1, vld1_lane.c
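
A hypothetical usage sketch of the basic float16x4_t operations added here
(it assumes a toolchain with this patch applied, which introduces these
intrinsics):

#include <arm_neon.h>

/* Load four half-precision floats, overwrite lane 0, store them back.  */
float16x4_t
replace_lane0 (float16_t *p, float16_t x)
{
  float16x4_t v = vld1_f16 (p);	/* new vld1_f16 */
  v = vset_lane_f16 (x, v, 0);	/* new vset_lane_f16 */
  vst1_f16 (p, v);		/* new vst1_f16 */
  return v;
}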

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index cfb2dc1..a6c3377 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -66,6 +66,7 @@
 
 #define v8qi_UP  V8QImode
 #define v4hi_UP  V4HImode
+#define v4hf_UP  V4HFmode
 #define v2si_UP  V2SImode
 #define v2sf_UP  V2SFmode
 #define v1df_UP  V1DFmode
@@ -73,6 +74,7 @@
 #define df_UP    DFmode
 #define v16qi_UP V16QImode
 #define v8hi_UP  V8HImode
+#define v8hf_UP  V8HFmode
 #define v4si_UP  V4SImode
 #define v4sf_UP  V4SFmode
 #define v2di_UP  V2DImode
@@ -523,6 +525,8 @@ aarch64_simd_builtin_std_type (enum machine_mode mode,
   return aarch64_simd_intCI_type_node;
 case XImode:
   return aarch64_simd_intXI_type_node;
+case HFmode:
+  return aarch64_fp16_type_node;
 case SFmode:
   return float_type_node;
 case DFmode:
@@ -607,6 +611,8 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Poly64x2_t].eltype = aarch64_simd_types[Poly64_t].itype;
 
   /* Continue with standard types.  */
+  aarch64_simd_types[Float16x4_t].eltype = aarch64_fp16_type_node;
+  aarch64_simd_types[Float16x8_t].eltype = aarch64_fp16_type_node;
   aarch64_simd_types[Float32x2_t].eltype = float_type_node;
   aarch64_simd_types[Float32x4_t].eltype = float_type_node;
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index bb54e56..ea219b7 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -44,6 +44,8 @@
   ENTRY (Poly16x8_t, V8HI, poly, 12)
   ENTRY (Poly64x1_t, DI, poly, 12)
   ENTRY 

[PATCH][ARM/AArch64 Testsuite] Add float16 lane_indices tests (was: Re: [PATCH 9/15][AArch64] vld{2,3,4}{,_lane,_dup}, vcombine, vcreate)

2015-08-04 Thread Alan Lawrence

James Greenhalgh wrote:

Hi Alan,

The arm_neon.h portion of this patch does not apply after Charles' recent
changes. Could you please rebase and resubmit the patch for review?

Thanks,
James


These are straightforward copies of the corresponding uint16 tests, with
appropriate substitutions uint->float and u16->f16. As per the existing tests,
these are xfailed on ARM targets, pending further work on PR/63870.


Cross-tested on aarch64-none-elf.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/advsimd-intrinsics/vld2_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vld3_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vld4_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vst2_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vst3_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vst4_lane_indices_1.c: New.
* gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_indices_1.c: New.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c
new file mode 100644
index ..2174d6eaa8ff1a1d28261b5f1ef3d137d206070d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_f16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-excess-errors "" { xfail arm*-*-* } } */
+
+float16x4x2_t
+f_vld2_lane_f16 (float16_t * p, float16x4x2_t v)
+{
+  float16x4x2_t res;
+  /* { dg-error "lane 4 out of range 0 - 3" "" { xfail arm*-*-* } 0 } */
+  res = vld2_lane_f16 (p, v, 4);
+  /* { dg-error "lane -1 out of range 0 - 3" "" { xfail arm*-*-* } 0 } */
+  res = vld2_lane_f16 (p, v, -1);
+  return res;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_f16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_f16_indices_1.c
new file mode 100644
index ..83ae82c82423b9fbcb98c04d0b26ca69db7a5faa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_f16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-excess-errors "" { xfail arm*-*-* } } */
+
+float16x8x2_t
+f_vld2q_lane_f16 (float16_t * p, float16x8x2_t v)
+{
+  float16x8x2_t res;
+  /* { dg-error "lane 8 out of range 0 - 7" "" { xfail arm*-*-* } 0 } */
+  res = vld2q_lane_f16 (p, v, 8);
+  /* { dg-error "lane -1 out of range 0 - 7" "" { xfail arm*-*-* } 0 } */
+  res = vld2q_lane_f16 (p, v, -1);
+  return res;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_f16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_f16_indices_1.c
new file mode 100644
index ..21b7861ba7549ffb692effad2c4e5194c67f3a3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_f16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-excess-errors "" { xfail arm*-*-* } } */
+
+float16x4x3_t
+f_vld3_lane_f16 (float16_t * p, float16x4x3_t v)
+{
+  float16x4x3_t res;
+  /* { dg-error "lane 4 out of range 0 - 3" "" { xfail arm*-*-* } 0 } */
+  res = vld3_lane_f16 (p, v, 4);
+  /* { dg-error "lane -1 out of range 0 - 3" "" { xfail arm*-*-* } 0 } */
+  res = vld3_lane_f16 (p, v, -1);
+  return res;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_f16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_f16_indices_1.c
new file mode 100644
index ..95ec3913eef77afdf8ce1a7d7a95ddfa3bdf9fc3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_f16_indices_1.c
@@ -0,0 +1,16 @@
+#include <arm_neon.h>
+
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-excess-errors "" { xfail arm*-*-* } } */
+
+float16x8x3_t
+f_vld3q_lane_f16 (float16_t * p, float16x8x3_t v)
+{
+  float16x8x3_t res;
+  /* { dg-error "lane 8 out of range 0 - 7" "" { xfail arm*-*-* } 0 } */
+  res = vld3q_lane_f16 (p, v, 8);
+  /* { dg-error "lane -1 out of range 0 - 7" "" { xfail arm*-*-* } 0 } */
+  res = vld3q_lane_f16 (p, v, -1);
+  return res;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_f16_indices_1.c 

Re: [PATCH 8/15][AArch64] Add support for float16x{4,8}_t vectors/builtins

2015-08-04 Thread Alan Lawrence

Sorry, I attached the wrong file. Here!

--Alan

Alan Lawrence wrote:

James Greenhalgh wrote:

-;; All modes.
+;; All vector modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF 
V2DF])
 
-;; All vector modes and DI.

+;; All vector modes, including HF modes on which we cannot operate

The wording here is a bit off, we can operate on them - for a limited set
of operations (and you are missing a full stop). How
about something like:

  All vector modes suitable for moving, loading and storing.


+(define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+   V4HF V8HF V2SF V4SF V2DF])
+
+;; All vector modes barring F16, plus DI.

barring HF modes for consistency with the above comment.


 (define_mode_iterator VALLDI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF 
V2DF DI])
 
+;; All vector modes and DI.

+(define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+ V4HF V8HF V2SF V4SF V2DF DI])
+
 ;; All vector modes and DI and DF.

Except HF modes.


Here's a new version, updating the comments much as you suggest, dropping the 
unrelated testsuite changes (already pushed), and adding VRL2/3/4 iterator 
values only for V4HF.


Bootstrapped + check-gcc on aarch64-none-linux-gnu.

gcc/ChangeLog:

* config/aarch64/aarch64.c (aarch64_vector_mode_supported_p): Support
V4HFmode and V8HFmode.
(aarch64_split_simd_move): Add case for V8HFmode.
* config/aarch64/aarch64-builtins.c (v4hf_UP, v8hf_UP): Define.
(aarch64_simd_builtin_std_type): Handle HFmode.
(aarch64_init_simd_builtin_types): Include Float16x4_t and Float16x8_t.

* config/aarch64/aarch64-simd.md (mov<mode>, aarch64_get_lane<mode>,
aarch64_ld1<VALL:mode>, aarch64_st1<VALL:mode>): Use VALL_F16 iterator.
(aarch64_be_ld1<mode>, aarch64_be_st1<mode>): Use VALLDI_F16 iterator.

* config/aarch64/aarch64-simd-builtin-types.def: Add Float16x4_t,
Float16x8_t.

* config/aarch64/aarch64-simd-builtins.def (ld1, st1): Use VALL_F16.
* config/aarch64/arm_neon.h (float16x4_t, float16x8_t, float16_t):
New typedefs.
(vget_lane_f16, vgetq_lane_f16, vset_lane_f16, vsetq_lane_f16,
vld1_f16, vld1q_f16, vst1_f16, vst1q_f16, vst1_lane_f16,
vst1q_lane_f16): New.
* config/aarch64/iterators.md (VD, VQ, VQ_NO2E): Add vectors of HFmode.
(VALLDI_F16, VALL_F16): New.
(Vmtype, VEL, VCONQ, VHALF, V_TWO_ELEM, V_THREE_ELEM, V_FOUR_ELEM, q):
Add cases for V4HF and V8HF.
(VDBL, VRL2, VRL3, VRL4): Add V4HF case.

gcc/testsuite/ChangeLog:

* g++.dg/abi/mangle-neon-aarch64.C: Add cases for float16x4_t and
float16x8_t.
* gcc.target/aarch64/vset_lane_1.c: Likewise.
* gcc.target/aarch64/vld1-vst1_1.c: Likewise.
* gcc.target/aarch64/vld1_lane.c: Likewise.



diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 800f6e1ffcd358aa22ceecbc460bc1dcac4acd9e..2394efdb483e1128d2990852871ab4abfed8bdfc 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -61,6 +61,7 @@
 
 #define v8qi_UP  V8QImode
 #define v4hi_UP  V4HImode
+#define v4hf_UP  V4HFmode
 #define v2si_UP  V2SImode
 #define v2sf_UP  V2SFmode
 #define v1df_UP  V1DFmode
@@ -68,6 +69,7 @@
 #define df_UP    DFmode
 #define v16qi_UP V16QImode
 #define v8hi_UP  V8HImode
+#define v8hf_UP  V8HFmode
 #define v4si_UP  V4SImode
 #define v4sf_UP  V4SFmode
 #define v2di_UP  V2DImode
@@ -520,6 +522,8 @@ aarch64_simd_builtin_std_type (enum machine_mode mode,
   return aarch64_simd_intCI_type_node;
 case XImode:
   return aarch64_simd_intXI_type_node;
+case HFmode:
+  return aarch64_fp16_type_node;
 case SFmode:
   return float_type_node;
 case DFmode:
@@ -604,6 +608,8 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Poly64x2_t].eltype = aarch64_simd_types[Poly64_t].itype;
 
   /* Continue with standard types.  */
+  aarch64_simd_types[Float16x4_t].eltype = aarch64_fp16_type_node;
+  aarch64_simd_types[Float16x8_t].eltype = aarch64_fp16_type_node;
   aarch64_simd_types[Float32x2_t].eltype = float_type_node;
   aarch64_simd_types[Float32x4_t].eltype = float_type_node;
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index bb54e56ce63c040dbfe69e2249e642d2c43fd0af..ea219b72ff9ac406c2439cda002617e710b2966c 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -44,6 +44,8 @@
   ENTRY (Poly16x8_t, V8HI, poly, 12)
   ENTRY (Poly64x1_t, DI, poly, 12)
   ENTRY (Poly64x2_t, V2DI, poly, 12)
+  ENTRY (Float16x4_t, V4HF, none, 13)
+  ENTRY (Float16x8_t, V8HF, none, 13)
   ENTRY (Float32x2_t, 

Re: [PATCH 5/15][ARM] Remaining intrinsics

2015-08-04 Thread Kyrill Tkachov


On 28/07/15 12:24, Alan Lawrence wrote:

This is a respin of https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00479.html,
again to make the intrinsics available only if we have a scalar __fp16 type.

This does not fix existing indentation issues in neon.md but rather keeps the
affected lines consistent with those around them.

gcc/ChangeLog (as before):

* config/arm/arm-builtins.c (VAR11, VAR12): New.
* config/arm/arm_neon_builtins.def (vcombine, vld2_dup, vld3_dup,
vld4_dup): Add v4hf variant.
(vget_high, vget_low): Add v8hf variant.
(vld1, vst1, vst1_lane, vld2, vld2_lane, vst2, vst2_lane, vld3,
vld3_lane, vst3, vst3_lane, vld4, vld4_lane, vst4, vst4_lane): Add
v4hf and v8hf variants.

* config/arm/iterators.md (VD_LANE, VD_RE, VQ2, VQ_HS): New.
(VDX): Add V4HF.
(V_DOUBLE): Add case for V4HF.
(VQX): Add V8HF.
(V_HALF): Add case for V8HF.
(VDQX): Add V4HF, V8HF.
(V_elem, V_two_elem, V_three_elem, V_four_elem, V_cmp_result,
V_uf_sclr, V_sz_elem, V_mode_nunits, q): Add cases for V4HF & V8HF.

* config/arm/neon.md (vec_set<mode>_internal, vec_extract<mode>,
neon_vget_lane<mode>_sext_internal, neon_vget_lane<mode>_zext_internal,
vec_load_lanesoi<mode>, neon_vld2<mode>, vec_store_lanesoi<mode>,
neon_vst2<mode>, vec_load_lanesci<mode>, neon_vld3<mode>,
neon_vld3qa<mode>, neon_vld3qb<mode>, vec_store_lanesci<mode>,
neon_vst3<mode>, neon_vst3qa<mode>, neon_vst3qb<mode>,
vec_load_lanesxi<mode>, neon_vld4<mode>, neon_vld4qa<mode>,
neon_vld4qb<mode>, vec_store_lanesxi<mode>, neon_vst4<mode>,
neon_vst4qa<mode>, neon_vst4qb<mode>): Change VQ iterator to VQ2.

(neon_vcreate, neon_vreinterpretv8qi<mode>,
neon_vreinterpretv4hi<mode>, neon_vreinterpretv2si<mode>,
neon_vreinterpretv2sf<mode>, neon_vreinterpretdi<mode>):
Change VDX to VD_RE.

(neon_vld2_lane<mode>, neon_vst2_lane<mode>, neon_vld3_lane<mode>,
neon_vst3_lane<mode>, neon_vld4_lane<mode>, neon_vst4_lane<mode>):
Change VD iterator to VD_LANE, and VMQ iterator to VQ_HS.

* config/arm/arm_neon.h (float16x4x2_t, float16x8x2_t, float16x4x3_t,
float16x8x3_t, float16x4x4_t, float16x8x4_t, vcombine_f16,
vget_high_f16, vget_low_f16, vld1_f16, vld1q_f16, vst1_f16, vst1q_f16,
vst1_lane_f16, vst1q_lane_f16, vld2_f16, vld2q_f16, vld2_lane_f16,
vld2q_lane_f16, vld2_dup_f16, vst2_f16, vst2q_f16, vst2_lane_f16,
vst2q_lane_f16, vld3_f16, vld3q_f16, vld3_lane_f16, vld3q_lane_f16,
vld3_dup_f16, vst3_f16, vst3q_f16, vst3_lane_f16, vst3q_lane_f16,
vld4_f16, vld4q_f16, vld4_lane_f16, vld4q_lane_f16, vld4_dup_f16,
vst4_f16, vst4q_f16, vst4_lane_f16, vst4q_lane_f16, ): New.


Trailing comma in that list.
Ok with the ChangeLog fixed.
Thanks,
Kyrill




[PATCH, i386] Merge SSE and AVX ptest patterns.

2015-08-04 Thread Kirill Yukhin
Hello,
I've merged ptest insn patterns from AVX and SSE.
I've also extended the mode iterator to allow any 128/256-bit mode
for the insn, as it is register-wide, which may help implementing
https://gcc.gnu.org/ml/gcc-patches/2015-05/msg02788.html
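
ptest is a whole-register test (ZF is set iff the AND of its operands is
zero), so the element type genuinely does not matter; that is why a single
sse4_1_ptest<mode> pattern over a wide iterator suffices.  A user-level
sketch with the long-standing SSE4.1 intrinsic:

#include <smmintrin.h>

/* _mm_testz_si128 expands to ptest; testing a register against itself
   asks "is the whole 128-bit value zero?", however its bits are split
   into elements.  */
int
is_all_zero (__m128i x)
{
  return _mm_testz_si128 (x, x);	/* 1 if x == 0, else 0 */
}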


Bootstrapped and regtested.

If no objections, I'll commit it into main trunk tomorrow morning (Moscow time).

gcc/
* config/i386/i386.c (bdesc_args): Rename CODE_FOR_sse4_1_ptest into
CODE_FOR_sse4_1_ptestv2di and CODE_FOR_avx_vtestps256 into
CODE_FOR_avx_ptestv4di.
* config/i386/sse.md (define_mode_iterator V_AVX): New.
(define_mode_attr sse4_1): Extend to other 128/256-bit modes.
(define_insn "avx_ptest256"): Merge this ...
(define_insn "sse4_1_ptest"): And this ...
(define_insn "sse4_1_ptest<mode>"): Into this. Use V_AVX iterator.

--
Thanks, K


commit 64741d31c19d464a1ca4270b775a7b54c1253019
Author: Kirill Yukhin kirill.yuk...@intel.com
Date:   Tue Aug 4 10:36:10 2015 +0300

Merge SSE 4.1 and AVX ptest patterns. Extend iterator for new one.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 128c5af..f93a5ce 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31734,9 +31734,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
 
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
 
   /* SSE4.2 */
   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, __builtin_ia32_pcmpgtq, 
IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
@@ -31892,9 +31892,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
 
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0970f0e..f9994e4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -299,6 +299,12 @@
V8DI (V4DI TARGET_AVX512VL) (V2DI TARGET_AVX512VL)])
 
 ;; All DImode vector integer modes
+(define_mode_iterator V_AVX
+  [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V32QI "TARGET_AVX") (V16HI "TARGET_AVX")
+   (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
+   (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")])
+
 (define_mode_iterator VI8
   [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI])
 
@@ -566,7 +572,11 @@
 (define_mode_attr sse4_1
   [(V4SF "sse4_1") (V2DF "sse4_1")
    (V8SF "avx") (V4DF "avx")
-   (V8DF "avx512f")])
+   (V8DF "avx512f")
+   (V4DI "avx") (V2DI "sse4_1")
+   (V8SI "avx") (V4SI "sse4_1")
+   (V16QI 

Re: [PATCH][AArch64] Change aarch64 vector cost to match vectorizer

2015-08-04 Thread Pawel Kupidura
On 04/08/15 11:48, James Greenhalgh wrote:
 On Tue, Aug 04, 2015 at 11:06:11AM +0100, Pawel Kupidura wrote:
 Hi,

  I'm sorry about the issues with formatting; it should be fixed now.
  Here's a corrected version with a diff against current trunk.
 
 Hi Pawel,
 
  I'm still having trouble getting this patch to apply; I'm not sure whether
  it is the format=flowed in your mail headers, the quoted-printable
  encoding, or something else. Certainly when I open your emails I see:
 
  if (where == vect_body && stmt_info && stmt_in_inner_loop_p
  (stmt_info))
 
 The content of the patch is OK to commit, but it would be good to
 have a copy on list that can be easily applied.
 
 Thanks,
 James
 
 diff --git a/gcc/ChangeLog b/gcc/ChangeLog
 index fdc4a7e..d1c6663 100644
 --- a/gcc/ChangeLog
 +++ b/gcc/ChangeLog
 @@ -1,3 +1,7 @@
 +2015-08-04  Pawel Kupidura  <pawel.kupid...@arm.com>
 +* config/aarch64/aarch64.c: Change inner loop statement cost
 +to be consistent with other targets.
 +
   2015-08-03  Abe Skolnik  <a.skol...@samsung.com>

  * tree-if-conv.c: Fix various typos in comments.
 diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
 index 2b1ae36..173a385 100644
 --- a/gcc/config/aarch64/aarch64.c
 +++ b/gcc/config/aarch64/aarch64.c
 @@ -7086,15 +7086,9 @@ aarch64_add_stmt_cost (void *data, int count, 
 enum vect_cost_for_stmt kind,

 /* Statements in an inner loop relative to the loop being
   vectorized are weighted more heavily.  The value here is
 - a function (linear for now) of the loop nest level.  */
 + arbitrary and could potentially be improved with analysis.  */
 if (where == vect_body && stmt_info && stmt_in_inner_loop_p
 (stmt_info))
 -{
 -  loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
 -  struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
 -  unsigned nest_level = loop_depth (loop);
 -
 -  count *= nest_level;
 -}
 +count *= 50; /*  FIXME  */

 retval = (unsigned) (count * stmt_cost);
 cost[where] += retval;

Hi,

The issue was the flowed format forced by my mail client. I've tested it,
and the patch should apply now.

Thanks,
Pawel 

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 28a55d5..c8b94d6 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,7 @@
+2015-08-04  Pawel Kupidura  <pawel.kupid...@arm.com>
+   * config/aarch64/aarch64.c: Change inner loop statement cost
+   to be consistent with other targets.
+
 2015-08-04  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>
 
* config/aarch64/aarch64.c (aarch64_tribools_ok_for_inlining_p):
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6b418a7..5727bc7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7091,15 +7091,9 @@ aarch64_add_stmt_cost (void *data, int count, enum 
vect_cost_for_stmt kind,
 
   /* Statements in an inner loop relative to the loop being
 vectorized are weighted more heavily.  The value here is
-a function (linear for now) of the loop nest level.  */
+arbitrary and could potentially be improved with analysis.  */
   if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
-   {
- loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
- struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
- unsigned nest_level = loop_depth (loop);
-
- count *= nest_level;
-   }
+   count *= 50; /*  FIXME  */
 
   retval = (unsigned) (count * stmt_cost);
   cost[where] += retval;



Re: [PATCH, i386] Disable AVX-512VL insns for scalar mode operands on -march=knl.

2015-08-04 Thread Kirill Yukhin
On 04 Aug 15:31, Kirill Yukhin wrote:
 On 04 Aug 14:10, Uros Bizjak wrote:
  On Tue, Aug 4, 2015 at 1:47 PM, Kirill Yukhin kirill.yuk...@gmail.com 
  wrote:
   Hello,
    -   (set_attr "prefix_data16" "*,*,*,1,*,*,*,*")
    -   (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex,orig,orig")
    -   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,DF,V4SF,V2SF")])
    +   (set_attr "prefix_data16" "*,*,*,*,*,1,*,*,*,*")
  
  Please change the above to:
  
  (set (attr "prefix_data16")
       (if_then_else (eq_attr "alternative" "5")
		     (const_string "1")
		     (const_string "*")))
 Thanks, fixed!
  
  Uros.

Wrong patch. Here is the proper one.

commit 1055739cb51648794a01afd85f59efadd14378ed
Author: Kirill Yukhin kirill.yuk...@intel.com
Date:   Mon Aug 3 15:21:06 2015 +0300

Fix vec_concatv2df and vec_dupv2df to block wrongly enabled AVX-512VL insns.

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5c5c1fc..9ffe9aa 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -784,7 +784,8 @@
 (define_attr "isa" "base,x64,x64_sse4,x64_sse4_noavx,x64_avx,nox64,
		    sse2,sse2_noavx,sse3,sse4,sse4_noavx,avx,noavx,
		    avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
-		    fma_avx512f,avx512bw,noavx512bw,avx512dq,noavx512dq"
+		    fma_avx512f,avx512bw,noavx512bw,avx512dq,noavx512dq,
+		    avx512vl,noavx512vl"
   (const_string "base"))
 
 (define_attr "enabled" ""
@@ -819,6 +820,8 @@
 	 (eq_attr "isa" "noavx512bw") (symbol_ref "!TARGET_AVX512BW")
 	 (eq_attr "isa" "avx512dq") (symbol_ref "TARGET_AVX512DQ")
 	 (eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ")
+	 (eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL")
+	 (eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL")
 	]
 	(const_int 1)))
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0970f0e..ca1ec2e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -8638,44 +8638,50 @@
(set_attr mode DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF)])
 
 (define_insn "vec_dupv2df<mask_name>"
-  [(set (match_operand:V2DF 0 "register_operand"     "=x,v")
+  [(set (match_operand:V2DF 0 "register_operand"     "=x,x,v")
 	(vec_duplicate:V2DF
-	  (match_operand:DF 1 "nonimmediate_operand" " 0,vm")))]
+	  (match_operand:DF 1 "nonimmediate_operand" " 0,xm,vm")))]
   "TARGET_SSE2 && <mask_avx512vl_condition>"
   "@
    unpcklpd\t%0, %0
-   %vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
-  [(set_attr "isa" "noavx,sse3")
+   %vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}
+   vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "isa" "noavx,sse3,avx512vl")
    (set_attr "type" "sselog1")
-   (set_attr "prefix" "orig,maybe_vex")
-   (set_attr "mode" "V2DF,DF")])
+   (set_attr "prefix" "orig,maybe_vex,evex")
+   (set_attr "mode" "V2DF,DF,DF")])
 
 (define_insn "*vec_concatv2df"
-  [(set (match_operand:V2DF 0 "register_operand"     "=x,v,v,x,x,v,x,x")
+  [(set (match_operand:V2DF 0 "register_operand"     "=x,x,v,x,v,x,x,v,x,x")
 	(vec_concat:V2DF
-	  (match_operand:DF 1 "nonimmediate_operand" " 0,v,m,0,x,m,0,0")
-	  (match_operand:DF 2 "vector_move_operand"  " x,v,1,m,m,C,x,m")))]
+	  (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,m,0,0")
+	  (match_operand:DF 2 "vector_move_operand"  " x,x,v,1,1,m,m,C,x,m")))]
   "TARGET_SSE
    && (!(MEM_P (operands[1]) && MEM_P (operands[2]))
        || (TARGET_SSE3 && rtx_equal_p (operands[1], operands[2])))"
   "@
    unpcklpd\t{%2, %0|%0, %2}
    vunpcklpd\t{%2, %1, %0|%0, %1, %2}
+   vunpcklpd\t{%2, %1, %0|%0, %1, %2}
    %vmovddup\t{%1, %0|%0, %1}
+   vmovddup\t{%1, %0|%0, %1}
    movhpd\t{%2, %0|%0, %2}
    vmovhpd\t{%2, %1, %0|%0, %1, %2}
    %vmovsd\t{%1, %0|%0, %1}
    movlhps\t{%2, %0|%0, %2}
    movhps\t{%2, %0|%0, %2}"
-  [(set_attr "isa" "sse2_noavx,avx,sse3,sse2_noavx,avx,sse2,noavx,noavx")
+  [(set_attr "isa" "sse2_noavx,avx,avx512vl,sse3,avx512vl,sse2_noavx,avx,sse2,noavx,noavx")
    (set (attr "type")
      (if_then_else
        (eq_attr "alternative" "0,1,2")
        (const_string "sselog")
        (const_string "ssemov")))
-   (set_attr "prefix_data16" "*,*,*,1,*,*,*,*")
-   (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex,orig,orig")
-   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,DF,V4SF,V2SF")])
+   (set (attr "prefix_data16")
+	(if_then_else (eq_attr "alternative" "5")
+		      (const_string "1")
+		      (const_string "*")))
+   (set_attr "prefix" "orig,vex,evex,maybe_vex,evex,orig,vex,maybe_vex,orig,orig")
+   (set_attr "mode" "V2DF,V2DF,V2DF,DF,DF,V1DF,V1DF,DF,V4SF,V2SF")])
 
 ;
 ;;


Re: [PATCH, i386] Merge SSE and AVX ptest patterns.

2015-08-04 Thread Uros Bizjak
On Tue, Aug 4, 2015 at 1:58 PM, Kirill Yukhin kirill.yuk...@gmail.com wrote:
 Hello,
 I've merged ptest insn patterns from AVX and SSE.
 I've also extended the mode iterator to allow any 128/256-bit mode
 for the insn, as it is register-wide, which may help implementing
 https://gcc.gnu.org/ml/gcc-patches/2015-05/msg02788.html


 Bootstrapped and regtested.

 If no objections, I'll commit it into main trunk tomorrow morning (Moscow 
 time).

 gcc/
 * config/i386/i386.c (bdesc_args): Rename CODE_FOR_sse4_1_ptest into
 CODE_FOR_sse4_1_ptestv2di and CODE_FOR_avx_vtestps256 into
 CODE_FOR_avx_ptestv4di.
 * config/i386/sse.md (define_mode_iterator V_AVX): New.
 (define_mode_attr sse4_1): Extend to other 128/256-bit modes.
 (define_insn avx_ptest256): Merge this ...
 (define_insn sse4_1_ptest): And this ...
 (define_insn sse4_1_ptestmode): Into this. Use V_AVX iterator.

 +   (set (attr "btver2_decode")
 +     (if_then_else
 +       (and (eq_attr "alternative" "2")
 +	    (match_test "<sseinsnmode>mode == OImode"))
 +	 (const_string "vector")
 +	 (const_string "*")))

"vector" does not depend on the alternative, but only on
<sseinsnmode>mode.  So the "and" above should be removed.

Uros.


Re: [PATCH, i386] Disable AVX-512VL insns for scalar mode operands on -march=knl.

2015-08-04 Thread Uros Bizjak
On Tue, Aug 4, 2015 at 1:47 PM, Kirill Yukhin kirill.yuk...@gmail.com wrote:
 Hello,

 For vec_dup and vec_concat patterns (of v2df mode) the second operand
 is of scalar mode, so `ix86_hard_regno_mode_ok' didn't block EVEX registers
 for non-512b modes (when AVX-512VL is turned off).
 This turns into 128/256b xmm[15] regs emit on -march=knl.

 There should be more patterns w/ a similar issue. Will look for them later.
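
An illustrative kernel of the affected shape (a sketch): both functions
expand through vec_dupv2df/vec_concatv2df with a scalar DF operand, and
before this fix the allocator could pick an EVEX-only register for the
128-bit instruction on -march=knl, which KNL cannot encode.

typedef double v2df __attribute__ ((vector_size (16)));

v2df dup (double x)              { return (v2df) { x, x }; }	/* vec_dupv2df    */
v2df concat (double x, double y) { return (v2df) { x, y }; }	/* vec_concatv2df */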

 Bootstrapped and regtested.

 If no objections, I'll commit it tomorrow morning (Moscow time).

 gcc/
 * config/i386/i386.md (define_attr "isa"): Add avx512vl and
 noavx512vl.
 (define_attr "enabled"): Handle avx512vl and noavx512vl.
 * config/i386/sse.md (define_insn "vec_dupv2df<mask_name>"): Split
 AVX-512 alternative out of SSE.
 (define_insn "*vec_concatv2df"): Ditto.

 -   (set_attr "prefix_data16" "*,*,*,1,*,*,*,*")
 -   (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex,orig,orig")
 -   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,DF,V4SF,V2SF")])
 +   (set_attr "prefix_data16" "*,*,*,*,*,1,*,*,*,*")

Please change the above to:

   (set (attr "prefix_data16")
	(if_then_else (eq_attr "alternative" "5")
		      (const_string "1")
		      (const_string "*")))

Uros.


Re: [ARM] Fix vget_lane for big-endian targets

2015-08-04 Thread Christophe Lyon
On 21 July 2015 at 16:01, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:

 On 16/07/15 08:56, Christophe Lyon wrote:

 AdvSIMD vget_lane tests currently fail on armeb targets when dealing
 with vectors of 2 64-bit elements. This patch fixes it, by adding a
 code fragment similar to what is done in other cases. I could have
 simplified it a bit given that the vector width is known, but I chose
 to hardcode 'reg_nelts = 2' to keep the code closer to what is done
 elsewhere.
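
The remapping in plain C, for illustration (the XOR form is the general
one used by the other lanewise patterns; with the width known it reduces
to swapping lanes 0 and 1):

/* Convert an intrinsic (vldm-order) lane number to the array-order lane
   number GCC's RTL uses on big-endian, for a power-of-two reg_nelts.  */
unsigned int
remap_lane (unsigned int elt, unsigned int reg_nelts)
{
  return elt ^ (reg_nelts - 1);	/* 2 elts: 0<->1; 4 elts: 0<->3, 1<->2 */
}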

 OK for trunk?

 Christophe

 2015-07-16  Christophe Lyon  <christophe.l...@linaro.org>

 * config/arm/neon.md (neon_vget_lanev2di): Handle big-endian
 targets.


 I see we do this for other lanewise patterns as well.
 Has this been tested on an arm big-endian target?

 If so, ok for trunk.

I forgot to mention that yes, I actually tested it on arm big-endian,
using QEMU.

Christophe.


 Thanks,
 Kyrill



 diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
 index 654d9d5..59ddc5b 100644
 --- a/gcc/config/arm/neon.md
 +++ b/gcc/config/arm/neon.md
 @@ -2736,6 +2736,19 @@
 	 (match_operand:SI 2 "immediate_operand" "")]
 	 "TARGET_NEON"
   {
 +  if (BYTES_BIG_ENDIAN)
 +    {
 +      /* The intrinsics are defined in terms of a model where the
 +	 element ordering in memory is vldm order, whereas the generic
 +	 RTL is defined in terms of a model where the element ordering
 +	 in memory is array order.  Convert the lane number to conform
 +	 to this model.  */
 +      unsigned int elt = INTVAL (operands[2]);
 +      unsigned int reg_nelts = 2;
 +      elt ^= reg_nelts - 1;
 +      operands[2] = GEN_INT (elt);
 +    }
 +
 switch (INTVAL (operands[2]))
   {
   case 0:




[PATCH] Add __builtin_stack_top

2015-08-04 Thread H.J. Lu
On Wed, Jul 22, 2015 at 8:44 AM, H.J. Lu hjl.to...@gmail.com wrote:
 On Wed, Jul 22, 2015 at 6:59 AM, H.J. Lu hjl.to...@gmail.com wrote:
 On Wed, Jul 22, 2015 at 6:55 AM, Segher Boessenkool
 seg...@kernel.crashing.org wrote:
 On Wed, Jul 22, 2015 at 05:10:04AM -0700, H.J. Lu wrote:
 I got a feedback, suggesting __builtin_stack_top, instead of
 __builtin_ia32_stack_top.  But I don't know if

 +  /* After the prologue, stack top is at -WORD(AP) in the current
 +     frame.  */
 +  emit_insn (gen_rtx_SET (target,
 +			  plus_constant (Pmode, arg_pointer_rtx,
 +					 -UNITS_PER_WORD)));

 is true for all backends.  If it works on all backends, I can move
 it to builtins.c.

 It doesn't afaik.  But can't you define INITIAL_FRAME_ADDRESS_RTX?


 Segher

 Does INITIAL_FRAME_ADDRESS_RTX point to the stack top? It certainly
 can't be defined for x86.  I will write a middle-end patch and leave it to
 each backend to enable it.

 Here is a patch.  Any comments, feedbacks?

Where does this feature belong, the middle-end or the x86 backend?
Here is the updated patch to implement it in the middle-end.  Any
comments?

Thanks.

 Thanks.

 --
 H.J.
 ---
 When __builtin_frame_address is used to retrieve the address of the
 function stack frame, the frame pointer is always kept, which wastes one
 register and 2 instructions.  For x86-32, one less register means a
 significant negative impact on performance.  This patch adds a new
 builtin function, __builtin_stack_top.  It returns the stack address
 when the function is called.

 This patch only enables __builtin_stack_top for x86 backend.  Using
 __builtin_stack_top with other backends will lead to

 sorry, unimplemented: ‘__builtin_stack_top’ not supported on this target

 TARGET_STACK_TOP_RTX must be defined to enable __builtin_stack_top.
 default_stack_top_rtx may be extended to support more backends,
 including those with INITIAL_FRAME_ADDRESS_RTX.

 gcc/

 PR target/66960
 * builtin-types.def (BT_FN_PTR_VOID): New function type.
 * builtins.c (expand_builtin): Handle BUILT_IN_STACK_TOP.
 (is_simple_builtin): Likewise.
 * ipa-pure-const.c (special_builtin_state): Likewise.
 * builtins.def: Add BUILT_IN_STACK_TOP.
 * function.h (function): Add stack_top_taken.
 * target.def (stack_top_rtx): New target hook.
 * targhooks.c (default_stack_top_rtx): New.
 * targhooks.h (default_stack_top_rtx): Likewise.
 * config/i386/i386.c (ix86_expand_prologue): Sorry if DRAP is
 used and the stack address has been taken.
 (TARGET_STACK_TOP_RTX): New.
 * doc/extend.texi: Document __builtin_stack_top.
 * doc/tm.texi.in (TARGET_STACK_TOP_RTX): New.
 * doc/tm.texi: Regenerated.

 gcc/testsuite/

 PR target/66960
 * gcc.target/i386/pr66960-1.c: New test.
 * gcc.target/i386/pr66960-2.c: Likewise.
 * gcc.target/i386/pr66960-3.c: Likewise.
 * gcc.target/i386/pr66960-4.c: Likewise.
 * gcc.target/i386/pr66960-5.c: Likewise.



-- 
H.J.
From 267982f7c76cc6eece0dd7896555d27291f587ef Mon Sep 17 00:00:00 2001
From: H.J. Lu hjl.to...@gmail.com
Date: Tue, 21 Jul 2015 14:32:09 -0700
Subject: [PATCH] Add __builtin_stack_top
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When __builtin_frame_address is used to retrieve the address of the
function stack frame, the frame pointer register is required, which
wastes one register and 2 instructions.  For x86-32, one less register
means significant negative impact on performance.  This patch adds a
new builtin function, __builtin_stack_top.  It returns the stack address
when the function is called.

This patch only enables __builtin_stack_top for x86 backend.  Using
__builtin_stack_top with other backends will lead to

sorry, unimplemented: ‘__builtin_stack_top’ not supported on this target

TARGET_STACK_TOP_RTX must be defined to enable __builtin_stack_top.
default_stack_top_rtx may be extended to support more backends,
including those with INITIAL_FRAME_ADDRESS_RTX.

gcc/

	PR target/66960
	* builtin-types.def (BT_FN_PTR_VOID): New function type.
	* builtins.c (expand_builtin): Handle BUILT_IN_STACK_TOP.
	(is_simple_builtin): Likewise.
	* ipa-pure-const.c (special_builtin_state): Likewise.
	* builtins.def: Add BUILT_IN_STACK_TOP.
	* function.h (function): Add stack_top_taken.
	* target.def (stack_top_rtx): New target hook.
	* targhooks.c (default_stack_top_rtx): New.
	* targhooks.h (default_stack_top_rtx): Likewise.
	* config/i386/i386.c (ix86_expand_prologue): Sorry if DRAP is
	used and the stack address has been taken.
	(TARGET_STACK_TOP_RTX): New.
	* doc/extend.texi: Document __builtin_stack_top.
	* doc/tm.texi.in (TARGET_STACK_TOP_RTX): New.
	* doc/tm.texi: Regenerated.

gcc/testsuite/

	PR target/66960
	* gcc.target/i386/pr66960-1.c: New test.
	* gcc.target/i386/pr66960-2.c: Likewise.
	* gcc.target/i386/pr66960-3.c: Likewise.
	* gcc.target/i386/pr66960-4.c: Likewise.
	* gcc.target/i386/pr66960-5.c: Likewise.
---
 gcc/builtin-types.def |  1 

Re: Regression in target MIC compiler

2015-08-04 Thread Thomas Schwinge
Hi!

Testing some offloading patches for trunk, I'm encountering the same
problem already reported here:

On Fri, 31 Jul 2015 20:13:02 +0300, Ilya Verbin iver...@gmail.com wrote:
 On Fri, Jul 31, 2015 at 18:59:59 +0200, Jakub Jelinek wrote:
  On Fri, Jul 31, 2015 at 07:53:16PM +0300, Ilya Verbin wrote:
   On Fri, Jul 31, 2015 at 19:27:58 +0300, Ilya Verbin wrote:
I've noticed that target MIC compiler from trunk hangs forever in
lto_input_mode_table in this loop, even on simple testcases.

Confirmed.

On Wed, Feb 18, 2015 at 11:00:35 +0100, Jakub Jelinek wrote:
+  /* First search just the GET_CLASS_NARROWEST_MODE to wider modes,
+if not found, fallback to all modes.  */
+  int pass;
+  for (pass = 0; pass < 2; pass++)
+   for (machine_mode mr = pass ? VOIDmode
+   : GET_CLASS_NARROWEST_MODE (mclass);
+pass ? mr < MAX_MACHINE_MODE : mr != VOIDmode;
+pass ? mr = (machine_mode) (m + 1)
+ : mr = GET_MODE_WIDER_MODE (mr))
+ if (GET_MODE_CLASS (mr) != mclass
+ || GET_MODE_SIZE (mr) != size
+ || GET_MODE_PRECISION (mr) != prec
+ || GET_MODE_INNER (mr) != inner
+ || GET_MODE_IBIT (mr) != ibit
+ || GET_MODE_FBIT (mr) != fbit
+ || GET_MODE_NUNITS (mr) != nunits)
+   continue;

Given that gomp-4_1-branch works ok, the problem was introduced 
somewhere
between 9 and 31 Jul.  I'll try to find the revision.
   
   Shouldn't 'mr' be here instead of 'm'?
  
  I think so.  If it works, patch preapproved.
 
 It fixes the infinite loop, but causes an error:
 lto1: fatal error: unsupported mode QI

Confirmed.

  But wonder what changed that we haven't been triggering it before.
   What mode do you think it is on (mclass/size/prec/inner/ibit/fbit/nunits)?
 
 When it hangs, mr is HImode.

Do you already have any further analysis, a workaround, or even a fix?


Regards,
 Thomas




Re: [PATCH] [AVX512F] Add scatter support for vectorizer

2015-08-04 Thread Uros Bizjak
On Tue, Aug 4, 2015 at 2:15 PM, Richard Biener rguent...@suse.de wrote:

 This patch adds scatter support for vectorizer (for AVX512F
 instructions). Please have a look. Is it OK for trunk?

 +/* Target builtin that implements vector scatter operation.  */
 +DEFHOOK
 +(builtin_scatter,
 + ,
 + tree,
 + (const_tree vectype, const_tree index_type, int scale),
 + NULL)

 please add documentation inline here, like for builtin_gather,
 and let tm.texi be auto-populated.

 Note that the i386 changes need target maintainer approval, CCing
 Uros.

As said many times, please don't mix middle-end and target parts into
one patch. Middle-end part (usually algorithmic one) has to be
discussed, reviewed and approved first, and at that stage, the target
part can be used as an implementation example. Only *after* approval
of the middle-end part, target part can be reviewed.

Not to mention that every part has different reviews, so the review of
the patch can stall due to this fact.

Uros.


Re: [PATCH][AArch64][8/14] Implement TARGET_OPTION_VALID_ATTRIBUTE_P

2015-08-04 Thread Kyrill Tkachov


On 04/08/15 09:53, James Greenhalgh wrote:

On Mon, Aug 03, 2015 at 04:20:13PM +0100, Kyrill Tkachov wrote:

Ok, I've removed usages of 'ret' in favor of returning when appropriate.
In this last one I left the ret (but cleaned up the control flow a bit)
because if the processing fails we need to clean up a bit of state before
returning.

This is OK with the changes below fixed, or commented on as justification.


diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index fc1cec7..3a5482d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -376,6 +378,8 @@ extern bool aarch64_madd_needs_nop (rtx_insn *);
  extern void aarch64_final_prescan_insn (rtx_insn *);
  extern bool
  aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
+bool aarch64_handle_option (struct gcc_options *, struct gcc_options *,
+const struct cl_decoded_option *, location_t);

Please try to keep this file in alphabetical order, first by return type,
then by function name.


Ok, will do.




  void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *);
  int aarch64_ccmp_mode_to_code (enum machine_mode mode);
  
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index d0d62e7..7a369fd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
+static bool
+aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
+{
+  const struct processor *tmp_arch = NULL;
+  enum aarch64_parse_opt_result parse_res
+= aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
+
+  if (parse_res == AARCH64_PARSE_OK)
+{
+  gcc_assert (tmp_arch);
+  selected_arch = tmp_arch;
+  explicit_arch = selected_arch->arch;
+  return true;
+}

Why not pull this in to the switch case below?


I chose to keep the success case separate from error handling and reporting
as it made it easier to find it (and it is the more interesting case in
these functions).  I can add a comment to that effect there if you'd like.

Thanks,
Kyrill




+
+  switch (parse_res)
+{
+  case AARCH64_PARSE_MISSING_ARG:
+   error ("missing architecture name in 'arch' target %s", pragma_or_attr);
+   break;
+  case AARCH64_PARSE_INVALID_ARG:
+   error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
+   break;
+  case AARCH64_PARSE_INVALID_FEATURE:
+   error ("invalid feature modifier %qs for 'arch' target %s",
+  str, pragma_or_attr);
+   break;
+  default:
+   gcc_unreachable ();
+}
+
+  return false;
+}
+
+/* Handle the argument CPU_STR to the cpu= target attribute.
+   PRAGMA_OR_ATTR is used in potential error messages.  */
+
+static bool
+aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
+{
+  const struct processor *tmp_cpu = NULL;
+  enum aarch64_parse_opt_result parse_res
+= aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
+
+  if (parse_res == AARCH64_PARSE_OK)
+{
+  gcc_assert (tmp_cpu);
+  selected_tune = tmp_cpu;
+  explicit_tune_core = selected_tune->ident;
+
+  selected_arch = all_architectures[tmp_cpu->arch];
+  explicit_arch = selected_arch->arch;
+  return true;
+}

Likewise here.


+
+  switch (parse_res)
+{
+  case AARCH64_PARSE_MISSING_ARG:
+   error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
+   break;
+  case AARCH64_PARSE_INVALID_ARG:
+   error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
+   break;
+  case AARCH64_PARSE_INVALID_FEATURE:
+   error ("invalid feature modifier %qs for 'cpu' target %s",
+  str, pragma_or_attr);
+   break;
+  default:
+   gcc_unreachable ();
+}
+
+  return false;
+}
+
+/* Handle the argument STR to the tune= target attribute.
+   PRAGMA_OR_ATTR is used in potential error messages.  */
+
+static bool
+aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
+{
+  const struct processor *tmp_tune = NULL;
+  enum aarch64_parse_opt_result parse_res
+= aarch64_parse_tune (str, &tmp_tune);
+
+  if (parse_res == AARCH64_PARSE_OK)
+{
+  gcc_assert (tmp_tune);
+  selected_tune = tmp_tune;
+  explicit_tune_core = selected_tune->ident;
+  return true;
+}
+

And likewise here.


+  switch (parse_res)
+{
+  case AARCH64_PARSE_INVALID_ARG:
+   error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
+   break;
+  default:
+   gcc_unreachable ();
+}
+
+  return false;
+}
+
+/* Parse an architecture extensions target attribute string specified in STR.
+   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
+   if successful.  Update aarch64_isa_flags to reflect the ISA features
+   modified.
+   PRAGMA_OR_ATTR is used in potential error messages.  */
+
+static bool
+aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
+{
+  enum 

Re: Re: [PATCH][AArch64] PR target/66731 Fix fnmul insn with -frounding-math

2015-08-04 Thread Szabolcs Nagy

On 16/07/15 10:24, Szabolcs Nagy wrote:

On 06/07/15 11:24, James Greenhalgh wrote:


Please make sure in a follow-up patch that the costing logic in
aarch64_rtx_costs also gets updated.



Tested with aarch64-none-linux-gnu cross compiler.
is this OK?

i assume i should backport the fnmul fixes to the gcc-5 branch.

2015-07-16  Szabolcs Nagy  szabolcs.n...@arm.com

* config/aarch64/aarch64.c (aarch64_rtx_costs): Fix NEG cost for FNMUL.
(aarch64_rtx_mult_cost): Fix MULT cost with -frounding-math.



ping.



Re: [PATCH][AArch64] PR target/66731 Fix fnmul insn with -frounding-math

2015-08-04 Thread James Greenhalgh
On Thu, Jul 16, 2015 at 10:24:20AM +0100, Szabolcs Nagy wrote:
 On 06/07/15 11:24, James Greenhalgh wrote:
  
  Please make sure in a follow-up patch that the costing logic in
  aarch64_rtx_costs also gets updated.
  
 
 Tested with aarch64-none-linux-gnu cross compiler.
 is this OK?

This is OK, sorry for the delay.

 i assume i should backport the fnmul fixes to the gcc-5 branch.

I see the ARM fixes went back to the release branches, so yes, a backport
would be appreciated.

Thanks,
James


 
 2015-07-16  Szabolcs Nagy  szabolcs.n...@arm.com
 
   * config/aarch64/aarch64.c (aarch64_rtx_costs): Fix NEG cost for FNMUL.
   (aarch64_rtx_mult_cost): Fix MULT cost with -frounding-math.




[PATCH] Use gassign/gcall in genmatch generated code

2015-08-04 Thread Richard Biener

This makes the code ready for the lightweight overloads, reducing
the amount of checking we have to compile in stage2.
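
For illustration, a sketch of the shape the generated code now takes: the
dyn_cast binds a typed gassign *, so the accessors that follow can use the
lightweight overloads without re-checking the statement kind.

  if (gassign *def = dyn_cast <gassign *> (def_stmt))  /* checked once */
    switch (gimple_assign_rhs_code (def))              /* no re-check */
      {
      case PLUS_EXPR:
	{
	  tree op0 = gimple_assign_rhs1 (def);         /* typed accessor */
	  tree op1 = gimple_assign_rhs2 (def);
	  /* ... match OP0 and OP1 against the pattern ...  */
	  break;
	}
      default:;
      }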

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2015-08-04  Richard Biener  rguent...@suse.de

* genmatch.c (dt_node::gen_kids_1): Use gassign and gcall in
generated code.
(dt_operand::gen_gimple_expr): Adjust.

Index: gcc/genmatch.c
===
--- gcc/genmatch.c  (revision 226576)
+++ gcc/genmatch.c  (working copy)
@@ -2396,7 +2396,7 @@ dt_operand::gen_gimple_expr (FILE *f, in
 match this.  The only sensible operand types are
 SSA names and invariants.  */
  fprintf_indent (f, indent,
- "tree %s = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), %i);\n",
+ "tree %s = TREE_OPERAND (gimple_assign_rhs1 (def), %i);\n",
  child_opname, i);
  fprintf_indent (f, indent,
  "if ((TREE_CODE (%s) == SSA_NAME\n",
@@ -2414,12 +2414,12 @@ dt_operand::gen_gimple_expr (FILE *f, in
}
  else
fprintf_indent (f, indent,
-   "tree %s = gimple_assign_rhs%u (def_stmt);\n",
+   "tree %s = gimple_assign_rhs%u (def);\n",
child_opname, i + 1);
}
   else
fprintf_indent (f, indent,
-   "tree %s = gimple_call_arg (def_stmt, %u);\n",
+   "tree %s = gimple_call_arg (def, %u);\n",
child_opname, i);
   fprintf_indent (f, indent,
  "if ((%s = do_valueize (valueize, %s)))\n",
@@ -2600,9 +2600,9 @@ dt_node::gen_kids_1 (FILE *f, int indent
   if (exprs_len)
{
  fprintf_indent (f, indent,
- "if (is_gimple_assign (def_stmt))\n");
+ "if (gassign *def = dyn_cast <gassign *> (def_stmt))\n");
  fprintf_indent (f, indent,
-   "switch (gimple_assign_rhs_code (def_stmt))\n");
+   "switch (gimple_assign_rhs_code (def))\n");
  indent += 4;
  fprintf_indent (f, indent, "{\n");
  for (unsigned i = 0; i < exprs_len; ++i)
@@ -2625,16 +2625,15 @@ dt_node::gen_kids_1 (FILE *f, int indent
 
   if (fns_len)
{
- if (exprs_len)
-   fprintf_indent (f, indent, "else ");
- else
-   fprintf_indent (f, indent, " ");
-
- fprintf (f, "if (gimple_call_builtin_p (def_stmt, BUILT_IN_NORMAL))\n");
+ fprintf_indent (f, indent,
+ "%sif (gimple_call_builtin_p (def_stmt, BUILT_IN_NORMAL))\n",
+ exprs_len ? "else " : "");
  fprintf_indent (f, indent,
"{\n");
  fprintf_indent (f, indent,
- "tree fndecl = gimple_call_fndecl (def_stmt);\n");
+ "gcall *def = as_a <gcall *> (def_stmt);\n");
+ fprintf_indent (f, indent,
+ "tree fndecl = gimple_call_fndecl (def);\n");
  fprintf_indent (f, indent,
  "switch (DECL_FUNCTION_CODE (fndecl))\n");
  fprintf_indent (f, indent,


Re: Regression in target MIC compiler

2015-08-04 Thread Richard Biener
On Tue, Aug 4, 2015 at 3:06 PM, Ilya Verbin iver...@gmail.com wrote:
 On Tue, Aug 04, 2015 at 14:35:11 +0200, Thomas Schwinge wrote:
 On Fri, 31 Jul 2015 20:13:02 +0300, Ilya Verbin iver...@gmail.com wrote:
  On Fri, Jul 31, 2015 at 18:59:59 +0200, Jakub Jelinek wrote:
 On Wed, Feb 18, 2015 at 11:00:35 +0100, Jakub Jelinek wrote:
 +  /* First search just the GET_CLASS_NARROWEST_MODE to wider 
 modes,
 +  if not found, fallback to all modes.  */
 +  int pass;
 +  for (pass = 0; pass < 2; pass++)
 + for (machine_mode mr = pass ? VOIDmode
 + : GET_CLASS_NARROWEST_MODE 
 (mclass);
 +  pass ? mr < MAX_MACHINE_MODE : mr != VOIDmode;
 +  pass ? mr = (machine_mode) (m + 1)
 +   : mr = GET_MODE_WIDER_MODE (mr))
 +   if (GET_MODE_CLASS (mr) != mclass
 +   || GET_MODE_SIZE (mr) != size
 +   || GET_MODE_PRECISION (mr) != prec
 +   || GET_MODE_INNER (mr) != inner
 +   || GET_MODE_IBIT (mr) != ibit
 +   || GET_MODE_FBIT (mr) != fbit
 +   || GET_MODE_NUNITS (mr) != nunits)
 + continue;

 Given that gomp-4_1-branch works ok, the problem was introduced 
 somewhere
 between 9 and 31 Jul.  I'll try to find the revision.
   
Shouldn't 'mr' be here instead of 'm'?
  
   I think so.  If it works, patch preapproved.

^^^

looks like an obvious error anyway.

Richard.

  It fixes the infinite loop, but causes an error:
  lto1: fatal error: unsupported mode QI

 Confirmed.

   But wonder what changed that we haven't been triggering it before.
    What mode do you think it is on (mclass/size/prec/inner/ibit/fbit/nunits)?
 
  When it hangs, mr is HImode.

 Do you already have any further analysis, a workaround, or even a fix?

  Not yet.  I thought since Jakub is the author of this function, he could
  easily point out what is wrong here :)  Actually, intelmic doesn't require
  lto_input_mode_table, so a temporary workaround is just to disable it.

   -- Ilya
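
For reference, the preapproved fix is to step mr rather than m in the
second pass -- a sketch, not the committed patch:

  pass ? mr = (machine_mode) (mr + 1)
       : mr = GET_MODE_WIDER_MODE (mr))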


Re: [PATCH] Simplify vector compare-not-select sequence

2015-08-04 Thread Bill Schmidt
Thanks for verifying!

Also verified it still works on powerpc64le-unknown-linux-gnu.
Committed as obvious.

Thanks,
Bill


2015-08-04  Bill Schmidt  wschm...@vnet.linux.ibm.com

* gcc.target/powerpc/vec-cmp-sel.c: Avoid test failure on machines
without VSX and Power8 vector support.


Index: gcc/testsuite/gcc.target/powerpc/vec-cmp-sel.c
===
--- gcc/testsuite/gcc.target/powerpc/vec-cmp-sel.c  (revision 226505)
+++ gcc/testsuite/gcc.target/powerpc/vec-cmp-sel.c  (working copy)
@@ -1,6 +1,7 @@
 /* { dg-do compile { target powerpc64*-*-* } } */
 /* { dg-require-effective-target powerpc_p8vector_ok } */
-/* { dg-options "-maltivec -O2" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-maltivec -O2 -mvsx -mpower8-vector" } */
 /* { dg-final { scan-assembler vcmpgtsd } } */
 /* { dg-final { scan-assembler-not xxlnor } } */
 



On Mon, 2015-08-03 at 19:35 +0200, Andreas Schwab wrote:
 Bill Schmidt wschm...@linux.vnet.ibm.com writes:
 
  Index: gcc/testsuite/gcc.target/powerpc/vec-cmp-sel.c
  ===
  --- gcc/testsuite/gcc.target/powerpc/vec-cmp-sel.c  (revision 226505)
  +++ gcc/testsuite/gcc.target/powerpc/vec-cmp-sel.c  (working copy)
  @@ -1,6 +1,7 @@
   /* { dg-do compile { target powerpc64*-*-* } } */
 
 If you want -m64 you need dg-require-effective-target lp64, but I see no
 need for that.
 
   /* { dg-require-effective-target powerpc_p8vector_ok } */
   -/* { dg-options "-maltivec -O2" } */
   +/* { dg-require-effective-target powerpc_vsx_ok } */
   +/* { dg-options "-maltivec -O2 -mvsx -mpower8-vector" } */
   /* { dg-final { scan-assembler vcmpgtsd } } */
   /* { dg-final { scan-assembler-not xxlnor } } */
 
 Looks good.
 
 PASS: gcc.target/powerpc/vec-cmp-sel.c (test for excess errors)
 PASS: gcc.target/powerpc/vec-cmp-sel.c scan-assembler vcmpgtsd
 PASS: gcc.target/powerpc/vec-cmp-sel.c scan-assembler-not xxlnor
 PASS: gcc.target/powerpc/vec-cmp-sel.c (test for excess errors)
 PASS: gcc.target/powerpc/vec-cmp-sel.c scan-assembler vcmpgtsd
 PASS: gcc.target/powerpc/vec-cmp-sel.c scan-assembler-not xxlnor
 
 Andreas.
 




Re: [RFC] [Patch]: Try and vectorize with shift for mult expr with power 2 integer constant.

2015-08-04 Thread Richard Biener
On Tue, Aug 4, 2015 at 4:21 PM, Richard Biener
richard.guent...@gmail.com wrote:
 On Tue, Aug 4, 2015 at 4:15 PM, Richard Sandiford
 richard.sandif...@arm.com wrote:
 Richard Biener richard.guent...@gmail.com writes:
 On Tue, Aug 4, 2015 at 10:52 AM, Kumar, Venkataramanan
 venkataramanan.ku...@amd.com wrote:
 Hi Jeff,

 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Jeff Law
 Sent: Monday, August 03, 2015 11:42 PM
 To: Kumar, Venkataramanan; Jakub Jelinek
 Cc: Richard Biener (richard.guent...@gmail.com); gcc-patches@gcc.gnu.org
 Subject: Re: [RFC] [Patch]: Try and vectorize with shift for mult expr 
 with
 power 2 integer constant.

 On 08/02/2015 05:03 AM, Kumar, Venkataramanan wrote:
  Hi Jakub,
 
  Thank you for reviewing the patch.
 
  I have incorporated your comments in the attached patch.
 Note Jakub is on PTO for the next 3 weeks.

  Thank you for this information.



 
 
 
  vectorize_mults_via_shift.diff.txt
 
 
  diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
  b/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
 Jakub would probably like more testcases :-)

 The most obvious thing to test would be other shift factors.

 A negative test to verify we don't try to turn a multiply by non-constant 
 or
 multiply by a constant that is not a power of 2 into shifts.

 I have added negative test in the attached patch.



 [ Would it make sense, for example, to turn a multiply by 3 into a 
 shift-add
 sequence?  As Jakub said, choose_mult_variant can be your friend. ]

 Yes I will do that in a follow up patch.

 The new change log becomes

 gcc/ChangeLog
 2015-08-04  Venkataramanan Kumar  venkataramanan.ku...@amd.com
  * tree-vect-patterns.c (vect_recog_mult_pattern): New function for 
 vectorizing
 multiplication patterns.
  * tree-vectorizer.h: Adjust the number of patterns.

 gcc/testsuite/ChangeLog
 2015-08-04  Venkataramanan Kumar  venkataramanan.ku...@amd.com
  * gcc.dg/vect/vect-mult-pattern-1.c: New
 * gcc.dg/vect/vect-mult-pattern-2.c: New

 Bootstrapped and reg tested on aarch64-unknown-linux-gnu.

 Ok for trunk ?

 +  if (TREE_CODE (oprnd0) != SSA_NAME
 +  || TREE_CODE (oprnd1) != INTEGER_CST
 +  || TREE_CODE (itype) != INTEGER_TYPE

 INTEGRAL_TYPE_P (itype)

 +  optab = optab_for_tree_code (LSHIFT_EXPR, vectype, optab_vector);
 +  if (!optab
 +  || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
 +   return NULL;
 +

 indent of the return stmt looks wrong

 +  /* Handle constant operands that are positive or negative powers of 2.  */
 +  if ( wi::exact_log2 (oprnd1) != -1  ||
 +   wi::exact_log2 (wi::neg (oprnd1)) != -1)

 no space after (, || goes to the next line.

 +{
 +  tree shift;
 +
 +  if (wi::exact_log2 (oprnd1) != -1)

 please cache wi::exact_log2

  in fact the first if () looks redundant if you simply put an else return
  NULL after an else if (wi::exact_log2 (wi::neg (oprnd1)) != -1)

 Note that the issue with INT_MIN is that wi::neg (INT_MIN) is INT_MIN
 again, but it seems that wi::exact_log2 returns -1 in that case so you
 are fine (and in fact not handling this case).

 Are you sure it returns -1 for INT_MIN?  It isn't supposed to, assuming
 INT_MIN is shorthand for minimum value for a signed type.  wide_ints
 aren't signed, so INT_MIN is indistinguishable from an unsigned
  1<<(prec-1).

 No, not sure.  I spotted

   /* Reject cases where there are implicit -1 blocks above HIGH.  */
    if (x.len * HOST_BITS_PER_WIDE_INT < x.precision && x.sign_mask () < 0)
 return -1;

 and thought that would catch it.  I mean the tree value is negative so
 exact_log2 must see it is a negative value.

Now re-sent with Richard's company disclaimer stripped...

 Richard.

 wi::exact_log2 (wi::to_widest (INT_MIN)) would return -1, but that's
 the difference between infinite precision and exact precision.

 Thanks,
 Richard
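
For illustration, the kind of loop the new pattern targets -- a
hypothetical example where a power-of-2 multiply becomes a vectorizable
left shift:

  void
  mult_by_8 (int *a, int n)
  {
    int i;
    for (i = 0; i < n; i++)
      a[i] = a[i] * 8;   /* recognized as a[i] << 3 */
  }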


RFA: RL78: Remove far operand optimization in rl78_force_nonfar_3

2015-08-04 Thread Nick Clifton
Hi DJ,

  It turns out that the optimization in rl78_force_nonfar_3 to allow
  some special cases to be kept in far pointers does not always work.
  The test case included with this patch will trigger ICEs if the
  optimization is allowed to persist.
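
  For illustration, the removed special case covered in-place updates of
  a global far variable, where operands[0] and operands[1] are the same
  far MEM -- a hypothetical example:

    int __far counter;

    void
    bump (int b)
    {
      counter += b;   /* destination and first source are identical */
    }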

  So, may I check this patch in please ?

Cheers
  Nick

gcc/ChangeLog
2015-08-04  Nick Clifton  ni...@redhat.com

* config/rl78/rl78.c (rl78_force_nonfar_3): Remove optimization
to allow identical far pointers to remain.


gcc/testsuite/ChangeLog
2015-08-04  Nick Clifton  ni...@redhat.com

* gcc.target/rl78: New directory.
* gcc.target/rl78/rl78.exp: New file: Test driver.
* gcc.target/rl78/test_addm3.c: New file: Test adds.

Index: gcc/config/rl78/rl78.c
===
--- gcc/config/rl78/rl78.c	(revision 226548)
+++ gcc/config/rl78/rl78.c	(working copy)
@@ -608,13 +608,6 @@
   int did = 0;
   rtx temp_reg = NULL;
 
-  /* As an exception, we allow two far operands if they're identical
- and the third operand is not a MEM.  This allows global variables
- to be incremented, for example.  */
-  if (rtx_equal_p (operands[0], operands[1])
-   && ! MEM_P (operands[2]))
-return 0;
-
   /* FIXME: Likewise.  */
   if (rl78_far_p (operands[1]))
 {
--- /dev/null	2015-08-04 08:05:06.160754276 +0100
+++ gcc/testsuite/gcc.target/rl78/rl78.exp	2015-08-04 14:03:20.759389085 +0100
@@ -0,0 +1,43 @@
+# Copyright (C) 2015 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  
+
+# GCC testsuite that uses the `dg.exp' driver.
+
+# Exit immediately if this isn't the right target.
+if { ![istarget rl78-*-*] } then {
+  return
+}
+
+# Load support procs.
+load_lib gcc-dg.exp
+
+# If a testcase doesn't have special options, use these.
+global DEFAULT_CFLAGS
+if ![info exists DEFAULT_CFLAGS] then {
+set DEFAULT_CFLAGS ""
+}
+
+# Initialize `dg'.
+dg-init
+
+# Find all tests
+set tests [lsort [find $srcdir/$subdir *.\[cS\]]]
+
+# Main loop.
+gcc-dg-runtest $tests  $DEFAULT_CFLAGS
+
+# All done.
+dg-finish
--- /dev/null	2015-08-04 08:05:06.160754276 +0100
+++ gcc/testsuite/gcc.target/rl78/test_addm3.c	2015-08-04 15:12:53.509456677 +0100
@@ -0,0 +1,100 @@
+/* Remove `-ansi' from options to enable the use of __far and long long.  */
+/* { dg-options "" } */
+
+#define ADD(TYPE, name)			\
+  TYPE		\
+  add##name(TYPE a, TYPE b)			\
+  {		\
+return a + b;\
+  }		\
+  
+#define ADDIMM(TYPE, name)			\
+  TYPE		\
+  addimm##name(TYPE a)\
+  {		\
+return a + 50;\
+  }		\
+
+#define ADDFAR(TYPE, name)			\
+  TYPE __far gf##name;\
+  void		\
+  addfar##name(TYPE __far *pa, TYPE b)		\
+  {		\
+gf##name += b;\
+*pa += 50;	\
+  }		\
+  
+
+ADD (char, qi3)
+ADD (int, hi3)
+ADD (long, si3)
+ADD (long long, di3)
+ADD (float, sf3)
+ADD (double, df3)
+
+ADDIMM (char, qi3)
+ADDIMM (int, hi3)
+ADDIMM (long, si3)
+ADDIMM (long long, di3)
+ADDIMM (float, sf3)
+ADDIMM (double, df3)
+
+ADDFAR (char, qi3)
+ADDFAR (int, hi3)
+ADDFAR (long, si3)
+ADDFAR (long long, di3)
+ADDFAR (float, sf3)
+ADDFAR (double, df3)
+
+char aqi1, aqi2;
+int ahi1, ahi2;
+long asi1, asi2;
+long long adi1, adi2;
+float af1, af2;
+double ad1, ad2;
+
+void
+testglobal (void)
+{
+  aqi1 += aqi2;
+  ahi1 += ahi2;
+  asi1 += asi2;
+  adi1 += adi2;
+  af1 += af2;
+  ad1 += ad2;
+}
+
+void
+testglobal2 (void)
+{
+  aqi1 += 10;
+  ahi1 += 11;
+  asi1 += 12;
+  adi1 += 13;
+  af1 += 2.0;
+  ad1 += 4.0;
+}
+
+void
+testptr (char *aqi1, int *ahi1, long *asi1, long long *adi1, float *af1, double *ad1, 
+	 char *aqi2, int *ahi2, long *asi2, long long *adi2, float *af2, double *ad2)
+{
+  *aqi1 += *aqi2;
+  *ahi1 += *ahi2;
+  *asi1 += *asi2;
+  *adi1 += *adi2;
+  *af1 += *af2;
+  *ad1 += *ad2;
+}
+
+void
+testptr2 (char *aqi1, int *ahi1, long *asi1, long long *adi1, float *af1, double *ad1)
+{
+  *aqi1 += 5;
+  *ahi1 += 10;
+  *asi1 += 11;
+  *adi1 += 12;
+  *af1 += 4.5;
+  *ad1 += 5.5;
+}
+


[PATCH/libiberty] Remove use of strtod in libiberty/d-demangle.c

2015-08-04 Thread Iain Buclaw
Fixes PR 18669 raised against gdb/binutils.

https://sourceware.org/bugzilla/show_bug.cgi?id=18669

While it is possible to roll our own strtod that handles hexadecimal
to float conversion, I'm no longer interested in taking time out to
implement or maintain such a thing.  So the next obvious thing to do
is nothing, which is what I've settled for.

Regards
Iain.
2015-08-04  Iain Buclaw  ibuc...@gdcproject.org

	* d-demangle.c (dlang_parse_real): Remove call to strtod.
	(strtod): Remove declaration.
	* testsuite/d-demangle-expected: Update float and complex literal
	tests to check correct hexadecimal demangling.


--- a/libiberty/d-demangle.c
+++ b/libiberty/d-demangle.c
@@ -28,7 +28,7 @@ If not, see http://www.gnu.org/licenses/.  */
 
 /* This file exports one function; dlang_demangle.
 
-   This file imports strtol and strtod for decoding mangled literals.  */
+   This file imports strtol for decoding mangled literals.  */
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -44,7 +44,6 @@ If not, see http://www.gnu.org/licenses/.  */
 #include <stdlib.h>
 #else
 extern long strtol (const char *nptr, char **endptr, int base);
-extern double strtod (const char *nptr, char **endptr);
 #endif
 
 #include "demangle.h"
@@ -970,8 +969,6 @@ dlang_parse_real (string *decl, const char *mangled)
 {
   char buffer[64];
   int len = 0;
-  double value;
-  char *endptr;
 
   /* Handle NAN and +-INF.  */
   if (strncmp (mangled, "NAN", 3) == 0)
@@ -1035,14 +1032,10 @@ dlang_parse_real (string *decl, const char *mangled)
   mangled++;
 }
 
-  /* Convert buffer from hexadecimal to floating-point.  */
+  /* Write out the demangled hexadecimal, rather than trying to
+ convert the buffer into a floating-point value.  */
   buffer[len] = '\0';
-  value = strtod (buffer, &endptr);
-
-  if (endptr == NULL || endptr != (buffer + len))
-return NULL;
-
-  len = snprintf (buffer, sizeof(buffer), "%#g", value);
+  len = strlen (buffer);
   string_appendn (decl, buffer, len);
   return mangled;
 }
--- a/libiberty/testsuite/d-demangle-expected
+++ b/libiberty/testsuite/d-demangle-expected
@@ -719,19 +719,19 @@ demangle.test!('\U000186a0')
 #
 --format=dlang
 _D8demangle17__T4testVde0A8P6Zv
-demangle.test!(42.)
+demangle.test!(0x0.A8p6)
 #
 --format=dlang
 _D8demangle16__T4testVdeA8P2Zv
-demangle.test!(42.)
+demangle.test!(0xA.8p2)
 #
 --format=dlang
 _D8demangle18__T4testVdeN0A8P6Zv
-demangle.test!(-42.)
+demangle.test!(-0x0.A8p6)
 #
 --format=dlang
 _D8demangle31__T4testVde0F6E978D4FDF3B646P7Zv
-demangle.test!(123.456)
+demangle.test!(0x0.F6E978D4FDF3B646p7)
 #
 --format=dlang
 _D8demangle15__T4testVdeNANZv
@@ -747,27 +747,27 @@ demangle.test!(-Inf)
 #
 --format=dlang
 _D8demangle23__T4testVfe0FFP128Zv
-demangle.test!(3.40282e+38)
+demangle.test!(0x0.FFp128)
 #
 --format=dlang
 _D8demangle32__T4testVde0F8P1024Zv
-demangle.test!(1.79769e+308)
+demangle.test!(0x0.F8p1024)
 #
 --format=dlang
 _D8demangle19__T4testVfe08PN125Zv
-demangle.test!(1.17549e-38)
+demangle.test!(0x0.8p-125)
 #
 --format=dlang
 _D8demangle20__T4testVde08PN1021Zv
-demangle.test!(2.22507e-308)
+demangle.test!(0x0.8p-1021)
 #
 --format=dlang
 _D8demangle51__T4testVrc0C4CDP4c0B666P6Zv
-demangle.test!(12.3000+45.6000i)
+demangle.test!(0x0.C4CDp4+0x0.B666p6i)
 #
 --format=dlang
 _D8demangle52__T4testVrcN0C4CDP4c0B666P6Zv
-demangle.test!(-12.3000+45.6000i)
+demangle.test!(-0x0.C4CDp4+0x0.B666p6i)
 #
 --format=dlang
 _D8demangle22__T4testVG3ua3_616263Zv
@@ -787,7 +787,7 @@ demangle.test!([1, 2, 3, 4])
 #
 --format=dlang
 _D8demangle25__T4testVAdA2e08P1eN08P1Zv
-demangle.test!([1.0, -1.0])
+demangle.test!([0x0.8p1, -0x0.8p1])
 #
 --format=dlang
 _D8demangle23__T4testVHiiA2i1i2i3i4Zv


Re: [Bug fortran/52846] [F2008] Support submodules - part 3/3

2015-08-04 Thread Paul Richard Thomas
Dear Mikael,

Thanks for your comments. I will commit the patch tonight. If folk get
steamed up about .smod files appearing when they compile their
favourite non-submodule-based code, I guess that we can put in a
compilation flag to suppress them. We have plenty of time to tweak
this before the release of 6 branch.

Once committed, I will get on with the documentation and updating of
gfortran wiki.

Cheers

Paul

On 3 August 2015 at 17:39, Mikael Morin mikael.mo...@sfr.fr wrote:
 On 03/08/2015 14:36, Paul Richard Thomas wrote:

 Dear Mikael,

 Thanks for your green light!

 I have been mulling over the trans-decl part of the patch and having
 been wondering if it is necessary.

 You mean marking entities as public?  Or setting the hidden visibility
 attribute?  Or both?
 I think both are necessary.

 Without optimization, private
 entities can be linked to. Given the discussion concerning the
 combination of submodules and private entities, I wonder if this is
 not sufficient? Within submodule scope, an advisory could be given for
 undefined references to suggest recompiling the module without
 optimization or making the entities public.

 About recompiling without optimization:
 If the module contains no code, I guess that would be OK.
 But otherwise, it would be pretty bad.
 And one would have to do the same for submodules of a submodule: the parent
 submodule would be compiled without optimization. :-(

 About making the entities public:
 I think the goal of submodules is providing a way to specify a (hopefully)
 stable interface free of any internal implementation details that users
 would start playing with if the opportunity was given to them.  Making all
 entities public would go against that.


 I've been reading about the hidden visibility attribute since you submitted
 the 3/3 patch(es).  I think it's the right thing. :-)

 Mikael



-- 
Outside of a dog, a book is a man's best friend. Inside of a dog it's
too dark to read.

Groucho Marx


Re: [AArch64] Tighten direct call pattern to repair -fno-plt

2015-08-04 Thread James Greenhalgh
On Thu, Jul 16, 2015 at 11:21:25AM +0100, Jiong Wang wrote:
 
 Jeff Law writes:
 
  On 06/23/2015 02:29 AM, Ramana Radhakrishnan wrote:
 
  If you try disabling the REG_EQUAL note generation [*], you'll probably 
  find a
  performance regression on arm32 (and probably on aarch64 as well?
  we only
 
  IMHO disabling the REG_EQUAL note generation is the wrong way to go about 
  this.
  Agreed.
 
  Irrespective of combine, as a first step we should fix the predicates
  and the call expanders to prevent this sort of replacement in the
  backends. Tightening the predicates in the call patterns will achieve
  the same for you and then we can investigate the use of GOT_PREL. My
  recollection of this is that you need to work out when it's more
  beneficial to use GOT_PREL over GOT but it's been a while since I
  looked in that area.
  Also agreed.  This is primarily a backend issue with the call patterns.
 
  This is similar to the situation on the PA with the 32bit SOM runtime 
  where direct and indirect calls have different calling conventions. 
  Those different calling conventions combined with the early loading of 
  the parameter registers in effect restricts us from being able to 
  transform an indirect call into a direct call (combine) or vice-versa (cse).
 
  The way we handled this was to split the calls into two patterns, one 
  for direct one for indirect and tightening their predicates appropriately.
 
  Jeff
 
 Attachment is the patch which repair -fno-plt support for AArch64.
 
 aarch64_is_noplt_call_p will only be true if:
 
   * gcc is generating position independent code.
   * function symbol has declaration.
   * either -fno-plt or (no_plt) attribute specified.
   * it's an external function.
   
 OK for trunk?

OK.

Thanks,
James

 
 2015-07-16  Jiong Wang  jiong.w...@arm.com
 
 gcc/
   * config/aarch64/aarch64-protos.h (aarch64_is_noplt_call_p): New
   declaration.
   * config/aarch64/aarch64.c (aarch64_is_noplt_call_p): New function.
   * config/aarch64/aarch64.md (call_value_symbol): Check noplt
   scenarios.
   (call_symbol): Ditto.
 
 gcc/testsuite/
   * gcc.target/aarch64/noplt_1.c: New testcase.
   * gcc.target/aarch64/noplt_2.c: Ditto.
 

((Though do check the ChangeLog formatting when you commit :-).))



Re: [AArch64][sibcall]Tighten direct call pattern to repair -fno-plt

2015-08-04 Thread James Greenhalgh
On Tue, Jul 21, 2015 at 01:42:35PM +0100, Jiong Wang wrote:
 
 Jiong Wang writes:
 
  Alexander Monakov writes:
 
  Attachment is the patch which repair -fno-plt support for AArch64.
  
  aarch64_is_noplt_call_p will only be true if:
  
* gcc is generating position independent code.
* function symbol has declaration.
* either -fno-plt or (no_plt) attribute specified.
  * it's an external function.

  OK for trunk?
  
  2015-07-16  Jiong Wang  jiong.w...@arm.com
  
  gcc/
* config/aarch64/aarch64-protos.h (aarch64_is_noplt_call_p): New
declaration.
* config/aarch64/aarch64.c (aarch64_is_noplt_call_p): New function.
* config/aarch64/aarch64.md (call_value_symbol): Check noplt
scenarios.
(call_symbol): Ditto.
 
  Shouldn't the same treatment be applied to tailcall 
  (sibcall_{,value_}symbol)
  patterns?  I guess it could be done as a followup patch, but would be nice 
  if
  that isn't forgotten.
 
  Thanks for the reminder, that will be done as a followup patch.
 
 Patch attached.
 
  Added one more restriction to the Usf constraint, which is used by the
  sibcall pattern when matching a direct call.
 
  Given an example like
 
 void
 cal_novalue (int a)
 {
   dec (a);
 }
 
  when -fpic -fno-plt are specified, we now generate:
 
 cal:
 adrpx1, :got:dec
 ldr x1, [x1, #:got_lo12:dec]
 br  x1
 
 instead of:
 
 cal:
 b dec

OK.

Thanks,
James

 2015-07-20  Jiong Wang  jiong.w...@arm.com
 
 gcc/
   * config/aarch64/constraints.md (Usf): Add the test of
   aarch64_is_noplt_call_p.
 
 gcc/testsuite/
   * gcc.target/aarch64/noplt_3.c: New test.



Re: [PATCH][AArch64] Change aarch64 vector cost to match vectorizer

2015-08-04 Thread Pawel Kupidura

On 03/08/15 17:26, James Greenhalgh wrote:

On Mon, Jul 27, 2015 at 02:22:41PM +0100, Pawel Kupidura wrote:

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 10df325..ffafc3f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,7 @@
+2015-07-27  Pawel Kupidura pawel.kupid...@arm.com


Two spaces between your name and your email address, like so:

2015-07-27  Pawel Kupidura  pawel.kupid...@arm.com


+
+* config/aarch64/aarch64.c: Changed inner loop statement cost
+to be consistent with vectorizer code.
+


s/Changed/Change


   2015-07-26  Uros Bizjakubiz...@gmail.com

   * config/alpha/alpha.c: Use SUBREG_P predicate.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 020f63c..3b6f8c5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7079,15 +7079,9 @@ aarch64_add_stmt_cost (void *data, int count,
enum vect_cost_for_stmt kind,

 /* Statements in an inner loop relative to the loop being
vectorized are weighted more heavily.  The value here is
- a function (linear for now) of the loop nest level.  */
+ arbitrary and could potentially be improved with analysis.  */


Your mail client has mangled the tabs in this diff, so the patch will
not apply in this form. Could you try posting again having resolved the
issues with your mail client?


 if (where == vect_body && stmt_info && stmt_in_inner_loop_p
(stmt_info))
-{
-  loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
-  unsigned nest_level = loop_depth (loop);
-
-  count *= nest_level;
-}
+count *= 50; /* FIXME */


Likewise here.

Thanks,
James


Hi,

I'm sorry about the issues with formatting, it should be fixed now. 
Here's corrected version with diff to current trunk.


Thanks,
Pawel Kupidura

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index fdc4a7e..d1c6663 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,7 @@
+2015-08-04  Pawel Kupidura  pawel.kupid...@arm.com
+   * config/aarch64/aarch64.c: Change inner loop statement cost
+   to be consistent with other targets.
+
 2015-08-03  Abe Skolnik  a.skol...@samsung.com

* tree-if-conv.c: Fix various typos in comments.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2b1ae36..173a385 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7086,15 +7086,9 @@ aarch64_add_stmt_cost (void *data, int count, 
enum vect_cost_for_stmt kind,


   /* Statements in an inner loop relative to the loop being
 vectorized are weighted more heavily.  The value here is
-a function (linear for now) of the loop nest level.  */
+arbitrary and could potentially be improved with analysis.  */
   if (where == vect_body && stmt_info && stmt_in_inner_loop_p 
(stmt_info))

-   {
- loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
- struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
- unsigned nest_level = loop_depth (loop);
-
- count *= nest_level;
-   }
+   count *= 50; /*  FIXME  */

   retval = (unsigned) (count * stmt_cost);
   cost[where] += retval;



Re: [PATCH 9/15][AArch64] vld{2,3,4}{,_lane,_dup}, vcombine, vcreate

2015-08-04 Thread Alan Lawrence

Attachment has gone awol here too. Sorry for the bother, please ignore 
previous...

Alan Lawrence wrote:

James Greenhalgh wrote:

On Tue, Jul 28, 2015 at 12:25:55PM +0100, Alan Lawrence wrote:

gcc/ChangeLog:

* config/aarch64/aarch64.c (aarch64_split_simd_combine): Add V4HFmode.
* config/aarch64/aarch64-builtins.c (VAR13, VAR14): New.
(aarch64_scalar_builtin_types, aarch64_init_simd_builtin_scalar_types):
Add __builtin_aarch64_simd_hf.
* config/aarch64/arm_neon.h (float16x4x2_t, float16x8x2_t,
float16x4x3_t, float16x8x3_t, float16x4x4_t, float16x8x4_t,
vcombine_f16, vst2_lane_f16, vst2q_lane_f16, vst3_lane_f16,
vst3q_lane_f16, vst4_lane_f16, vst4q_lane_f16, vld2_f16, vld2q_f16,
vld3_f16, vld3q_f16, vld4_f16, vld4q_f16, vld2_dup_f16, vld2q_dup_f16,
vld3_dup_f16, vld3q_dup_f16, vld4_dup_f16, vld4q_dup_f16,
vld2_lane_f16, vld2q_lane_f16, vld3_lane_f16, vld3q_lane_f16,
vld4_lane_f16, vld4q_lane_f16, vst2_f16, vst2q_f16, vst3_f16,
vst3q_f16, vst4_f16, vst4q_f16, vcreate_f16): New.

* config/aarch64/iterators.md (VALLDIF, Vtype, Vetype, Vbtype,
V_cmp_result, v_cmp_result): Add cases for V4HF and V8HF.
(VDC, Vdbl): Add V4HF.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vldN_1.c: Add float16x4_t and float16x8_t cases.
* gcc.target/aarch64/vldN_dup_1.c: Likewise.
* gcc.target/aarch64/vldN_lane_1.c: Likewise.

Hi Alan,

The arm_neon.h portion of this patch does not apply after Charles' recent
changes. Could you please rebase and resubmit the patch for review?

Thanks,
James


Ah, indeed, thanks. Here's a rebased version, using Charles' new versions of 
__(LD|ST)[234]_LANE_FUNC. I'll follow with a patch adding corresponding 
lane_f16_indices tests in a separate email.


(Changelog as before)

Bootstrapped + check-gcc on aarch64-none-linux-gnu.




[PATCH, i386] Disable AVX-512VL insns for scalar mode operands on -march=knl.

2015-08-04 Thread Kirill Yukhin
Hello,

For vec_dup and vec_concat patterns (of V2DF mode) the second operand
is of scalar mode, so `ix86_hard_regno_mode_ok' didn't block EVEX
registers of non-512b modes (when AVX-512VL is turned off).
This turns into 128/256b xmm[>15] regs being emitted on -march=knl.

There are likely more patterns with a similar issue. Will look for them later.

Bootstrapped and regtested.

If no objections, I'll commit it tomorrow morning (Moscow time).

gcc/
* config/i386/i386.md (define_attr isa): Add avx512vl and
noavx512vl.
(define_attr enabled): Handle avx512vl and noavx512vl.
* config/i386/sse.md (define_insn vec_dupv2df<mask_name>): Split
AVX-512 alternative out of SSE.
(define_insn *vec_concatv2df): Ditto.

--
Thanks, K

commit 924990a6e8d38b6ebff9dd9a79e285ef81890202
Author: Kirill Yukhin kirill.yuk...@intel.com
Date:   Mon Aug 3 15:21:06 2015 +0300

Fix vec_concatv2df and vec_dupv2df to block wrongly enabled AVX-512VL insns.

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5c5c1fc..9ffe9aa 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -784,7 +784,8 @@
 (define_attr "isa" "base,x64,x64_sse4,x64_sse4_noavx,x64_avx,nox64,
   sse2,sse2_noavx,sse3,sse4,sse4_noavx,avx,noavx,
   avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,
-   fma_avx512f,avx512bw,noavx512bw,avx512dq,noavx512dq"
+   fma_avx512f,avx512bw,noavx512bw,avx512dq,noavx512dq,
+   avx512vl,noavx512vl"
   (const_string "base"))
 
 (define_attr "enabled" ""
@@ -819,6 +820,8 @@
 (eq_attr "isa" "noavx512bw") (symbol_ref "!TARGET_AVX512BW")
 (eq_attr "isa" "avx512dq") (symbol_ref "TARGET_AVX512DQ")
 (eq_attr "isa" "noavx512dq") (symbol_ref "!TARGET_AVX512DQ")
+(eq_attr "isa" "avx512vl") (symbol_ref "TARGET_AVX512VL")
+(eq_attr "isa" "noavx512vl") (symbol_ref "!TARGET_AVX512VL")
]
(const_int 1)))
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0970f0e..a509369 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -8638,44 +8638,47 @@
 (set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")])
 
 (define_insn "vec_dupv2df<mask_name>"
-  [(set (match_operand:V2DF 0 "register_operand"     "=x,v")
+  [(set (match_operand:V2DF 0 "register_operand"     "=x,x,v")
 	(vec_duplicate:V2DF
-	  (match_operand:DF 1 "nonimmediate_operand" " 0,vm")))]
+	  (match_operand:DF 1 "nonimmediate_operand" " 0,xm,vm")))]
   "TARGET_SSE2 && <mask_avx512vl_condition>"
   "@
    unpcklpd\t%0, %0
-   %vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
-  [(set_attr "isa" "noavx,sse3")
+   %vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}
+   vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "isa" "noavx,sse3,avx512vl")
    (set_attr "type" "sselog1")
-   (set_attr "prefix" "orig,maybe_vex")
-   (set_attr "mode" "V2DF,DF")])
+   (set_attr "prefix" "orig,maybe_vex,evex")
+   (set_attr "mode" "V2DF,DF,DF")])
 
 (define_insn "*vec_concatv2df"
-  [(set (match_operand:V2DF 0 "register_operand"     "=x,v,v,x,x,v,x,x")
+  [(set (match_operand:V2DF 0 "register_operand"     "=x,x,v,x,v,x,x,v,x,x")
 	(vec_concat:V2DF
-	  (match_operand:DF 1 "nonimmediate_operand" " 0,v,m,0,x,m,0,0")
-	  (match_operand:DF 2 "vector_move_operand"  " x,v,1,m,m,C,x,m")))]
+	  (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,m,0,0")
+	  (match_operand:DF 2 "vector_move_operand"  " x,x,v,1,1,m,m,C,x,m")))]
   "TARGET_SSE
    && (!(MEM_P (operands[1]) && MEM_P (operands[2]))
        || (TARGET_SSE3 && rtx_equal_p (operands[1], operands[2])))"
   "@
    unpcklpd\t{%2, %0|%0, %2}
    vunpcklpd\t{%2, %1, %0|%0, %1, %2}
+   vunpcklpd\t{%2, %1, %0|%0, %1, %2}
    %vmovddup\t{%1, %0|%0, %1}
+   vmovddup\t{%1, %0|%0, %1}
    movhpd\t{%2, %0|%0, %2}
    vmovhpd\t{%2, %1, %0|%0, %1, %2}
    %vmovsd\t{%1, %0|%0, %1}
    movlhps\t{%2, %0|%0, %2}
    movhps\t{%2, %0|%0, %2}"
-  [(set_attr "isa" "sse2_noavx,avx,sse3,sse2_noavx,avx,sse2,noavx,noavx")
+  [(set_attr "isa"
+     "sse2_noavx,avx,avx512vl,sse3,avx512vl,sse2_noavx,avx,sse2,noavx,noavx")
    (set (attr "type")
      (if_then_else
        (eq_attr "alternative" "0,1,2")
        (const_string "sselog")
        (const_string "ssemov")))
-   (set_attr "prefix_data16" "*,*,*,1,*,*,*,*")
-   (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex,orig,orig")
-   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,DF,V4SF,V2SF")])
+   (set_attr "prefix_data16" "*,*,*,*,*,1,*,*,*,*")
+   (set_attr "prefix"
+     "orig,vex,evex,maybe_vex,evex,orig,vex,maybe_vex,orig,orig")
+   (set_attr "mode" "V2DF,V2DF,V2DF, DF, DF, V1DF,V1DF,DF,V4SF,V2SF")])
 
 ;
 ;;


[gomp4] Worker reduction builtin

2015-08-04 Thread Nathan Sidwell
I've committed this to gomp4  branch.  It creates a new builtin to be used for 
worker-level reductions that Cesar is working on.  When the builtin is expanded 
it allocates a slot in a new .shared array to hold the reduction variable.  This 
array is reused for reductions on different loops.


I also realized the lock and unlock expanders needed to emit memory barriers
so that writes made by one thread in the protected region can be seen by
other threads in the region.
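
For illustration only, a sketch of the allocation scheme described above
(hypothetical helper; the patch itself tracks this via loop_red::hwm and
worker_red_hwm):

  /* Hand a reduction variable of SIZE/ALIGN bytes the next slot in the
     per-loop region of the shared buffer; the maximum over all loops
     sizes the __worker_red array.  */
  static unsigned
  alloc_reduction_slot (unsigned *loop_hwm, unsigned *buffer_hwm,
			unsigned size, unsigned align)
  {
    unsigned offset = (*loop_hwm + align - 1) & ~(align - 1);
    *loop_hwm = offset + size;
    if (*loop_hwm > *buffer_hwm)
      *buffer_hwm = *loop_hwm;
    return offset;
  }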


nathan
2015-08-04  Nathan Sidwell  nat...@codesourcery.com

	* config/nvptx/nvptx.md (UNSPECV_MEMBAR): New.
	(nvptx_membar): New insn.
	* config/nvptx/nvptx.c (BARRIER_SHARED, BARRIER_GLOBAL,
	BARRIER_SYS): New.
	(lock_names, lock_space): Constify.
	(lock_level): New.
	(worker_red_hwm, worker_red_align, worker_red_name,
	worker_red_sym): New.
	(var_red_t, struct loop_red): New types.
	(loop_red): New.
	(nvptx_print_operand): Add 'B' case.
	(nvptx_reorg_reductions): New.
	(nvptx_reorg): Call it.
	(nvptx_file_end): Emit worker reduction array.
	(struct builtin_descriptor):  Remove builtin pointer from
	expander.
	(nvptx_expand_shuffle_down, nvptx_expand_lock_unlock,
	nvptx_expand_lock, nvptx_expand_unlock): Adjust.
	(nvptx_expand_lock_unlock): Emit barrier too.
	(nvptx_expand_work_red_addr): New.
	(NT_UINTPTR_UINT_UINT, NT_ULLPTR_UINT_UINT, NT_FLTPTR_UINT_UINT,
	NT_DBLPTR_UINT_UINT): New.
	(builtins): Add new builtins.
	(nvptx_init_builtins): Create new types.
	(nvptx_expand_builtin): Adjust expander call.

Index: gcc/config/nvptx/nvptx.md
===
--- gcc/config/nvptx/nvptx.md	(revision 226539)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -65,6 +65,7 @@
UNSPECV_CAS
UNSPECV_XCHG
UNSPECV_BARSYNC
+   UNSPECV_MEMBAR
UNSPECV_DIM_POS
 
UNSPECV_FORK
@@ -1564,6 +1565,11 @@
   
   "bar.sync\\t%0;")
 
+(define_insn "nvptx_membar"
+  [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+		UNSPECV_MEMBAR)]
+  ""
+  "membar%M0;")
 
 ;; spinlock and unlock
 (define_insn "nvptx_spinlock"
Index: gcc/config/nvptx/nvptx.c
===
--- gcc/config/nvptx/nvptx.c	(revision 226539)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -69,6 +69,11 @@
 #define SHUFFLE_BFLY 2
 #define SHUFFLE_IDX 3
 
+/* Memory barrier levels.  */
+#define BARRIER_SHARED 0
+#define BARRIER_GLOBAL 1
+#define BARRIER_SYS 2
+
 /* Record the function decls we've written, and the libfuncs and function
decls corresponding to them.  */
 static std::stringstream func_decls;
@@ -107,15 +112,47 @@ static GTY(()) rtx worker_bcast_sym;
 #define LOCK_GLOBAL 0
 #define LOCK_SHARED 1
 #define LOCK_MAX2
-static const char *const lock_names[] = 
-  {"__global_lock", "__shared_lock"};
-static const char *const lock_regions[] = 
-  {"global", "shared"};
-static unsigned lock_space[] =
-  {ADDR_SPACE_GLOBAL, ADDR_SPACE_SHARED};
+static const char *const lock_names[] = {"__global_lock", "__shared_lock"};
+static const unsigned lock_space[] = {ADDR_SPACE_GLOBAL, ADDR_SPACE_SHARED};
+static const unsigned lock_level[] = {BARRIER_GLOBAL, BARRIER_SHARED};
+static const unsigned lock_level[] = {BARRIER_GLOBAL, BARRIER_SHARED};
 static GTY(()) rtx lock_syms[LOCK_MAX];
 static bool lock_used[LOCK_MAX];
 
+/* Size of buffer needed for worker reductions.  This has to be
+   disjoint from the worker broadcast array, as both may be live
+   concurrently.  */
+static unsigned worker_red_hwm;
+static unsigned worker_red_align;
+#define worker_red_name "__worker_red"
+static GTY(()) rtx worker_red_sym;
+
+/* To process worker-level reductions we need a buffer in CTA local
+   (.shared) memory.  As the number of loops per function and number
+   of reductions per loop are likely to be small numbers, we use
+   simple unsorted vectors to hold the mappings.  */
+
+/* Mapping from a reduction to an offset within the worker reduction
+   array.  */
+typedef std::pair<unsigned, unsigned> var_red_t;
+
+/* Mapping from loops within a function to lists of reductions on that
+   loop.  */
+struct loop_red
+{
+  unsigned id;  /* Loop ID.  */
+  unsigned hwm;  /* Allocated worker buffer for this loop.  */
+  auto_vec<var_red_t> vars;   /* Reduction variables of the loop.  */
+
+  loop_red (unsigned id_)
+  :id (id_), hwm (0) 
+  {
+  }
+};
+
+/* It would be nice to put this into machine_function, but auto_vec
+   pulls in too much other stuff.   */
+static auto_vec<loop_red> loop_reds;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -147,6 +184,9 @@ nvptx_option_override (void)
   worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
   worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
 
+  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
+  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
   for (unsigned ix = LOCK_MAX; ix--;)
 lock_syms[ix] = gen_rtx_SYMBOL_REF (Pmode, lock_names[ix]);
 }
@@ -1893,6 +1933,7 @@ nvptx_print_operand_address (FILE *file,

[gomp4] some ptx cleanups

2015-08-04 Thread Nathan Sidwell

I've applied this to gomp4 branch.

There was an inconsistency with the 'S' assembly formatter, in that it didn't 
print the leading '.', but the others did.  Also the pseudos structure is no 
longer used.


nathan
2015-08-04  Nathan Sidwell  nat...@codesourcery.com

	* config/nvptx/nvptx.md (nvptx_shuffle<mode>): Adjust assembly.
	* config/nvptx/nvptx.c (nvptx_print_operand): Adjust 'S' case to
	print leading . for consistency.
	* config/nvptx/nvptx.h (struct nvptx_pseudo_info): Delete.
	(machine_function): Remove pseudos field.

Index: gcc/config/nvptx/nvptx.md
===
--- gcc/config/nvptx/nvptx.md	(revision 226569)
+++ gcc/config/nvptx/nvptx.md	(working copy)
@@ -1419,7 +1419,7 @@
 		 (match_operand:SI 3 "const_int_operand" "n")]
 		  UNSPEC_SHUFFLE))]
   ""
-  "%.\\tshfl.%S3.b32\\t%0, %1, %2, 31;")
+  "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;")
 
 ;; extract parts of a 64 bit object into 2 32-bit ints
 (define_insn unpackmodesi2
Index: gcc/config/nvptx/nvptx.c
===
--- gcc/config/nvptx/nvptx.c	(revision 226569)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -2008,7 +2008,7 @@ nvptx_print_operand (FILE *file, rtx x,
 	unsigned kind = UINTVAL (x);
 	static const char *const kinds[] = 
	  {"up", "down", "bfly", "idx"};
-	fprintf (file, "%s", kinds[kind]);
+	fprintf (file, ".%s", kinds[kind]);
   }
   break;
   
Index: gcc/config/nvptx/nvptx.h
===
--- gcc/config/nvptx/nvptx.h	(revision 226569)
+++ gcc/config/nvptx/nvptx.h	(working copy)
@@ -225,12 +225,6 @@ struct nvptx_args {
 #define LEGITIMATE_PIC_OPERAND_P(X) 1
 
 
-struct nvptx_pseudo_info
-{
-  int true_size;
-  int renumber;
-};
-
 #if defined HOST_WIDE_INT
 struct GTY(()) machine_function
 {
@@ -239,7 +233,6 @@ struct GTY(()) machine_function
   tree funtype;
   bool has_call_with_varargs;
   bool has_call_with_sc;
-  struct GTY((skip)) nvptx_pseudo_info *pseudos;
   HOST_WIDE_INT outgoing_stdarg_size;
   int ret_reg_mode;
   int punning_buffer_size;


[committed, PATCH] PR target/67110: gcc.target/i386/iamcu/test_struct_returning.c execution test FAILs with -fpic

2015-08-04 Thread H.J. Lu
Since IAMCU tests clear all scratch integer registers with:

  asm __volatile__ ("xor %%eax, %%eax\n\t" \
                    "xor %%edx, %%edx\n\t" \
                    "xor %%ecx, %%ecx\n\t" \
                    ::: "eax", "edx", "ecx");

the PIC register may be trashed between the point where it is set and the
point where it is used.  This patch compiles the IAMCU tests with
-fno-pie -no-pie.

H.J.
--
PR target/67110
* gcc.target/i386/iamcu/abi-iamcu.exp (additional_flags): Add
-fno-pie -no-pie.
---
 gcc/testsuite/gcc.target/i386/iamcu/abi-iamcu.exp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/iamcu/abi-iamcu.exp 
b/gcc/testsuite/gcc.target/i386/iamcu/abi-iamcu.exp
index b5b3261..8bcee0f 100644
--- a/gcc/testsuite/gcc.target/i386/iamcu/abi-iamcu.exp
+++ b/gcc/testsuite/gcc.target/i386/iamcu/abi-iamcu.exp
@@ -29,7 +29,7 @@ if { (![istarget x86_64-*-linux*] && ![istarget i?86-*-linux*])
 
 torture-init
 set-torture-options $C_TORTURE_OPTIONS
-set additional_flags "-miamcu -W -Wall -Wno-abi"
+set additional_flags "-miamcu -W -Wall -Wno-abi -fno-pie -no-pie"
 
 foreach src [lsort [glob -nocomplain $srcdir/$subdir/test_*.c]] {
 if {[runtest_file_p $runtests $src]} {
-- 
2.4.3



Re: [PATCH][AArch64][8/14] Implement TARGET_OPTION_VALID_ATTRIBUTE_P

2015-08-04 Thread James Greenhalgh
On Tue, Aug 04, 2015 at 09:58:37AM +0100, Kyrill Tkachov wrote:
 
 On 04/08/15 09:53, James Greenhalgh wrote:
  On Mon, Aug 03, 2015 at 04:20:13PM +0100, Kyrill Tkachov wrote:
  Ok, I've removed usages of 'ret' in favor of returning when appropriate.
  In this last one I left the ret (but cleaned up the control flow a bit)
  because if the processing fails we need to clean up a bit of state before
  returning.
  This is OK with the changes below fixed, or commented on as justification.
 
  diff --git a/gcc/config/aarch64/aarch64-protos.h 
  b/gcc/config/aarch64/aarch64-protos.h
  index fc1cec7..3a5482d 100644
  --- a/gcc/config/aarch64/aarch64-protos.h
  +++ b/gcc/config/aarch64/aarch64-protos.h
  @@ -376,6 +378,8 @@ extern bool aarch64_madd_needs_nop (rtx_insn *);
extern void aarch64_final_prescan_insn (rtx_insn *);
extern bool
aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
  +bool aarch64_handle_option (struct gcc_options *, struct gcc_options *,
  +   const struct cl_decoded_option *, location_t);
  Please try to keep this file in alphabetical order, first by return type,
  then by function name.
 
 Ok, will do.
 
 
void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *);
int aarch64_ccmp_mode_to_code (enum machine_mode mode);

  diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
  index d0d62e7..7a369fd 100644
  --- a/gcc/config/aarch64/aarch64.c
  +++ b/gcc/config/aarch64/aarch64.c
  +static bool
  +aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
  +{
  +  const struct processor *tmp_arch = NULL;
  +  enum aarch64_parse_opt_result parse_res
  += aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
  +
  +  if (parse_res == AARCH64_PARSE_OK)
  +{
  +  gcc_assert (tmp_arch);
  +  selected_arch = tmp_arch;
  +  explicit_arch = selected_arch->arch;
  +  return true;
  +}
  Why not pull this in to the switch case below?
 
 I chose to keep the success case separate from error handling and reporting
 as it made it easier to find it (and it is the more interesting case in
 these functions).  I can add a comment to that effect there if you'd like.

I thought that might be it. It looks unusual to me, but I don't have
strong feelings against it, so I'm happy for you to leave it as is if
that is your preference.

Thanks,
James
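
For reference, the sort of usage these handlers accept -- hypothetical
examples (exact attribute spellings per the final documentation):

  __attribute__ ((target ("arch=armv8-a+crc")))
  int
  with_crc (int x)
  {
    return x + 1;
  }

  __attribute__ ((target ("cpu=cortex-a57+nosimd")))
  int
  tuned (int x)
  {
    return x * 2;
  }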



[PATCH, libgfortran]: Improve get_fpu_trap_exceptions

2015-08-04 Thread Uros Bizjak
Following patch substantially improves generated code for
get_fpu_trap_exceptions, reducing insn count from 31 to 9.

2015-08-04  Uros Bizjak  ubiz...@gmail.com

* config/fpu-387.h (get_fpu_trap_exceptions): Add temporary variable
to improve generated code.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
Index: config/fpu-387.h
===
--- config/fpu-387.h(revision 226547)
+++ config/fpu-387.h(working copy)
@@ -215,12 +215,13 @@ set_fpu (void)
 int
 get_fpu_trap_exceptions (void)
 {
+  unsigned short cw;
+  int mask;
   int res = 0;
-  unsigned short cw;
 
   __asm__ __volatile__ ("fstcw\t%0" : "=m" (cw));
-  cw &= _FPU_MASK_ALL;
-
+  mask = cw;
+
   if (has_sse())
 {
   unsigned int cw_sse;
@@ -228,16 +229,18 @@ get_fpu_trap_exceptions (void)
   __asm__ __volatile__ ("%vstmxcsr\t%0" : "=m" (cw_sse));
 
   /* The SSE exception masks are shifted by 7 bits.  */
-  cw = cw | ((cw_sse >> 7) & _FPU_MASK_ALL);
+  mask |= (cw_sse >> 7);
 }
 
-  if (~cw & _FPU_MASK_IM) res |= GFC_FPE_INVALID;
-  if (~cw & _FPU_MASK_DM) res |= GFC_FPE_DENORMAL;
-  if (~cw & _FPU_MASK_ZM) res |= GFC_FPE_ZERO;
-  if (~cw & _FPU_MASK_OM) res |= GFC_FPE_OVERFLOW;
-  if (~cw & _FPU_MASK_UM) res |= GFC_FPE_UNDERFLOW;
-  if (~cw & _FPU_MASK_PM) res |= GFC_FPE_INEXACT;
+  mask = ~mask & _FPU_MASK_ALL;
 
+  if (mask & _FPU_MASK_IM) res |= GFC_FPE_INVALID;
+  if (mask & _FPU_MASK_DM) res |= GFC_FPE_DENORMAL;
+  if (mask & _FPU_MASK_ZM) res |= GFC_FPE_ZERO;
+  if (mask & _FPU_MASK_OM) res |= GFC_FPE_OVERFLOW;
+  if (mask & _FPU_MASK_UM) res |= GFC_FPE_UNDERFLOW;
+  if (mask & _FPU_MASK_PM) res |= GFC_FPE_INEXACT;
+
   return res;
 }
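
For reference, a minimal standalone sketch of the combined mask logic
above (a hedged illustration, not part of the patch; it assumes the
_FPU_MASK_* and GFC_FPE_* macros from config/fpu-387.h, and the x87
convention that a *set* mask bit disables trapping):

static int
trap_bits_from_control_words (unsigned short cw, unsigned int cw_sse)
{
  int res = 0;
  int mask = cw;

  /* The SSE exception mask bits sit 7 bits higher in MXCSR than the
     corresponding x87 bits in the control word.  */
  mask |= (cw_sse >> 7);

  /* Invert once: a clear mask bit means the exception actually traps.  */
  mask = ~mask & _FPU_MASK_ALL;

  if (mask & _FPU_MASK_IM) res |= GFC_FPE_INVALID;
  if (mask & _FPU_MASK_ZM) res |= GFC_FPE_ZERO;
  /* ... and likewise for the remaining GFC_FPE_* flags.  */
  return res;
}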
 


Re: [ARM] Fix vget_lane for big-endian targets

2015-08-04 Thread Christophe Lyon
On 4 August 2015 at 14:09, Christophe Lyon christophe.l...@linaro.org wrote:
 On 21 July 2015 at 16:01, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:

 On 16/07/15 08:56, Christophe Lyon wrote:

 AdvSIMD vget_lane tests currently fail on armeb targets when dealing
 with vectors of two 64-bit elements. This patch fixes it, by adding a
 code fragment similar to what is done in other cases. I could have
 simplified it a bit given that the vector width is known, but I chose
 to hardcode 'reg_nelts = 2' to keep the code closer to what is done
 elsewhere.

 OK for trunk?

 Christophe

 2015-07-16  Christophe Lyon  christophe.l...@linaro.org

 * config/arm/neon.md (neon_vget_lanev2di): Handle big-endian
 targets.


 I see we do this for other lanewise patterns as well.
 Has this been tested on an arm big-endian target?

 If so, ok for trunk.

 I forgot to mention that yes, I actually tested it on arm big-endian,
 using QEMU.


Since Alan committed his patch, there was a conflict with mine.
Here is what I committed, the change being obvious enough IMO.
(I did re-run make check on armeb using qemu)

Christophe


 Christophe.


 Thanks,
 Kyrill



 diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
 index 654d9d5..59ddc5b 100644
 --- a/gcc/config/arm/neon.md
 +++ b/gcc/config/arm/neon.md
 @@ -2736,6 +2736,19 @@
  (match_operand:SI 2 "immediate_operand" "")]
  "TARGET_NEON"
   {
 +  if (BYTES_BIG_ENDIAN)
 +{
 +  /* The intrinsics are defined in terms of a model where the
 +element ordering in memory is vldm order, whereas the generic
 +RTL is defined in terms of a model where the element ordering
 +in memory is array order.  Convert the lane number to conform
 +to this model.  */
 +  unsigned int elt = INTVAL (operands[2]);
 +  unsigned int reg_nelts = 2;
 +  elt ^= reg_nelts - 1;
 +  operands[2] = GEN_INT (elt);
 +}
 +
 switch (INTVAL (operands[2]))
   {
   case 0:


2015-08-04  Christophe Lyon  christophe.l...@linaro.org

	* config/arm/neon.md (neon_vget_lanev2di): Handle big-endian
	targets.

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 4af74ce..b1bf26a 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2731,7 +2731,22 @@
	(match_operand:SI 2 "immediate_operand" "")]
   "TARGET_NEON"
 {
-  int lane = INTVAL (operands[2]);
+  int lane;
+
+if (BYTES_BIG_ENDIAN)
+{
+  /* The intrinsics are defined in terms of a model where the
+	 element ordering in memory is vldm order, whereas the generic
+	 RTL is defined in terms of a model where the element ordering
+	 in memory is array order.  Convert the lane number to conform
+	 to this model.  */
+  unsigned int elt = INTVAL (operands[2]);
+  unsigned int reg_nelts = 2;
+  elt ^= reg_nelts - 1;
+  operands[2] = GEN_INT (elt);
+}
+
+  lane = INTVAL (operands[2]);
   gcc_assert ((lane == 0) || (lane == 1));
   emit_move_insn (operands[0], lane == 0
 ? gen_lowpart (DImode, operands[1])
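
The remapping is a plain XOR with (nelts - 1); a hedged scalar
illustration for the two-element case (hypothetical helper, not part of
the patch):

/* armeb lane remapping for a 2-element vector, mirroring the
   'elt ^= reg_nelts - 1' above: intrinsic (vldm-order) lane 0 is
   array-order lane 1 and vice versa.  */
static unsigned int
remap_lane_v2di (unsigned int elt)
{
  const unsigned int reg_nelts = 2;
  return elt ^ (reg_nelts - 1);   /* 0 -> 1, 1 -> 0.  */
}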


Re: [PATCH][AArch64] Change aarch64 vector cost to match vectorizer

2015-08-04 Thread Richard Sandiford
Pawel Kupidura pawel.kupid...@arm.com writes:
 Hi,

 The issue was flowed format forced by mail client. I've tested it and
 the patch should apply now.

Thanks, applied based on James's OK.

Richard



[PATCH] Add X != !X pattern

2015-08-04 Thread Richard Biener

This adds a pattern matching x != ~x on GIMPLE and allows CCP to
properly optimize the added testcase.  gimple_simplify gets confused
by the existing ~x == 1 - x == 0 pattern which 
gimple_fold_stmt_to_constant_1 cannot reduce to a single value.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2015-08-04  Richard Biener  rguent...@suse.de

* gimple-fold.c (gimple_fold_stmt_to_constant_1): Canonicalize
bool compares on RHS.
* match.pd: Add X ==/!= !X is false/true pattern.

* gcc.dg/tree-ssa/ssa-ccp-38.c: New testcase.

Index: gcc/gimple-fold.c
===
--- gcc/gimple-fold.c   (revision 226559)
+++ gcc/gimple-fold.c   (working copy)
@@ -5012,10 +5012,8 @@ gimple_fold_stmt_to_constant_1 (gimple s
   further propagation.  */
if (subcode == POINTER_PLUS_EXPR)
  {
-   /* Handle binary operators that can appear in GIMPLE form.  */
tree op0 = (*valueize) (gimple_assign_rhs1 (stmt));
tree op1 = (*valueize) (gimple_assign_rhs2 (stmt));
-
if (TREE_CODE (op0) == ADDR_EXPR
	    && TREE_CODE (op1) == INTEGER_CST)
  {
@@ -5027,6 +5025,38 @@ gimple_fold_stmt_to_constant_1 (gimple s
  unshare_expr (op0), off));
  }
  }
+   /* Canonicalize bool != 0 and bool == 0 appearing after
+  valueization.  While gimple_simplify handles this
+	      it can get confused by the ~X == 1 -> X == 0 transform
+	      which we can't reduce to an SSA name or a constant
+  (and we have no way to tell gimple_simplify to not
+  consider those transforms in the first place).  */
+   else if (subcode == EQ_EXPR
+|| subcode == NE_EXPR)
+ {
+   tree lhs = gimple_assign_lhs (stmt);
+   tree op0 = gimple_assign_rhs1 (stmt);
+   if (useless_type_conversion_p (TREE_TYPE (lhs),
+  TREE_TYPE (op0)))
+ {
+   tree op1 = (*valueize) (gimple_assign_rhs2 (stmt));
+   op0 = (*valueize) (op0);
+   if (subcode == NE_EXPR)
+ {
+   if (integer_zerop (op1))
+ return op0;
+   else if (integer_zerop (op0))
+ return op1;
+ }
+   else
+ {
+   if (integer_onep (op1))
+ return op0;
+   else if (integer_onep (op0))
+ return op1;
+ }
+ }
+ }
return NULL_TREE;
 
   case GIMPLE_TERNARY_RHS:
Index: gcc/match.pd
===
--- gcc/match.pd(revision 226559)
+++ gcc/match.pd(working copy)
@@ -618,6 +618,11 @@ (define_operator_list CBRT BUILT_IN_CBRT
  (simplify
   (op:c truth_valued_p@0 (logical_inverted_value @0))
   { constant_boolean_node (true, type); }))
+/* X ==/!= !X is false/true.  */
+(for op (eq ne)
+ (simplify
+  (op:c truth_valued_p@0 (logical_inverted_value @0))
+  { constant_boolean_node (op == NE_EXPR ? true : false, type); }))
 
 /* If arg1 and arg2 are booleans (or any single bit type)
then try to simplify:
Index: gcc/testsuite/gcc.dg/tree-ssa/ssa-ccp-38.c
===
--- gcc/testsuite/gcc.dg/tree-ssa/ssa-ccp-38.c  (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/ssa-ccp-38.c  (working copy)
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-ccp1" } */
+
+int foo (_Bool x)
+{
+  _Bool t = 1;
+  _Bool xx = !x;
+  _Bool y = xx == t;
+  _Bool z = y == x;
+  return z ? 1 : 0;
+}
+
+/* { dg-final { scan-tree-dump "return 0;" "ccp1" } } */
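
For illustration, the kind of source the new pattern lets CCP fold away
entirely (a hedged example, not part of the testsuite):

/* b != !b is always true for a _Bool, so with the X ==/!= !X pattern
   ccp1 can fold this whole function down to 'return 1;'.  */
int
always_one (_Bool b)
{
  return b != !b;
}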


Re: [PING] Re: [PATCH] c/66516 - missing diagnostic on taking the address of a builtin function

2015-08-04 Thread Jason Merrill

On 08/03/2015 07:02 PM, Martin Sebor wrote:

I've prototyped this approach in a couple places in the middle
end. Both implementations are very simple and work great when
the code isn't optimized away. The problem is that the
optimizations done at various points between the front end and
the final gimplification make the diagnostics inconsistent.

For instance, with F being the type of the builtin, this is
diagnosed in the first prototype:

   F* foo () { if (0) return __builtin_trap; return 0; }

but this isn't:

   F* bar () { return 0 ? __builtin_trap : 0; }

because the ternary ?: expression is folded into a constant by
the front end even before it reaches the gimplifier, while the
if-else statement isn't folded until the control flow graph is
built. (As an aside: I'm wondering why that is. Why have the
front end do any optimization at all if the middle can and
does them too, and better?)


This is largely historical baggage, I think, from days where computers 
had less memory and we were trying to do as much processing as possible 
immediately.


The c++-delayed-folding branch delays folding the ?: expression until 
the end of the function, at which point we can see better what context 
the function is being used in, which could simplify your patch.


But your question leads me to wonder if we need to do front end folding 
there, either...


Jason



Re: Regression in target MIC compiler

2015-08-04 Thread Ilya Verbin
On Tue, Aug 04, 2015 at 16:07:42 +0200, Richard Biener wrote:
 On Tue, Aug 4, 2015 at 3:06 PM, Ilya Verbin iver...@gmail.com wrote:
  On Tue, Aug 04, 2015 at 14:35:11 +0200, Thomas Schwinge wrote:
  On Fri, 31 Jul 2015 20:13:02 +0300, Ilya Verbin iver...@gmail.com wrote:
   On Fri, Jul 31, 2015 at 18:59:59 +0200, Jakub Jelinek wrote:
  On Wed, Feb 18, 2015 at 11:00:35 +0100, Jakub Jelinek wrote:
  +  /* First search just the GET_CLASS_NARROWEST_MODE to wider modes,
  +  if not found, fallback to all modes.  */
  +  int pass;
  +  for (pass = 0; pass < 2; pass++)
  + for (machine_mode mr = pass ? VOIDmode
  + : GET_CLASS_NARROWEST_MODE (mclass);
  +  pass ? mr < MAX_MACHINE_MODE : mr != VOIDmode;
  +  pass ? mr = (machine_mode) (m + 1)
  +   : mr = GET_MODE_WIDER_MODE (mr))
  +   if (GET_MODE_CLASS (mr) != mclass
  +   || GET_MODE_SIZE (mr) != size
  +   || GET_MODE_PRECISION (mr) != prec
  +   || GET_MODE_INNER (mr) != inner
  +   || GET_MODE_IBIT (mr) != ibit
  +   || GET_MODE_FBIT (mr) != fbit
  +   || GET_MODE_NUNITS (mr) != nunits)
  + continue;
 
  Given that gomp-4_1-branch works ok, the problem was introduced 
  somewhere
  between 9 and 31 Jul.  I'll try to find the revision.

 Shouldn't 'mr' be here instead of 'm'?
   
I think so.  If it works, patch preapproved.
 
 ^^^
 
 looks like an obvious error anyway.
 
 Richard.

Yeah, but the fix for this typo doesn't really help, since it exposes another
error in this function.

vvv

   It fixes the infinite loop, but causes an error:
   lto1: fatal error: unsupported mode QI
 
  Confirmed.
 
But wonder what changed that we haven't been triggering it before.
What mode do you think it is (mclass/size/prec/inner/ibit/fbit/nunits)?
  
   When in hangs, mr is HImode.
 
  Do you already have any further analysis, a workaround, or even a fix?
 
  Not yet.  I thought since Jakub is the author of this function, he could 
  easily
  point what is wrong here :)  Actually, intelmic doesn't require
  lto_input_mode_table, so temporary workaround is just to disable it.

  -- Ilya


Re: [RFC] [Patch]: Try and vectorize with shift for mult expr with power 2 integer constant.

2015-08-04 Thread Richard Sandiford
Richard Biener richard.guent...@gmail.com writes:
 On Tue, Aug 4, 2015 at 4:21 PM, Richard Biener
 richard.guent...@gmail.com wrote:
 On Tue, Aug 4, 2015 at 4:15 PM, Richard Sandiford
 richard.sandif...@arm.com wrote:
 Richard Biener richard.guent...@gmail.com writes:
 On Tue, Aug 4, 2015 at 10:52 AM, Kumar, Venkataramanan
 venkataramanan.ku...@amd.com wrote:
 Hi Jeff,

 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Jeff Law
 Sent: Monday, August 03, 2015 11:42 PM
 To: Kumar, Venkataramanan; Jakub Jelinek
 Cc: Richard Beiner (richard.guent...@gmail.com); gcc-patches@gcc.gnu.org
 Subject: Re: [RFC] [Patch]: Try and vectorize with shift for mult
 expr with
 power 2 integer constant.

 On 08/02/2015 05:03 AM, Kumar, Venkataramanan wrote:
  Hi Jakub,
 
  Thank you for reviewing the patch.
 
  I have incorporated your comments in the attached patch.
 Note Jakub is on PTO for the next 3 weeks.

  Thank you for this information.



 
 
 
  vectorize_mults_via_shift.diff.txt
 
 
  diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
  b/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
 Jakub would probably like more testcases :-)

 The most obvious thing to test would be other shift factors.

 A negative test to verify we don't try to turn a multiply by
 non-constant or
 multiply by a constant that is not a power of 2 into shifts.

 I have added negative test in the attached patch.



 [ Would it make sense, for example, to turn a multiply by 3 into a
 shift-add
 sequence?  As Jakub said, choose_mult_variant can be your friend. ]

 Yes I will do that in a follow up patch.

 The new change log becomes

 gcc/ChangeLog
 2015-08-04  Venkataramanan Kumar  venkataramanan.ku...@amd.com
  * tree-vect-patterns.c (vect_recog_mult_pattern): New function for 
 vectorizing
 multiplication patterns.
  * tree-vectorizer.h: Adjust the number of patterns.

 gcc/testsuite/ChangeLog
 2015-08-04  Venkataramanan Kumar  venkataramanan.ku...@amd.com
  * gcc.dg/vect/vect-mult-pattern-1.c: New
 * gcc.dg/vect/vect-mult-pattern-2.c: New

 Bootstrapped and reg tested on aarch64-unknown-linux-gnu.

 Ok for trunk ?

 +  if (TREE_CODE (oprnd0) != SSA_NAME
 +  || TREE_CODE (oprnd1) != INTEGER_CST
 +  || TREE_CODE (itype) != INTEGER_TYPE

 INTEGRAL_TYPE_P (itype)

 +  optab = optab_for_tree_code (LSHIFT_EXPR, vectype, optab_vector);
 +  if (!optab
 +  || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
 +   return NULL;
 +

 indent of the return stmt looks wrong

 +  /* Handle constant operands that are postive or negative powers of 2.  
 */
 +  if ( wi::exact_log2 (oprnd1) != -1  ||
 +   wi::exact_log2 (wi::neg (oprnd1)) != -1)

 no space after (, || goes to the next line.

 +{
 +  tree shift;
 +
 +  if (wi::exact_log2 (oprnd1) != -1)

 please cache wi::exact_log2

 in fact the first if () looks redundant if you simply put an else
 return NULL
 after a else if (wi::exact_log2 (wi::neg (oprnd1)) != -1)

 Note that the issue with INT_MIN is that wi::neg (INT_MIN) is INT_MIN
 again, but it seems that wi::exact_log2 returns -1 in that case so you
 are fine (and in fact not handling this case).

 Are you sure it returns -1 for INT_MIN?  It isn't supposed to, assuming
 INT_MIN is shorthand for minimum value for a signed type.  wide_ints
 aren't signed, so INT_MIN is indistinguishable from an unsigned
 1 << (prec-1).

 No, not sure.  I spotted

   /* Reject cases where there are implicit -1 blocks above HIGH.  */
    if (x.len * HOST_BITS_PER_WIDE_INT < x.precision && x.sign_mask () < 0)
 return -1;

 and thought that would catch it.  I mean the tree value is negative so
 exact_log2 must see it is a negative value.

That's handling the compressed format, e.g.:

  {1 << 63}

as a 64-bit short-hand for a 256-bit:

  {1 << 63, -1, -1, -1}

In this case more than one of the low x.precision bits are known to be set.

 Now re-sent with Richard's company disclaimer stripped...

Doh.  Sent via the right channels this time...

Thanks,
Richard
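
The INT_MIN corner case discussed in this thread can be illustrated
without wide_int (a hedged sketch in plain C; wi::exact_log2 itself
follows different, precision-aware rules):

#include <stdint.h>

/* Classify a multiplier as a positive or negative power of 2, the
   property the vectorizer pattern wants.  INT64_MIN needs care:
   negating it in signed arithmetic overflows, so negate as unsigned,
   where -u is well defined modulo 2^64.  */
static int
pow2_kind (int64_t c)
{
  uint64_t u = (uint64_t) c;
  if (c > 0 && (u & (u - 1)) == 0)
    return 1;			/* c == 2^k */
  u = -u;
  if (c < 0 && (u & (u - 1)) == 0)
    return -1;			/* c == -2^k, including INT64_MIN */
  return 0;
}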



[PATCH] Remove fold_binary use from gimple_fold_stmt_to_constant_1

2015-08-04 Thread Richard Biener

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2015-08-04  Richard Biener  rguent...@suse.de

* gimple-fold.c (gimple_fold_stmt_to_constant_1): Remove
dispatching to fold_binary for GIMPLE_BINARY_RHS and for
comparisons embedded in [VEC_]COND_EXPRs.

Index: gcc/gimple-fold.c
===
--- gcc/gimple-fold.c   (revision 226489)
+++ gcc/gimple-fold.c   (working copy)
@@ -5008,28 +5008,26 @@ gimple_fold_stmt_to_constant_1 (gimple s
return NULL_TREE;
 
   case GIMPLE_BINARY_RHS:
-{
-  /* Handle binary operators that can appear in GIMPLE form.  */
-  tree op0 = (*valueize) (gimple_assign_rhs1 (stmt));
-  tree op1 = (*valueize) (gimple_assign_rhs2 (stmt));
-
-	 /* Translate &x + CST into an invariant form suitable for
-	    further propagation.  */
-	 if (gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR
-	     && TREE_CODE (op0) == ADDR_EXPR
-	     && TREE_CODE (op1) == INTEGER_CST)
-   {
- tree off = fold_convert (ptr_type_node, op1);
- return build_fold_addr_expr_loc
-  (loc,
-   fold_build2 (MEM_REF,
-TREE_TYPE (TREE_TYPE (op0)),
-unshare_expr (op0), off));
-   }
-
-  return fold_binary_loc (loc, subcode,
- gimple_expr_type (stmt), op0, op1);
-}
+   /* Translate &x + CST into an invariant form suitable for
+  further propagation.  */
+   if (subcode == POINTER_PLUS_EXPR)
+ {
+   /* Handle binary operators that can appear in GIMPLE form.  */
+   tree op0 = (*valueize) (gimple_assign_rhs1 (stmt));
+   tree op1 = (*valueize) (gimple_assign_rhs2 (stmt));
+
+	   if (TREE_CODE (op0) == ADDR_EXPR
+	       && TREE_CODE (op1) == INTEGER_CST)
+ {
+   tree off = fold_convert (ptr_type_node, op1);
+   return build_fold_addr_expr_loc
+   (loc,
+fold_build2 (MEM_REF,
+ TREE_TYPE (TREE_TYPE (op0)),
+ unshare_expr (op0), off));
+ }
+ }
+   return NULL_TREE;
 
   case GIMPLE_TERNARY_RHS:
 {
@@ -5037,20 +5035,6 @@ gimple_fold_stmt_to_constant_1 (gimple s
   tree op0 = (*valueize) (gimple_assign_rhs1 (stmt));
   tree op1 = (*valueize) (gimple_assign_rhs2 (stmt));
   tree op2 = (*valueize) (gimple_assign_rhs3 (stmt));
-
- /* Fold embedded expressions in ternary codes.  */
-	 if ((subcode == COND_EXPR
-	      || subcode == VEC_COND_EXPR)
-	     && COMPARISON_CLASS_P (op0))
-   {
- tree op00 = (*valueize) (TREE_OPERAND (op0, 0));
- tree op01 = (*valueize) (TREE_OPERAND (op0, 1));
- tree tem = fold_binary_loc (loc, TREE_CODE (op0),
- TREE_TYPE (op0), op00, op01);
- if (tem)
-   op0 = tem;
-   }
-
   return fold_ternary_loc (loc, subcode,
   gimple_expr_type (stmt), op0, op1, op2);
 }


Re: [AArch64] Improve TLS Descriptor pattern to release RTL loop IV opt

2015-08-04 Thread James Greenhalgh
On Tue, Jul 28, 2015 at 02:12:36PM +0100, Jiong Wang wrote:
 
 The instruction sequences for preparing argument for TLS descriptor
 runtime resolver and the later function call to resolver can actually be
 hoisted out of the loop.
 
 Currently we can't because we have exposed the hard register X0 as the
 destination of a set.  GCC's RTL dataflow infrastructure will skip, or
 make very conservative assumptions about, expressions that involve hard
 registers, and thus some loop IV opportunities are missed.
 
 This patch adds another tlsdesc_small_pseudo_<mode> pattern, and avoids
 exposing x0 to GCC's generic code.
 
 Generally, we define a new register class FIXED_R0 which only contains
 register 0, so the instruction sequence generated from the new pattern
 is the same as for tlsdesc_small_<mode>, while operand 0 is wrapped as
 a pseudo register that RTL IV opt can handle.
 
 Ideally, we should allow operand 0 to be any pseudo register, but then
 we can't model the clobbering of x0 caused by the function call, which
 is hidden by the UNSPEC.
 
 So here we restrict operand 0 to be x0, so that the clobbering of x0 is
 visible to GCC.
 
 OK for trunk?

OK.

Thanks,
James
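
For context, the kind of source that benefits (a hedged example, not
from the patch): the tlsdesc argument setup and resolver call for
'counter' are loop invariant, and with the pseudo-based pattern RTL
loop IV opt is free to hoist them:

__thread int counter;

void
bump (int n)
{
  int i;
  for (i = 0; i < n; i++)
    counter += i;	/* TLS address no longer re-resolved per iteration.  */
}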



Re: [PATCH][AArch64][8/14] Implement TARGET_OPTION_VALID_ATTRIBUTE_P

2015-08-04 Thread James Greenhalgh
On Mon, Aug 03, 2015 at 04:20:13PM +0100, Kyrill Tkachov wrote:
 Ok, I've removed usages of 'ret' in favor of returning when appropriate.
 In this last one I left the ret (but cleaned up the control flow a bit)
 because if the processing fails we need to clean up a bit of state before
 returning.

This is OK with the changes below fixed, or commented on as justification.

 diff --git a/gcc/config/aarch64/aarch64-protos.h 
 b/gcc/config/aarch64/aarch64-protos.h
 index fc1cec7..3a5482d 100644
 --- a/gcc/config/aarch64/aarch64-protos.h
 +++ b/gcc/config/aarch64/aarch64-protos.h
 @@ -376,6 +378,8 @@ extern bool aarch64_madd_needs_nop (rtx_insn *);
  extern void aarch64_final_prescan_insn (rtx_insn *);
  extern bool
  aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
 +bool aarch64_handle_option (struct gcc_options *, struct gcc_options *,
 +  const struct cl_decoded_option *, location_t);

Please try to keep this file in alphabetical order, first by return type,
then by function name.

  void aarch64_atomic_assign_expand_fenv (tree *, tree *, tree *);
  int aarch64_ccmp_mode_to_code (enum machine_mode mode);
  
 diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
 index d0d62e7..7a369fd 100644
 --- a/gcc/config/aarch64/aarch64.c
 +++ b/gcc/config/aarch64/aarch64.c

 +static bool
 +aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
 +{
 +  const struct processor *tmp_arch = NULL;
 +  enum aarch64_parse_opt_result parse_res
 += aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
 +
 +  if (parse_res == AARCH64_PARSE_OK)
 +{
 +  gcc_assert (tmp_arch);
 +  selected_arch = tmp_arch;
 +  explicit_arch = selected_arch->arch;
 +  return true;
 +}

Why not pull this in to the switch case below?

 +
 +  switch (parse_res)
 +{
 +  case AARCH64_PARSE_MISSING_ARG:
 + error ("missing architecture name in 'arch' target %s", pragma_or_attr);
 + break;
 +  case AARCH64_PARSE_INVALID_ARG:
 + error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
 + break;
 +  case AARCH64_PARSE_INVALID_FEATURE:
 + error ("invalid feature modifier %qs for 'arch' target %s",
 +str, pragma_or_attr);
 + break;
 +  default:
 + gcc_unreachable ();
 +}
 +
 +  return false;
 +}
 +
 +/* Handle the argument CPU_STR to the cpu= target attribute.
 +   PRAGMA_OR_ATTR is used in potential error messages.  */
 +
 +static bool
 +aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
 +{
 +  const struct processor *tmp_cpu = NULL;
 +  enum aarch64_parse_opt_result parse_res
 += aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
 +
 +  if (parse_res == AARCH64_PARSE_OK)
 +{
 +  gcc_assert (tmp_cpu);
 +  selected_tune = tmp_cpu;
 +  explicit_tune_core = selected_tune->ident;
 +
 +  selected_arch = all_architectures[tmp_cpu->arch];
 +  explicit_arch = selected_arch->arch;
 +  return true;
 +}

Likewise here.

 +
 +  switch (parse_res)
 +{
 +  case AARCH64_PARSE_MISSING_ARG:
 + error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
 + break;
 +  case AARCH64_PARSE_INVALID_ARG:
 + error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
 + break;
 +  case AARCH64_PARSE_INVALID_FEATURE:
 + error ("invalid feature modifier %qs for 'cpu' target %s",
 +str, pragma_or_attr);
 + break;
 +  default:
 + gcc_unreachable ();
 +}
 +
 +  return false;
 +}
 +
 +/* Handle the argument STR to the tune= target attribute.
 +   PRAGMA_OR_ATTR is used in potential error messages.  */
 +
 +static bool
 +aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
 +{
 +  const struct processor *tmp_tune = NULL;
 +  enum aarch64_parse_opt_result parse_res
 += aarch64_parse_tune (str, &tmp_tune);
 +
 +  if (parse_res == AARCH64_PARSE_OK)
 +{
 +  gcc_assert (tmp_tune);
 +  selected_tune = tmp_tune;
 +  explicit_tune_core = selected_tune->ident;
 +  return true;
 +}
 +

And likewise here.

 +  switch (parse_res)
 +{
 +  case AARCH64_PARSE_INVALID_ARG:
 + error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
 + break;
 +  default:
 + gcc_unreachable ();
 +}
 +
 +  return false;
 +}
 +
 +/* Parse an architecture extensions target attribute string specified in STR.
 +   For example +fp+nosimd.  Show any errors if needed.  Return TRUE
 +   if successful.  Update aarch64_isa_flags to reflect the ISA features
 +   modified.
 +   PRAGMA_OR_ATTR is used in potential error messages.  */
 +
 +static bool
 +aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
 +{
 +  enum aarch64_parse_opt_result parse_res;
 +  unsigned long isa_flags = aarch64_isa_flags;
 +
 +  parse_res = aarch64_parse_extension (str, &isa_flags);
 +
 +  if (parse_res == AARCH64_PARSE_OK)
 +{
 +  aarch64_isa_flags = 

RE: [RFC] [Patch]: Try and vectorize with shift for mult expr with power 2 integer constant.

2015-08-04 Thread Kumar, Venkataramanan
Hi Jeff, 

 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Jeff Law
 Sent: Monday, August 03, 2015 11:42 PM
 To: Kumar, Venkataramanan; Jakub Jelinek
 Cc: Richard Beiner (richard.guent...@gmail.com); gcc-patches@gcc.gnu.org
 Subject: Re: [RFC] [Patch]: Try and vectorize with shift for mult expr with
 power 2 integer constant.
 
 On 08/02/2015 05:03 AM, Kumar, Venkataramanan wrote:
  Hi Jakub,
 
  Thank you for reviewing the patch.
 
  I have incorporated your comments in the attached patch.
 Note Jakub is on PTO for the next 3 weeks.

 Thank you for this information.

 
 
 
 
 
  vectorize_mults_via_shift.diff.txt
 
 
  diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
  b/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
 Jakub would probably like more testcases :-)
 
 The most obvious thing to test would be other shift factors.
 
 A negative test to verify we don't try to turn a multiply by non-constant or
 multiply by a constant that is not a power of 2 into shifts.

I have added negative test in the attached patch.


 
 [ Would it make sense, for example, to turn a multiply by 3 into a shift-add
 sequence?  As Jakub said, choose_mult_variant can be your friend. ]

Yes I will do that in a follow up patch.   

The new change log becomes 

gcc/ChangeLog
2015-08-04  Venkataramanan Kumar  venkataramanan.ku...@amd.com
 * tree-vect-patterns.c (vect_recog_mult_pattern): New function for 
vectorizing
multiplication patterns.
 * tree-vectorizer.h: Adjust the number of patterns.

gcc/testsuite/ChangeLog
2015-08-04  Venkataramanan Kumar  venkataramanan.ku...@amd.com
 * gcc.dg/vect/vect-mult-pattern-1.c: New
* gcc.dg/vect/vect-mult-pattern-2.c: New

Bootstrapped and reg tested on aarch64-unknown-linux-gnu.

Ok for trunk ?

 
 
 
  @@ -2147,6 +2152,140 @@ vect_recog_vector_vector_shift_pattern
 (vec<gimple> *stmts,
  return pattern_stmt;
}
 
  +/* Detect multiplication by constant which are postive or negatives
  +of power 2,
 s/postive/positive/
 
 
 Jeff

Regards,
Venkat.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-1.c
new file mode 100644
index 000..764d0e3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+
+unsigned long int __attribute__ ((aligned (64))) arr[100];
+int i;
+
+void test_for_vectorshifts_via_mul_with_power2_const ()
+{
+  for (i=0; i<=99; i++)
+arr[i] = arr[i] * 4;
+}
+
+void test_for_vectorshifts_via_mul_with_negative_power2_const ()
+{
+  for (i=0; i<=99; i++)
+arr[i] = arr[i] * (-4);
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { ! { vect_int_mult } } } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_mult_pattern: detected" 2 "vect" { target { ! { vect_int_mult } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-2.c
new file mode 100644
index 000..77e8cff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-2.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+
+unsigned long int __attribute__ ((aligned (64))) arr[100];
+int i;
+
+void negative_test_for_vectorshifts_via_mul_with_const ()
+{
+  for (i=0; i<=99; i++)
+arr[i] = arr[i] * 123;
+}
+
+void negative_test_for_vectorshifts_via_mul_with_negative_const ()
+{
+  for (i=0; i<=99; i++)
+arr[i] = arr[i] * (-123);
+}
+
+void negative_test_for_vectorshifts_via_mul_with_varable (int x)
+{
+  for (i=0; i<=99; i++)
+arr[i] = arr[i] * x;
+}
+
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 3 "vect" { target { ! { vect_int_mult } } } } } */
+/* { dg-final { scan-tree-dump-not "vect_recog_mult_pattern: detected" "vect" { target { ! { vect_int_mult } } } } } */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index f034635..5cbb49e 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -76,6 +76,10 @@ static gimple vect_recog_vector_vector_shift_pattern (vec<gimple> *,
  tree *, tree *);
 static gimple vect_recog_divmod_pattern (vec<gimple> *,
 tree *, tree *);
+
+static gimple vect_recog_mult_pattern (vec<gimple> *,
+  tree *, tree *);
+
 static gimple vect_recog_mixed_size_cond_pattern (vec<gimple> *,
  tree *, tree *);
 static gimple vect_recog_bool_pattern (vec<gimple> *, tree *, tree *);
@@ -90,6 +94,7 @@ static vect_recog_func_ptr 
vect_vect_recog_func_ptrs[NUM_PATTERNS] = {
vect_recog_rotate_pattern,
vect_recog_vector_vector_shift_pattern,
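
For reference, the scalar rewrite the new pattern performs, applied
lane-wise once vectorized (a hedged sketch, not part of the patch):

/* x * 4 becomes x << 2; x * -4 becomes -(x << 2).  */
unsigned long
times_4 (unsigned long x)
{
  return x << 2;		/* == x * 4 */
}

unsigned long
times_minus_4 (unsigned long x)
{
  return -(x << 2);		/* == x * (unsigned long) -4 */
}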

Re: [PATCH][RTL-ifcvt] Improve conditional select ops on immediates

2015-08-04 Thread Kyrill Tkachov


On 03/08/15 18:37, Uros Bizjak wrote:

On Mon, Aug 3, 2015 at 7:20 PM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:


Looking at the x86 movcc expansion code (ix86_expand_int_movcc) I
don't think this is a good idea. In the expander, there is already
quite some target-dependent code that goes great length to utilize sbb
insn as much as possible, before cmove is used.

IMO, as far as x86 is concerned, the best solution would be to revert
the change. ix86_expand_int_movcc already does some tricks from your
patch in a target-efficient way. Generic change that was introduced by
your patch now interferes with this expansion.

Well, technically the transformation was already there, it was just never
reached for an x86 compilation because noce_try_cmove was tried in front
of it and used a target-specific expansion.
In any case, how's this proposal?
The transformation
 /* if (test) x = a; else x = b;
    =>   x = (-(test != 0) & (b - a)) + a;  */
is a catch-all-immediates transformation in
noce_try_store_flag_constants.
What if we moved it to noce_try_cmove and performed it only if the
target-specific
conditional move expansion there failed?

That way we can try the x86_64-specific sequence first and still give the
opportunity to noce_try_store_flag_constants to perform the
transformations that can benefit targets that don't have highly specific
conditional move expanders.

Yes, let's try this approach. As was found out, some targets (e.g.
x86) hide lots of different target-dependent expansion strategies into
movcc expander. Perhaps this fact should be documented in the comment
in the generic code?

Ok, I'll work on that approach and add a comment.


I'm testing a patch that fixes the testcases on x86_64 and does not
harm codegen on aarch64. Feel free to file a PR and assign it to me.

PR67103 [1]

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67103


Thanks,
Here's the patch to move that transformation from noce_try_store_flag_constants
to noce_try_cmove after the target-specific expansion has had a go.

This fixes the testcases for me on x86_64.
In i386.exp I only see:
FAIL: gcc.target/i386/pr49781-1.c scan-assembler-not lea[lq]?[ 
\t]\\((%|)r[a-z0-9]*
FAIL: gcc.target/i386/pr61403.c scan-assembler blend

which were there before my patch.
Bootstrap and testing on x86_64, arm and aarch64 is successful for me.

Is this ok?

Thanks,
Kyrill

2015-08-04  Kyrylo Tkachov  kyrylo.tkac...@arm.com

PR rtl-optimization/67103
* ifcvt.c (noce_try_store_flag_constants): Move
x = (-(test != 0)  (b - a)) + a transformation to...
(noce_try_cmove): ... Here.  Try it if normal conditional
move fails.




Thanks,
Uros.



diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
index 0ebe107..b06eaab 100644
--- a/gcc/ifcvt.c
+++ b/gcc/ifcvt.c
@@ -1410,9 +1410,6 @@ noce_try_store_flag_constants (struct noce_if_info *if_info)
 	  normalize = -1;
 	  reversep = true;
 	}
-  else if ((if_info->branch_cost >= 2 && STORE_FLAG_VALUE == -1)
-	   || if_info->branch_cost >= 3)
-	normalize = -1;
   else
 	return FALSE;
 
@@ -1481,18 +1478,10 @@ noce_try_store_flag_constants (struct noce_if_info *if_info)
 	target, gen_int_mode (ifalse, mode),
 	if_info->x, 0, OPTAB_WIDEN);
 	}
-
-  /* if (test) x = a; else x = b;
-	 =>   x = (-(test != 0) & (b - a)) + a;  */
   else
 	{
-	  target = expand_simple_binop (mode, AND,
-	target, gen_int_mode (diff, mode),
-	if_info->x, 0, OPTAB_WIDEN);
-	  if (target)
-	target = expand_simple_binop (mode, PLUS,
-	  target, gen_int_mode (ifalse, mode),
-	  if_info->x, 0, OPTAB_WIDEN);
+	  end_sequence ();
+	  return FALSE;
 	}
 
   if (! target)
@@ -1818,11 +1807,67 @@ noce_try_cmove (struct noce_if_info *if_info)
    INSN_LOCATION (if_info->insn_a));
 	  return TRUE;
 	}
-  else
+  /* If both a and b are constants try a last-ditch transformation:
+	 if (test) x = a; else x = b;
+	 =>   x = (-(test != 0) & (b - a)) + a;
+	 Try this only if the target-specific expansion above has failed.
+	 The target-specific expander may want to generate sequences that
+	 we don't know about, so give them a chance before trying this
+	 approach.  */
+  else if (!targetm.have_conditional_execution ()
+	   && CONST_INT_P (if_info->a) && CONST_INT_P (if_info->b)
+	   && ((if_info->branch_cost >= 2 && STORE_FLAG_VALUE == -1)
+	       || if_info->branch_cost >= 3))
 	{
-	  end_sequence ();
-	  return FALSE;
+	  machine_mode mode = GET_MODE (if_info->x);
+	  HOST_WIDE_INT ifalse = INTVAL (if_info->a);
+	  HOST_WIDE_INT itrue = INTVAL (if_info->b);
+	  rtx target = noce_emit_store_flag (if_info, if_info->x, false, -1);
+	  if (!target)
+	{
+	  end_sequence ();
+	  return FALSE;
+	}
+
+	  HOST_WIDE_INT diff = (unsigned HOST_WIDE_INT) itrue - ifalse;
+	  /* Make sure we can represent the difference
+	 between the two values.  */
+	  if ((diff < 0)
+	      != ((ifalse < 0) != (itrue < 0) ? ifalse < 0 : ifalse < itrue))
+	{
+	  end_sequence 
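
A scalar illustration of the transformation being moved (hedged: ifcvt
emits the equivalent RTL rather than C, and signed-overflow pedantry is
ignored since ifcvt works on machine modes):

/* (-(flag != 0) & (b - a)) + a yields b when flag is nonzero and a
   when it is zero; noce_emit_store_flag arranges the flag's polarity
   so that this matches the original branch sense.  */
long
select_branchless (int flag, long a, long b)
{
  return (-(long) (flag != 0) & (b - a)) + a;
}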

[PATCH][3/N] Replace the pattern GET_MODE_BITSIZE (GET_MODE_INNER (m)) with GET_MODE_UNIT_BITSIZE (m)

2015-08-04 Thread David Sherwood
Hi,

This patch follows on from

[PATCH][1/N] Change GET_MODE_INNER to always return a non-void mode

It is another tidy up, replacing the pattern
GET_MODE_BITSIZE (GET_MODE_INNER (m)) with GET_MODE_UNIT_BITSIZE (m). Also
replaces any calls to GET_MODE_PRECISION (GET_MODE_INNER (m)) with
GET_MODE_UNIT_PRECISION (m).

Tested:
aarch64 and aarch64_be - no regressions in gcc testsuite
x86_64 - bootstrap build, no testsuite regressions
arm-none-eabi - no regressions in gcc testsuite
Run contrib/config-list.mk - no regressions

Good to go?

Thanks,
David.

2015-08-04  David Sherwood  david.sherw...@arm.com

gcc/
* config/arm/arm.c (neon_element_bits): Replace call to
GET_MODE_BITSIZE (GET_MODE_INNER (m)) with GET_MODE_UNIT_BITSIZE (m).
* config/arm/neon.md (neon_vget_lane<mode>): Likewise.
(neon_vget_laneu<mode>, neon_vset_lane<mode>): Likewise.
(neon_vdup_lane<mode>): Likewise.
* config/i386/i386.c (ix86_expand_int_vcond): Likewise.
(ix86_expand_multi_arg_builtin, ix86_expand_reduc): Likewise.
(expand_vec_perm_palignr, ix86_expand_sse2_abs): Likewise.
* config/rs6000/rs6000.c (rs6000_do_expand_vec_perm): Likewise.
* config/spu/spu.c (arith_immediate_p): Likewise.
* expmed.c (store_bit_field_1, extract_bit_field_1): Likewise.
* expr.c (expand_expr_real_2): Likewise.
* optabs.c (shift_amt_for_vec_perm_mask): Likewise.
* simplify-rtx.c (simplify_immed_subreg): Likewise.
* tree-cfg.c (verify_gimple_assign_ternary): Likewise.
* tree-vect-patterns.c (vect_recog_mixed_size_cond_pattern): Likewise.
New variable.
* fold-const.c (fold_binary_loc): Replace call to
GET_MODE_PRECISION (GET_MODE_INNER (m)) with
GET_MODE_UNIT_PRECISION (m).


mode_inner3.patch
Description: Binary data
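
The cleanup relies on the identities below (a hedged sketch; the
literal machmode.h definitions go through per-mode tables, but the
values agree, and after patch [1/N] in this series GET_MODE_INNER (m)
is m itself for scalar modes):

/* Sanity checks one could place at a converted call site.  */
gcc_assert (GET_MODE_UNIT_BITSIZE (mode)
	    == GET_MODE_BITSIZE (GET_MODE_INNER (mode)));
gcc_assert (GET_MODE_UNIT_PRECISION (mode)
	    == GET_MODE_PRECISION (GET_MODE_INNER (mode)));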


Re: Regression in target MIC compiler

2015-08-04 Thread Ilya Verbin
On Tue, Aug 04, 2015 at 14:35:11 +0200, Thomas Schwinge wrote:
 On Fri, 31 Jul 2015 20:13:02 +0300, Ilya Verbin iver...@gmail.com wrote:
  On Fri, Jul 31, 2015 at 18:59:59 +0200, Jakub Jelinek wrote:
 On Wed, Feb 18, 2015 at 11:00:35 +0100, Jakub Jelinek wrote:
 +  /* First search just the GET_CLASS_NARROWEST_MODE to wider modes,
 +  if not found, fallback to all modes.  */
 +  int pass;
 +  for (pass = 0; pass < 2; pass++)
 + for (machine_mode mr = pass ? VOIDmode
 + : GET_CLASS_NARROWEST_MODE (mclass);
 +  pass ? mr < MAX_MACHINE_MODE : mr != VOIDmode;
 +  pass ? mr = (machine_mode) (m + 1)
 +   : mr = GET_MODE_WIDER_MODE (mr))
 +   if (GET_MODE_CLASS (mr) != mclass
 +   || GET_MODE_SIZE (mr) != size
 +   || GET_MODE_PRECISION (mr) != prec
 +   || GET_MODE_INNER (mr) != inner
 +   || GET_MODE_IBIT (mr) != ibit
 +   || GET_MODE_FBIT (mr) != fbit
 +   || GET_MODE_NUNITS (mr) != nunits)
 + continue;
 
 Given that gomp-4_1-branch works ok, the problem was introduced 
 somewhere
 between 9 and 31 Jul.  I'll try to find the revision.

Shouldn't 'mr' be here instead of 'm'?
   
   I think so.  If it works, patch preapproved.
  
  It fixes the infinite loop, but causes an error:
  lto1: fatal error: unsupported mode QI
 
 Confirmed.
 
   But wonder what changed that we haven't been triggering it before.
   What mode do you think it is (mclass/size/prec/inner/ibit/fbit/nunits)?
  
  When in hangs, mr is HImode.
 
 Do you already have any further analysis, a workaround, or even a fix?

Not yet.  I thought since Jakub is the author of this function, he could easily
point what is wrong here :)  Actually, intelmic doesn't require
lto_input_mode_table, so temporary workaround is just to disable it.

  -- Ilya


[PTX] small cleanup

2015-08-04 Thread Nathan Sidwell

This removal of unused structure is now committed to trunk too.

nathan
2015-08-04  Nathan Sidwell  nat...@codesourcery.com

	* config/nvptx/nvptx.h (struct nvptx_pseudo_info): Delete.
	(machine_function): Remove pseudos field.

Index: gcc/config/nvptx/nvptx.h
===
--- gcc/config/nvptx/nvptx.h	(revision 226539)
+++ gcc/config/nvptx/nvptx.h	(working copy)
@@ -217,12 +217,6 @@ struct nvptx_args {
 #define LEGITIMATE_PIC_OPERAND_P(X) 1
 
 
-struct nvptx_pseudo_info
-{
-  int true_size;
-  int renumber;
-};
-
 #if defined HOST_WIDE_INT
 struct GTY(()) machine_function
 {
@@ -231,7 +225,6 @@ struct GTY(()) machine_function
   tree funtype;
   bool has_call_with_varargs;
   bool has_call_with_sc;
-  struct GTY((skip)) nvptx_pseudo_info *pseudos;
   HOST_WIDE_INT outgoing_stdarg_size;
   int ret_reg_mode;
   int punning_buffer_size;


Re: [PATCH] [AVX512F] Add scatter support for vectorizer

2015-08-04 Thread Richard Biener
On Fri, 31 Jul 2015, Petr Murzin wrote:

 Hello,
 This patch adds scatter support for vectorizer (for AVX512F
 instructions). Please have a look. Is it OK for trunk?

+/* Target builtin that implements vector scatter operation.  */
+DEFHOOK
+(builtin_scatter,
+ "",
+ tree,
+ (const_tree vectype, const_tree index_type, int scale),
+ NULL)

please add documentation inline here, like for builtin_gather,
and let tm.texi be auto-populated.

Note that the i386 changes need target maintainer approval, CCing
Uros.

diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 731fe7d..2de0369 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -65,6 +65,8 @@ along with GCC; see the file COPYING3.  If not see
 #include builtins.h
 #include params.h

+
+
 /* Return true if load- or store-lanes optab OPTAB is implemented for
COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */


please avoid this kind of spurious whitespace changes.

@@ -2307,10 +2313,7 @@ vect_analyze_data_ref_access (struct data_reference 
*dr)
  if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
 "zero step in outer loop.\n");
- if (DR_IS_READ (dr))
-   return true;
- else
-   return false;
+ return (DR_IS_READ (dr)) ? true : false;
}
 }

Likewise.  If anything then do

 return DR_IS_READ (dr);

-  if (gather)
+  if (gather || scatter)
{
  tree off;

- gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
- if (gather
+ gather = 0 != vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, true);
+ scatter = 0 != vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL, false);
+

please only check gather/scatter once - only one, gather or scatter
can ever be true.  This also means that the idea of having both
bools is not reflecting the state in a very good way.  Instead
please add a

  enum { SG_NONE, SCATTER, GATHER } gatherscatter;

and replace 'gather' with it.

@@ -3747,7 +3767,9 @@ again:

  datarefs[i] = dr;
  STMT_VINFO_GATHER_P (stmt_info) = true;
+ STMT_VINFO_SCATTER_P (stmt_info) = true;
}

this looks bogus as well due to the mechanical change - a stmt
cannot be gather and scatter at the same time.

- tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+ tree decl = vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL,
+					(STMT_VINFO_GATHER_P (stmt_vinfo)) ? true : false);

watch long lines

  if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
worklist, true))
-   return false;
+{
+ if (STMT_VINFO_SCATTER_P (stmt_vinfo)
+     && !process_use (stmt, gimple_assign_rhs1 (stmt), loop_vinfo, live_p,
+		      relevant, worklist, true))
+worklist.release();
+
+ return false;
+}

no need to cut-off the early return, no?  Also rhs1 should be
already handled via

FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, iter, SSA_OP_USE)
  {
tree op = USE_FROM_PTR (use_p);
if (!process_use (stmt, op, loop_vinfo, live_p, relevant,
  worklist, false))
  return false;
  }

note that 'force' doesn't apply here.

I wonder why vect_check_gather_scatter cannot figure out itself
whether scatter or gather is used.  After all it does

  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

so DR_IS_READ/WRITE is readily available.  Please rework accordingly.
This should also simplify the patch.

+  if (!vect_is_simple_use (gimple_assign_rhs1 (stmt), NULL, 
loop_vinfo, bb_vinfo,
+  &def_stmt, &def, &scatter_src_dt))
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "scatter source use not simple.");
+ return false;
+   }

This is redundant, it is verified earlier.

+ var = make_ssa_name (var, NULL);

make_ssa_name (var);

+ new_stmt
+   = gimple_build_assign (var, VIEW_CONVERT_EXPR,
+   src, NULL_TREE);

you can omit the NULL_TREE

@@ -5586,8 +5770,6 @@ vectorizable_store (gimple stmt, 
gimple_stmt_iterator *gsi, gimple *vec_stmt,
   prev_stmt_info = NULL;
   for (j = 0; j  ncopies; j++)
 {
-  gimple new_stmt;
-
   if (j == 0)
{
   if (slp)

spurious change?

@@ -5853,10 +6035,12 @@ permute_vec_elements (tree x, tree y, tree 
mask_vec, gimple stmt,
 {
   tree vectype = TREE_TYPE (x);
   tree perm_dest, data_ref;
+  tree scalar_dest = TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
+? gimple_assign_lhs (stmt) : x;


Re: [PATCH, i386] Disable AVX-512VL insns for scalar mode operands on -march=knl.

2015-08-04 Thread Kirill Yukhin
On 04 Aug 14:10, Uros Bizjak wrote:
 On Tue, Aug 4, 2015 at 1:47 PM, Kirill Yukhin kirill.yuk...@gmail.com wrote:
  Hello,
  -   (set_attr "prefix_data16" "*,*,*,1,*,*,*,*")
  -   (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex,orig,orig")
  -   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,DF,V4SF,V2SF")])
  +   (set_attr "prefix_data16" "*,*,*,*,*,1,*,*,*,*")
 
 Please change the above to:
 
   (set (attr "prefix_data16")
 (if_then_else (eq_attr "alternative" "5")
   (const_string "1")
   (const_string "*")))
Thanks, fixed!
 
 Uros.

commit 20df38ce6fed082155b9860b0a1c5511894fdd84
Author: Kirill Yukhin kirill.yuk...@intel.com
Date:   Tue Aug 4 10:36:10 2015 +0300

Merge SSE 4.1 and AVX ptest patterns. Extend iterator for new one.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 128c5af..f93a5ce 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31734,9 +31734,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
 
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
 
   /* SSE4.2 */
   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
@@ -31892,9 +31892,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
 
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0970f0e..0ffc27d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -299,6 +299,12 @@
	V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
 
 ;; All DImode vector integer modes
+(define_mode_iterator V_AVX
+  [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V32QI "TARGET_AVX") (V16HI "TARGET_AVX")
+   (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
+   (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")])
+
 (define_mode_iterator VI8
   [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI])
 
@@ -566,7 +572,11 @@
 (define_mode_attr sse4_1
   [(V4SF "sse4_1") (V2DF "sse4_1")
    (V8SF "avx") (V4DF "avx")
-   (V8DF "avx512f")])
+   (V8DF "avx512f")
+   (V4DI "avx") (V2DI "sse4_1")
+   (V8SI "avx") (V4SI "sse4_1")
+   (V16QI "sse4_1") (V32QI "avx")
+   (V8HI "sse4_1") (V16HI "avx")])
 
 (define_mode_attr avxsizesuffix
   [(V64QI "512") (V32HI "512") (V16SI "512") (V8DI "512")
@@ -14640,30 +14650,23 @@
 
 ;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG.
 ;; But it is not a really compare instruction.
-(define_insn 

Re: [PATCH, i386] Merge SSE and AVX ptest patterns.

2015-08-04 Thread Kirill Yukhin
On 04 Aug 14:06, Uros Bizjak wrote:
 On Tue, Aug 4, 2015 at 1:58 PM, Kirill Yukhin kirill.yuk...@gmail.com wrote:
  +   (set (attr "btver2_decode")
  + (if_then_else
  +   (and (eq_attr "alternative" "2")
  +   (match_test "<sseinsnmode>mode==OImode"))
  + (const_string "vector")
  + (const_string "*")))
 
 "vector" does not depend on alternative, but only on
 <sseinsnmode>mode. So the "and" above should be removed.
Thanks, fixed!
 Uros.

commit 20df38ce6fed082155b9860b0a1c5511894fdd84
Author: Kirill Yukhin kirill.yuk...@intel.com
Date:   Tue Aug 4 10:36:10 2015 +0300

Merge SSE 4.1 and AVX ptest patterns. Extend iterator for new one.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 128c5af..f93a5ce 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31734,9 +31734,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
   { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
 
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
-  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
 
   /* SSE4.2 */
   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
@@ -31892,9 +31892,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
 
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0970f0e..0ffc27d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -299,6 +299,12 @@
	V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
 
 ;; All DImode vector integer modes
+(define_mode_iterator V_AVX
+  [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V32QI "TARGET_AVX") (V16HI "TARGET_AVX")
+   (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
+   (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")])
+
 (define_mode_iterator VI8
   [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI])
 
@@ -566,7 +572,11 @@
 (define_mode_attr sse4_1
   [(V4SF "sse4_1") (V2DF "sse4_1")
    (V8SF "avx") (V4DF "avx")
-   (V8DF "avx512f")])
+   (V8DF "avx512f")
+   (V4DI "avx") (V2DI "sse4_1")
+   (V8SI "avx") (V4SI "sse4_1")
+   (V16QI "sse4_1") (V32QI "avx")
+   (V8HI "sse4_1") (V16HI "avx")])
 
 (define_mode_attr avxsizesuffix
   [(V64QI "512") (V32HI "512") (V16SI "512") (V8DI "512")
@@ -14640,30 +14650,23 @@
 
 ;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG.
 ;; But it is not a really compare instruction.
-(define_insn "avx_ptest256"
-  [(set (reg:CC FLAGS_REG)
-	(unspec:CC [(match_operand:V4DI 0 "register_operand" "x")
-
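
At the source level these patterns sit behind the SSE4.1/AVX ptest
intrinsics (a hedged usage example, not part of the patch; compile with
-msse4.1):

#include <smmintrin.h>

/* _mm_testz_si128 expands via __builtin_ia32_ptestz128, one of the
   builtins remapped above; it returns 1 iff (a & mask) == 0.  */
int
all_masked_zero (__m128i a, __m128i mask)
{
  return _mm_testz_si128 (a, mask);
}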

Re: [PING] Re: [PATCH] c/66516 - missing diagnostic on taking the address of a builtin function

2015-08-04 Thread Joseph Myers
On Mon, 3 Aug 2015, Martin Sebor wrote:

 because the ternary ?: expression is folded into a constant by
 the front end even before it reaches the gimplifier, while the
 if-else statement isn't folded until the control flow graph is
 built. (As an aside: I'm wondering why that is. Why have the
 front end do any optimization at all if the middle can and
 does them too, and better?)

Well, in C, if an expression is an integer constant expression, then in 
certain contexts its folded value needs to be known for checking lots of 
other constraints that should be diagnosed in the front end, such as on 
bit-field widths and array sizes - and then it needs to be known for 
layout so that sizeof and offsetof (which can be used in other integer 
constant expressions) can be computed.  And in other contexts (pointers in 
conditional expressions), the value of an integer constant expression 
affects the type of the containing expression, so types cannot be 
determined without folding (with consequent effects on other diagnostics).  
And then certain expressions that aren't integer constant expressions but 
can be folded to them in fact get used (e.g. in the Linux kernel) in 
contexts requiring integer constant expressions, so also need folding.  
And then a wider range of expressions need folding in static initializers 
so the front end can diagnose whether an initializer is valid or not, 
while allowing extensions that again are used in practice (though there's 
a possibility such diagnostics for initializers could be left until after 
some middle-end optimization).

-- 
Joseph S. Myers
jos...@codesourcery.com
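
Concrete cases of the constraints described above (hedged examples):
the C front end must fold these integer constant expressions itself,
before the middle end ever runs:

struct s { unsigned b : 2 + 1; };	/* bit-field width must be known */
char buf[sizeof (int) * 8];		/* array size must be known */

/* (1 - 1) is a null pointer constant only after folding, and whether
   it folds to 0 decides the type of the conditional expression.  */
int *p = 1 ? (int *) 0 : (1 - 1);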


Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread Mike Stump
On Aug 4, 2015, at 5:30 AM, H.J. Lu hjl.to...@gmail.com wrote:
 Where does this feature belong?

I prefer the middle end.


Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread H.J. Lu
On Tue, Aug 4, 2015 at 8:40 AM, Mike Stump mikest...@comcast.net wrote:
 On Aug 4, 2015, at 5:30 AM, H.J. Lu hjl.to...@gmail.com wrote:
 Where does this feature belong?

 I prefer the middle end.

Any comments on my middle-end patch?

Thanks.

-- 
H.J.


Re: [PATCH] Fix PR66870 ppc64le, ppc64 split stack

2015-08-04 Thread Lynn Boger

Updated changelog and attached patch based on Alan's comments.

gcc/ChangeLog

2015-07-30  Lynn Boger  labo...@linux.vnet.ibm.com

PR66870
* gcc/config/rs6000/rs6000.c (rs6000_emit_prologue): Check for
no_split_stack function attribute along with flag_split_stack.
(rs6000_expand_split_stack_prologue): Likewise.


On 7/31/2015 4:00 AM, David Edelsohn wrote:

On Fri, Jul 31, 2015 at 12:00 AM, Alan Modra amo...@gmail.com wrote:

On Thu, Jul 30, 2015 at 03:30:12PM -0500, Lynn A. Boger wrote:

 PR66870
 * gcc/config/rs6000/rs6000.c:  Add check for no_split_stack
 function attribute along with flag_split_stack check to
 determine when to generate split stack prologue for
 ppc64 and ppc64le.

Looks good to me, except that the changelog entry should mention the
modified functions, for example:

 PR target/66870
 * gcc/config/rs6000/rs6000.c (rs6000_emit_prologue): Check for
 no_split_stack function attribute along with flag_split_stack.
 (rs6000_expand_split_stack_prologue): Likewise.

Also, formatting rules for gcc say to not split a line after an
operator.


+  int using_split_stack = flag_split_stack &&
+   (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
+ == NULL);

The && belongs on the next line, with parentheses added so that emacs
and indent will line up the continuation nicely.

   int using_split_stack = (flag_split_stack
			    && (lookup_attribute ("no_split_stack",
						  DECL_ATTRIBUTES (cfun->decl))
				== NULL));


David, the following is another piece of the PR66870 fixes.  This
stops shrink-wrap from moving insns around in the first few blocks of
a function, in a way that is incorrect given that r12 is live.
Bootstrapped and regression tested powerpc64le-linux (and
powerpc64-linux by Lynn).

 PR target/66870
 * config/rs6000/rs6000.c (machine_function): Add split_stack_argp_used.
 (rs6000_emit_prologue): Set it.
 (rs6000_set_up_by_prologue): Specify r12 when split_stack_argp_used.

Both patches with your suggested changes are okay.

Thanks, David




Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 226606)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -23748,6 +23748,11 @@
   int using_static_chain_p = (cfun->static_chain_decl != NULL_TREE
			       && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
			       && call_used_regs[STATIC_CHAIN_REGNUM]);
+  int using_split_stack = (flag_split_stack
+			   && (lookup_attribute ("no_split_stack",
+						 DECL_ATTRIBUTES (cfun->decl))
+			       == NULL));
+ 
   /* Offset to top of frame for frame_reg and sp respectively.  */
   HOST_WIDE_INT frame_off = 0;
   HOST_WIDE_INT sp_off = 0;
@@ -24018,7 +24023,7 @@
       && info->cr_save_p
       && REGNO (frame_reg_rtx) != cr_save_regno
       && !(using_static_chain_p && cr_save_regno == 11)
-      && !(flag_split_stack && cr_save_regno == 12 && sp_adjust))
+      && !(using_split_stack && cr_save_regno == 12 && sp_adjust))
 {
   cr_save_rtx = gen_rtx_REG (SImode, cr_save_regno);
   START_USE (cr_save_regno);
@@ -24596,7 +24601,7 @@
   if ((DEFAULT_ABI == ABI_AIX || DEFAULT_ABI == ABI_ELFv2)
       && !using_static_chain_p)
save_regno = 11;
-  else if (flag_split_stack || REGNO (frame_reg_rtx) == 12)
+  else if (using_split_stack || REGNO (frame_reg_rtx) == 12)
{
  save_regno = 11;
  if (using_static_chain_p)
@@ -24699,7 +24704,7 @@
   emit_insn (gen_frame_store (reg, sp_reg_rtx, RS6000_TOC_SAVE_SLOT));
 }
 
-  if (flag_split_stack && split_stack_arg_pointer_used_p ())
+  if (using_split_stack && split_stack_arg_pointer_used_p ())
 {
   /* Set up the arg pointer (r12) for -fsplit-stack code.  If
 __morestack was called, it left the arg pointer to the old
@@ -26287,7 +26292,10 @@
 static rtx
 rs6000_internal_arg_pointer (void)
 {
-  if (flag_split_stack)
+  if (flag_split_stack
+      && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
+	  == NULL))
+
 {
   if (cfun-machine-split_stack_arg_pointer == NULL_RTX)
{


Re: [PR64164] drop copyrename, integrate into expand

2015-08-04 Thread Alexandre Oliva
On Aug  4, 2015, Richard Biener richard.guent...@gmail.com wrote:

 Though I wonder on whether splitting the patch into a first one with disabling
 coalescing of parms (their default defs(?)) and a followup implementing the
 support for that.

We can't disable coalescing of parms altogether.  With -O0, we must
coalesce all SSA_NAMEs referencing each parm to a single partition.
With optimization, we could coalesce parms in general, just not these
special cases in which the parm is to live in a caller-supplied memory
block.

Now, it's not coalescing parms proper that brought so much risk to the
patch, it is assigning rtl to SSA partitions, and having assign_parms*
use that assignment.  Considering that sometimes a single param
necessarily ends up in more than one partition, requiring two
assignments, and that assign_parms* can't deal with that, I don't see
how to easily disable the cfgexpand logic when it comes to parms, so as
to be able to leave assign_parms alone.

How about, if further problems arise that justify reverting the patch
one more time, I'll look into splitting the patch as you suggested, but
otherwise, I'll save myself the trouble, ok?

 So - is my observation correct that this is only about coalescing of the
 default defs of parameters, not other SSA names based on parameter decls?

It's more like the opposite, i.e., we *refrain* from coalescing other
SSA_NAMEs related with byref params, so that we can easily tell when a
partition references a byref param and whether that partition holds its
default def.  We could have coalesced any other names that ended up in
different partitions, and even the partition holding the default def, if
we had other means to identify partitions with default defs of byref
params.  For example, we could create a bitmap of byref param default
def versions, and then, after partitioning, map those to the partitions
they were assigned to.  In fact, I might do that as a followup.

 Do you think this splitting is feasible and my concern about the
 code-gen issues warranted?

It is feasible but not exactly easy.

As for codegen, I hope to have covered all cases now, but should we find
out I haven't, I'll try the split and see what that gets us.  Did you
have any special cases in mind that it looks like I may have missed?

Thanks,

-- 
Alexandre Oliva, freedom fighter    http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist|Red Hat Brasil GNU Toolchain Engineer


Re: [RFC] Elimination of zext/sext - type promotion pass

2015-08-04 Thread kugan



You indeed need to use CONVERT_EXPR here, maybe you can elaborate
on the optimization issues.


2. for inline asm (a reduced test case that might not make much sense as a
stand-alone test-case, but I ran into similar cases with valid programmes)

;; Function fn1 (fn1, funcdef_no=0, decl_uid=4220, cgraph_uid=0,
symbol_order=0)

fn1 (short int p1)
{
   <bb 2>:
   __asm__("" : "=r" p1_2 : "0" p1_1(D));
   return;

}


I am generating something like the following, which ICEs. What is the
expected output?

;; Function fn1 (fn1, funcdef_no=0, decl_uid=4220, cgraph_uid=0,
symbol_order=0)

fn1 (short int p1)
{
   int _1;
   int _2;
   short int _5;

   <bb 2>:
   _1 = (int) p1_4(D);
   _5 = (short int) _1;
   __asm__("" : "=r" p1_6 : "0" _5);
   _2 = (int) p1_6;
   return;

}


Parameters are indeed interesting to handle ;)  As we now see on ARM
the incoming parameter (the default def) and later assignments to it
can require different promotions (well, different extensions for ARM).

The only sensible way to deal with promoting parameters is to
promote them by changing the function signature.  Thus reflect the
targets ABI for parameters in the GIMPLE representation (which
includes TYPE_ARG_TYPES and DECL_ARGUMENTS).
IMHO we should do this during gimplification of parameters / call
arguments already.

So for your example you'd end up with

fn1 (int p1)
{
   __asm__("" : "=r" p1_6 : "0" p1_4(D));
   return;
}

that is, promotions also apply to asm inputs/outputs (no?)



Thanks for the review and answers. For the time being, I am handling 
gimple_asm as one that has to be handled in the original type. I will look 
into improving it after getting the basic framework right.


As it is, attached patch bootstraps on x86_64-linux-gnu, arm-linux-gnu 
and aarch64-linux-gnu. There are few regressions to look into (Please 
see below).


There are cases where it is working well, and cases where it can be 
improved. I am attaching a couple of test cases (and their results). I am 
seeing some BIT_AND_EXPRs which are inserted by promotion but are not being 
optimized away when they are redundant. This is especially the case when I 
invalidate the VRP range info from VRP1 during the type promotion. I am 
looking into it.


Please note that attached patch still needs to address:
* Adding gimple_debug stmts.
* Address review comment for expr.c handling SEXT_EXPR.
* Address regression failures

Based on the feedback, I will address the above and split the patch into 
logical patch set for easy detailed review.


Here are the outputs for the testcases.

--- c5.c.142t.veclower212015-08-05 08:50:11.367135339 +1000
+++ c5.c.143t.promotion 2015-08-05 08:50:11.367135339 +1000
@@ -1,34 +1,45 @@

 ;; Function unPack (unPack, funcdef_no=0, decl_uid=4145, cgraph_uid=0, 
symbol_order=0)


 unPack (unsigned char c)
 {
-  short int _1;
-  unsigned short _4;
-  unsigned short _5;
-  short int _6;
-  short int _7;
+  int _1;
+  unsigned int _2;
+  unsigned int _3;
+  unsigned int _4;
+  unsigned int _5;
+  int _6;
+  int _7;
+  unsigned int _9;
+  int _11;
+  int _12;
+  short int _13;

   <bb 2>:
-  c_3 = c_2(D) & 15;
-  if (c_3 > 7)
+  _2 = (unsigned int) c_10(D);
+  _3 = _2 & 15;
+  _9 = _3 & 255;
+  if (_9 > 7)
 goto bb 3;
   else
 goto bb 4;

   <bb 3>:
-  _4 = (unsigned short) c_3;
-  _5 = _4 + 65531;
-  _6 = (short int) _5;
+  _4 = _3 & 65535;
+  _5 = _4 + 4294967291;
+  _11 = (int) _5;
+  _6 = (_11) sext from bit (16);
   goto bb 5;

   <bb 4>:
-  _7 = (short int) c_3;
+  _12 = (int) _3;
+  _7 = (_12) sext from bit (16);

   <bb 5>:
   # _1 = PHI <_6(3), _7(4)>
-  return _1;
+  _13 = (short int) _1;
+  return _13;

 }


--- c5.org.s2015-08-05 08:51:44.619133892 +1000
+++ c5.new.s2015-08-05 08:51:29.643134124 +1000
@@ -16,16 +16,14 @@
.syntax divided
.arm
.type   unPack, %function
 unPack:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
and r0, r0, #15
cmp r0, #7
subhi   r0, r0, #5
-   uxthr0, r0
-   sxthr0, r0
bx  lr
.size   unPack, .-unPack
.ident  GCC: (GNU) 6.0.0 20150724 (experimental)
.section.note.GNU-stack,,%progbits
--- crc.c.142t.veclower21   2015-08-05 08:52:43.811132974 +1000
+++ crc.c.143t.promotion2015-08-05 08:52:43.811132974 +1000
@@ -1,52 +1,78 @@

 ;; Function crc2 (crc2, funcdef_no=0, decl_uid=4146, cgraph_uid=0, 
symbol_order=0)


 crc2 (short unsigned int crc, unsigned char data)
 {
   unsigned char carry;
   unsigned char x16;
   unsigned char i;
-  unsigned char ivtmp_5;
-  unsigned char _9;
-  unsigned char _10;
-  unsigned char ivtmp_18;
+  unsigned int _2;
+  unsigned int _3;
+  unsigned int _5;
+  unsigned int _7;
+  unsigned int _8;
+  unsigned int _9;
+  unsigned int _10;
+  unsigned int _11;
+  unsigned int _12;
+  unsigned int _13;
+  unsigned int _15;
+  unsigned int _16;
+  unsigned int _18;
+  unsigned int _19;
+  unsigned int _21;
+ 

Go patch committed: Use the type context to determine type of a complex constant

2015-08-04 Thread Ian Lance Taylor
This patch by Chris Manghane fixes the Go frontend to use the type
context to determine the type of a complex constant.  This fixes
https://golang.org/issue/11572 .  Bootstrapped and ran Go testsuite on
x86_64-unknown-linux-gnu.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===
--- gcc/go/gofrontend/MERGE (revision 226543)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-bdd98c601f2c8dbd0bf821548ba09c038f7645c4
+df080adb06f0e423820f3f6b9604b0c1093ff20a
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: gcc/go/gofrontend/expressions.cc
===
--- gcc/go/gofrontend/expressions.cc(revision 226510)
+++ gcc/go/gofrontend/expressions.cc(working copy)
@@ -2396,7 +2396,7 @@ class Complex_expression : public Expres
 
   void
   do_dump_expression(Ast_dump_context*) const;
-  
+
  private:
   // The complex value.
   mpc_t val_;
@@ -2423,8 +2423,7 @@ Complex_expression::do_determine_type(co
 {
   if (this->type_ != NULL && !this->type_->is_abstract())
 ;
-  else if (context->type != NULL
-	   && context->type->complex_type() != NULL)
+  else if (context->type != NULL && context->type->is_numeric_type())
 this-type_ = context-type;
   else if (!context-may_be_abstract)
 this-type_ = Type::lookup_complex_type(complex128);


libgo patch committed: Backport some patches to GCC 5 branch

2015-08-04 Thread Ian Lance Taylor
I've backported three recent libgo patches to the GCC 5 branch, as follows.

Ian
Index: libgo/Makefile.am
===
--- libgo/Makefile.am   (revision 226591)
+++ libgo/Makefile.am   (working copy)
@@ -1676,7 +1676,17 @@ endif # !LIBGO_IS_LINUX
 # Define socket sizes and types.
 if LIBGO_IS_LINUX
 syscall_socket_file = go/syscall/socket_linux.go epoll.go
+if LIBGO_IS_PPC64LE
+syscall_socket_type_file = go/syscall/socket_linux_ppc64x_type.go
 else
+if LIBGO_IS_PPC64
+syscall_socket_type_file = go/syscall/socket_linux_ppc64x_type.go
+else
+syscall_socket_type_file = go/syscall/socket_linux_type.go
+endif
+endif
+else
+syscall_socket_type_file =
 if LIBGO_IS_SOLARIS
 syscall_socket_file = go/syscall/socket_solaris.go
 else
@@ -1762,6 +1772,7 @@ go_base_syscall_files = \
$(syscall_size_file) \
$(syscall_socket_file) \
$(syscall_socket_os_file) \
+   $(syscall_socket_type_file) \
$(syscall_uname_file) \
$(syscall_netlink_file) \
$(syscall_lsf_file) \
Index: libgo/go/syscall/socket_linux.go
===
--- libgo/go/syscall/socket_linux.go(revision 226591)
+++ libgo/go/syscall/socket_linux.go(working copy)
@@ -136,11 +136,6 @@ type RawSockaddrNetlink struct {
Groups uint32
 }
 
-type RawSockaddr struct {
-   Family uint16
-   Data   [14]int8
-}
-
 // BindToDevice binds the socket associated with fd to device.
 func BindToDevice(fd int, device string) (err error) {
return SetsockoptString(fd, SOL_SOCKET, SO_BINDTODEVICE, device)
Index: libgo/go/syscall/socket_linux_ppc64x_type.go
===
--- libgo/go/syscall/socket_linux_ppc64x_type.go(revision 0)
+++ libgo/go/syscall/socket_linux_ppc64x_type.go(working copy)
@@ -0,0 +1,14 @@
+// socket_linux_ppc64x_type.go -- Socket handling specific to ppc64 GNU/Linux.
+
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+// Type needed on ppc64le  ppc64
+
+type RawSockaddr struct {
+   Family uint16
+   Data   [14]uint8
+}
Index: libgo/go/syscall/socket_linux_type.go
===
--- libgo/go/syscall/socket_linux_type.go   (revision 0)
+++ libgo/go/syscall/socket_linux_type.go   (working copy)
@@ -0,0 +1,14 @@
+// socket_linux_type.go -- Socket handling specific to GNU/Linux.
+
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+// Type needed if not on ppc64le or ppc64
+
+type RawSockaddr struct {
+   Family uint16
+   Data   [14]int8
+}
Index: libgo/mksysinfo.sh
===
--- libgo/mksysinfo.sh  (revision 226591)
+++ libgo/mksysinfo.sh  (working copy)
@@ -201,6 +201,67 @@ enum {
 #ifdef TCSETS
   TCSETS_val = TCSETS,
 #endif
+#ifdef TUNSETIFF
+  TUNSETIFF_val = TUNSETIFF,
+#endif
+#ifdef TUNSETNOCSUM
+  TUNSETNOCSUM_val = TUNSETNOCSUM,
+#endif
+#ifdef TUNSETDEBUG
+  TUNSETDEBUG_val = TUNSETDEBUG,
+#endif
+#ifdef TUNSETPERSIST
+  TUNSETPERSIST_val = TUNSETPERSIST,
+#endif
+#ifdef TUNSETOWNER
+  TUNSETOWNER_val = TUNSETOWNER,
+#endif
+#ifdef TUNSETLINK
+  TUNSETLINK_val = TUNSETLINK,
+#endif
+#ifdef TUNSETGROUP
+  TUNSETGROUP_val = TUNSETGROUP,
+#endif
+#ifdef TUNGETFEATURES
+  TUNGETFEATURES_val = TUNGETFEATURES,
+#endif
+#ifdef TUNSETOFFLOAD
+  TUNSETOFFLOAD_val = TUNSETOFFLOAD,
+#endif
+#ifdef TUNSETTXFILTER
+  TUNSETTXFILTER_val = TUNSETTXFILTER,
+#endif
+#ifdef TUNGETIFF
+  TUNGETIFF_val = TUNGETIFF,
+#endif
+#ifdef TUNGETSNDBUF
+  TUNGETSNDBUF_val = TUNGETSNDBUF,
+#endif
+#ifdef TUNSETSNDBUF
+  TUNSETSNDBUF_val = TUNSETSNDBUF,
+#endif
+#ifdef TUNATTACHFILTER
+  TUNATTACHFILTER_val = TUNATTACHFILTER,
+#endif
+#ifdef TUNDETACHFILTER
+  TUNDETACHFILTER_val = TUNDETACHFILTER,
+#endif
+#ifdef TUNGETVNETHDRSZ
+  TUNGETVNETHDRSZ_val = TUNGETVNETHDRSZ,
+#endif
+#ifdef TUNSETVNETHDRSZ
+  TUNSETVNETHDRSZ_val = TUNSETVNETHDRSZ,
+#endif
+#ifdef TUNSETQUEUE
+  TUNSETQUEUE_val = TUNSETQUEUE,
+#endif
+#ifdef TUNSETIFINDEX
+  TUNSETIFINDEX_val = TUNSETIFINDEX,
+#endif
+#ifdef TUNGETFILTER
+  TUNGETFILTER_val = TUNGETFILTER,
+#endif
+
 };
 EOF
 
@@ -830,6 +891,9 @@ grep '^type _passwd ' gen-sysinfo.go | \
 grep '^const _TIOC' gen-sysinfo.go | \
 grep -v '_val =' | \
sed -e 's/^\(const \)_\(TIOC[^= ]*\)\(.*\)$/\1\2 = _\2/' >> ${OUT}
+grep '^const _TUNSET' gen-sysinfo.go | \
+grep -v '_val =' | \
sed -e 's/^\(const \)_\(TUNSET[^= ]*\)\(.*\)$/\1\2 = _\2/' >> ${OUT}
 # We need TIOCGWINSZ.
if ! grep '^const TIOCGWINSZ' ${OUT} >/dev/null 2>&1; then
  if grep '^const _TIOCGWINSZ_val' ${OUT} >/dev/null 2>&1; then
@@ -872,6 

Re: [PATCH], PR target/67071, Improve easy_altivec_constants on PowerPC

2015-08-04 Thread David Edelsohn
On Mon, Aug 3, 2015 at 5:49 PM, Michael Meissner
meiss...@linux.vnet.ibm.com wrote:
 In preparing the next IEEE 128-bit floating point patch, I needed a quick way
 to load -0.0q into a vector registers (i.e. just the MSB set). I originally 
 had
 a special purpose insn to load this value, but I decided to widen it to allow
 the easy_altivec_constant support to generate constants where you use the
 VSLDOI instruction to create the bottom part of all 0's or all 1's.

 When I started doing the coding, I noticed that the current support to load
 vectors with the MSB set in each element no longer worked, because the test
 assumed the constant was stored as an unsigned value, and we now store a sign
 extended number. I raised PR 67071 about this, and this patch fixes that
 problem and adds more patterns of vector constants that can be loaded without
 using memory.

 I have built this on both big endian power7 and little endian power8 machines
 with no regressions. Can I install the patch? I would also like to backport
 this patch to the active branches.

 [gcc]
 2015-08-03  Michael Meissner  meiss...@linux.vnet.ibm.com

 PR target/67071
 * config/rs6000/predicates.md (easy_vector_constant_vsldoi): New
 predicate to allow construction of vector constants using the
 VSLDOI vector shift instruction.

 * config/rs6000/rs6000-protos.h (vspltis_shifted): Add
 declaration.

 * config/rs6000/rs6000.c (vspltis_shifted): New function to return
 the number of bytes to be shifted left and filled in with either
 all zero or all one bits.
 (gen_easy_altivec_constant): Call vsplitis_shifted if no other
 methods exist.
 (output_vec_const_move): On power8, generate XXLORC to generate
 a vector constant with all 1's. Do a split if we need to use a
 VSLDOI instruction.

 * config/rs6000/rs6000.h (EASY_VECTOR_MSB): Use mode mask to
 properly test for the MSB.

 * config/rs6000/altivec.md (VSLDOI splitter): Add splitter for
 vector constants that can be created with VSLDOI.

 [gcc/testsuite]
 2015-08-03  Michael Meissner  meiss...@linux.vnet.ibm.com

 PR target/67071
 * gcc.target/powerpc/pr67071-1.c: New file to test PR 67071 new
 vector constants.
 * gcc.target/powerpc/pr67071-2.c: Likewise.
 * gcc.target/powerpc/pr67071-3.c: Likewise.

This okay, but please fix the formatting of EASY_VECTOR_MSB, which
does not follow GCC formatting style.

 #define EASY_VECTOR_MSB(n,mode) \
-  (((unsigned HOST_WIDE_INT)n) == \
+  ((((unsigned HOST_WIDE_INT)n) & GET_MODE_MASK (mode)) == \
    ((((unsigned HOST_WIDE_INT)GET_MODE_MASK (mode)) + 1) >> 1))

Spaces after the casts and parentheses around the first macro argument n.

Thanks, David


Re: offload data version number

2015-08-04 Thread Nathan Sidwell

On 08/04/15 12:17, Thomas Schwinge wrote:

Hi!

Nathan's patch is waiting for trunk approval:



Then, for convenience, I'm also again attaching Nathan's patch:
trunk-version-4.patch.

Nathan, for the trunk commit, I suggest you simply merge my patch into
yours.


Yes, I'll atomically commit them.

nathan

--
Nathan Sidwell


Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread H.J. Lu
On Tue, Aug 4, 2015 at 10:43 AM, Segher Boessenkool
seg...@kernel.crashing.org wrote:
 On Tue, Aug 04, 2015 at 10:28:00AM -0700, H.J. Lu wrote:
  Any comments on my middle-end patch?
 
  So, if the answer is the same as frame_address (0), why not have the 
  fallback just expand to that?  Then, one can use this builtin everywhere 
  that frame address is used today.  People that want a faster, tighter port 
  can then implement the hook and achieve higher performance.

 The motivation of __builtin_stack_top is that frame_address requires a
 frame pointer register, which isn't desirable for x86.  __builtin_stack_top
 doesn't require a frame pointer register.

 If the target just returns frame_pointer_rtx from INITIAL_FRAME_ADDRESS_RTX,
 you don't get crtl-accesses_prior_frames set either, and as far as I can
 see everything works fine?  For __builtin_frame_address(0).

 You might have a reason why you want the entry stack address instead of the
 frame address, but you didn't really explain I think?  Or I missed it.


expand_builtin_return_addr sets

crtl->accesses_prior_frames = 1;

for __builtin_frame_address, which requires a frame pointer register.
__builtin_stack_top doesn't set crtl->accesses_prior_frames and frame
pointer register isn't required.
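
For reference, a minimal sketch of the intended use (the builtin is the
one proposed in this thread, not something in released GCC):

  void *
  entry_stack (void)
  {
    /* Proposed semantics: the stack address at function entry,
       without forcing a frame pointer the way
       __builtin_frame_address (0) does.  */
    return __builtin_stack_top ();
  }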

-- 
H.J.


Re: RFA: RL78: Remove far operand optimization in rl78_force_nonfar_3

2015-08-04 Thread DJ Delorie

This is OK, but note that it prevents some operations like:

__far int i;

foo()
{
  i ++;
}

from being implemented with a minimum set of opcodes.  This might be
particularly troublesome for volatile far things.



Re: [C++17] Implement N3928 - Extending static_assert

2015-08-04 Thread Paolo Carlini

Hi,

On 06/24/2015 05:32 PM, Ed Smith-Rowland wrote:

Index: testsuite/g++.dg/cpp1z/static_assert-nomsg.C
===
--- testsuite/g++.dg/cpp1z/static_assert-nomsg.C(revision 0)
+++ testsuite/g++.dg/cpp1z/static_assert-nomsg.C(working copy)
@@ -0,0 +1,27 @@
+// { dg-do compile { target c++1z } }
As far as I can see, we can't really use this syntax in the cpp1z 
directory: the testcase is effectively skipped. Indeed, we don't use it 
anywhere else. I'm going to replace it with the straightforward:


// { dg-options "-std=c++1z" }

Thanks,
Paolo.




Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread Mike Stump
On Aug 4, 2015, at 8:44 AM, H.J. Lu hjl.to...@gmail.com wrote:
 On Tue, Aug 4, 2015 at 8:40 AM, Mike Stump mikest...@comcast.net wrote:
 On Aug 4, 2015, at 5:30 AM, H.J. Lu hjl.to...@gmail.com wrote:
 Where does this feature belong?
 
 I prefer the middle end.
 
 Any comments on my middle-end patch?

So, if the answer is the same as frame_address (0), why not have the fallback 
just expand to that?  Then, one can use this builtin everywhere that frame 
address is used today.  People that want a faster, tighter port can then 
implement the hook and achieve higher performance.

Re: offload data version number

2015-08-04 Thread Thomas Schwinge
Hi!

Nathan's patch is waiting for trunk approval:

On Sat, 1 Aug 2015 20:06:39 -0400, Nathan Sidwell nat...@acm.org wrote:
 On 07/31/15 12:10, Jakub Jelinek wrote:
 
  This will hopefully be just GOMP_4.1 instead in the end, but it can
  change when gomp-4_1-branch is merged to trunk, we don't guarantee
  ABI stability at this point.
 
 Sure.
 
  I'd prefer version to go after devicep argument rather than before.
 Fixed.
 
  And really don't have ver_func vs. unver_func, just a single
  callback that will take the version argument too (again, if possible
  after target_id).
 
 Fixed (& elsewhere).  The patch should be checked for intelmic if possible 
 (Ilya?).  The changes there are very mechanical so I'm not expecting a 
 problem.
 
 We don't need to make the initial value of GOMP_VERSION non-zero, because 
 the absence of the GOMP_OFFLOAD_version func will distinguish out of date 
 plugins at this point.
 
 
  +
  +  if (DLSYM_OPT (version, version))
 
  I'd prefer requiring version always (i.e. DLSYM (version);
  plus the v != GOMP_VERSION checking).
 
 Fixed.
 
 ok?

I'd found one infrastructure-class problem in the libgomp intelmic
plugin build, which I addressed on gomp-4_0-branch in r226497,
http://news.gmane.org/find-root.php?message_id=%3C878u9soc60.fsf%40kepler.schwinge.homeip.net%3E,
patch again attached here, for easy reference (applies to trunk as-is):
0001-Fix-intelmic-libgomp-plugin-build.patch.

Then, for convenience, I'm also again attaching Nathan's patch:
trunk-version-4.patch.

Nathan, for the trunk commit, I suggest you simply merge my patch into
yours.


Grüße,
 Thomas


From 4e0158f41a00d6c4d09ca8a48eb63832abdd2f84 Mon Sep 17 00:00:00 2001
From: tschwinge tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4
Date: Mon, 3 Aug 2015 11:14:24 +
Subject: [PATCH] Fix intelmic libgomp plugin build

... which got broken in r226469:

[...]/source-gcc/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp: In function 'unsigned int GOMP_OFFLOAD_version()':
[...]/source-gcc/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp:336:10: error: 'GOMP_VERSION' was not declared in this scope
   return GOMP_VERSION;
  ^
[...]/source-gcc/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp: In function 'int GOMP_OFFLOAD_load_image(int, unsigned int, void*, addr_pair**)':
[...]/source-gcc/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp:345:32: error: 'GOMP_VERSION_DEV' was not declared in this scope
   if (GOMP_VERSION_DEV (version)  GOMP_VERSION_INTEL_MIC)
^
[...]/source-gcc/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp:345:36: error: 'GOMP_VERSION_INTEL_MIC' was not declared in this scope
   if (GOMP_VERSION_DEV (version)  GOMP_VERSION_INTEL_MIC)
^
[...]/source-gcc/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp: In function 'void GOMP_OFFLOAD_unload_image(int, unsigned int, const void*)':
[...]/source-gcc/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp:373:32: error: 'GOMP_VERSION_DEV' was not declared in this scope
   if (GOMP_VERSION_DEV (version)  GOMP_VERSION_INTEL_MIC)
^
[...]/source-gcc/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp:373:36: error: 'GOMP_VERSION_INTEL_MIC' was not declared in this scope
   if (GOMP_VERSION_DEV (version)  GOMP_VERSION_INTEL_MIC)
^
make[6]: *** [libgomp_plugin_intelmic_la-libgomp-plugin-intelmic.lo] Error 1

	liboffloadmic/
	* plugin/Makefile.am (include_src_dir): Set.
	[PLUGIN_HOST] (libgomp_plugin_intelmic_la_CPPFLAGS): Use it.
	* plugin/Makefile.in: Regenerate.
	* plugin/libgomp-plugin-intelmic.cpp: Include gomp-constants.h.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@226497 138bc75d-0d04-0410-961f-82ee72b054a4
---
 liboffloadmic/ChangeLog.gomp | 7 +++
 liboffloadmic/plugin/Makefile.am | 3 ++-
 liboffloadmic/plugin/Makefile.in | 3 ++-
 liboffloadmic/plugin/libgomp-plugin-intelmic.cpp | 3 ++-
 4 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/liboffloadmic/ChangeLog.gomp b/liboffloadmic/ChangeLog.gomp
index 93d1e02..adb9e05 100644
--- a/liboffloadmic/ChangeLog.gomp
+++ b/liboffloadmic/ChangeLog.gomp
@@ -1,3 +1,10 @@
+2015-08-03  Thomas Schwinge  tho...@codesourcery.com
+
+	* plugin/Makefile.am (include_src_dir): Set.
+	[PLUGIN_HOST] (libgomp_plugin_intelmic_la_CPPFLAGS): Use it.
+	* plugin/Makefile.in: Regenerate.
+	* plugin/libgomp-plugin-intelmic.cpp: Include gomp-constants.h.
+
 2015-08-01  Nathan Sidwell  nat...@codesourcery.com
 
 	* plugin/libgomp-plugin-intelmic.cpp (GOMP_OFFLOAD_version): New.
diff --git a/liboffloadmic/plugin/Makefile.am b/liboffloadmic/plugin/Makefile.am
index 19d69ab..6ec444c 100644
--- a/liboffloadmic/plugin/Makefile.am
+++ b/liboffloadmic/plugin/Makefile.am
@@ -36,6 +36,7 @@ build_dir = $(top_builddir)
 source_dir = 

[PATCH, contrib]: texi2pod.pl: Escape braces in regexp involving @strong{...}

2015-08-04 Thread Uros Bizjak
... to avoid

perl ../../gcc-svn/trunk/gcc/../contrib/texi2pod.pl
../../gcc-svn/trunk/gcc/doc/invoke.texi > gcc.pod
Unescaped left brace in regex is deprecated, passed through in regex;
marked by <-- HERE in m/^\@strong{ <-- HERE (.*)}$/ at
../../gcc-svn/trunk/gcc/../contrib/texi2pod.pl line 319.

2015-08-04  Uros Bizjak  ubiz...@gmail.com

* texi2pod.pl: Escape braces in regexp involving @strong{...}.

Bootstrapped on Fedora 22 and committed to mainline SVN.

Uros.

Index: texi2pod.pl
===
--- texi2pod.pl (revision 226581)
+++ texi2pod.pl (working copy)
@@ -316,7 +316,7 @@
   @columns = ();
   for $column (split (/\s*\@tab\s*/, $1)) {
   # @strong{...} is used a @headitem work-alike
-   $column =~ s/^\@strong{(.*)}$/$1/;
+   $column =~ s/^\@strong\{(.*)\}$/$1/;
   push @columns, $column;
   }
   $_ = "\n=item ".join (" : ", @columns)."\n";


[PATCH, libstdc++, testsuite] Remove redundant -save-temps options

2015-08-04 Thread Nikolai Bozhenov

Hi,

the attached patch removes redundant -save-temps options from some libstdc++
tests, since the option is not needed in dg-do-compile/scan-assembler tests.

Thanks,
Nikolai

2015-08-04  Nikolai Bozhenov  n.bozhe...@samsung.com

	* testsuite/20_util/enable_shared_from_this/cons/constexpr.cc: Remove
	redundant -save-temps option.
	* testsuite/20_util/shared_ptr/cons/constexpr.cc: Likewise.
	* testsuite/20_util/unique_ptr/cons/constexpr.cc: Likewise.
	* testsuite/20_util/weak_ptr/cons/constexpr.cc: Likewise.
	* testsuite/30_threads/future/cons/constexpr.cc: Likewise.
	* testsuite/30_threads/shared_future/cons/constexpr.cc: Likewise.
  

diff --git a/libstdc++-v3/testsuite/20_util/enable_shared_from_this/cons/constexpr.cc b/libstdc++-v3/testsuite/20_util/enable_shared_from_this/cons/constexpr.cc
index 78d8f06..18bf0c7 100644
--- a/libstdc++-v3/testsuite/20_util/enable_shared_from_this/cons/constexpr.cc
+++ b/libstdc++-v3/testsuite/20_util/enable_shared_from_this/cons/constexpr.cc
@@ -1,5 +1,5 @@
 // { dg-do compile }
-// { dg-options "-std=gnu++11 -fno-inline -save-temps -g0" }
+// { dg-options "-std=gnu++11 -fno-inline -g0" }
 // { dg-final { scan-assembler-not "_ZNSt23enable_shared_from_thisIiEC2Ev" } }
 // { dg-final { scan-assembler-not "_ZN7derivedC2Ev" } }
 
diff --git a/libstdc++-v3/testsuite/20_util/shared_ptr/cons/constexpr.cc b/libstdc++-v3/testsuite/20_util/shared_ptr/cons/constexpr.cc
index 0c9e9b2..63cc60b 100644
--- a/libstdc++-v3/testsuite/20_util/shared_ptr/cons/constexpr.cc
+++ b/libstdc++-v3/testsuite/20_util/shared_ptr/cons/constexpr.cc
@@ -1,5 +1,5 @@
 // { dg-do compile }
-// { dg-options "-std=gnu++11 -fno-inline -save-temps -g0" }
+// { dg-options "-std=gnu++11 -fno-inline -g0" }
 // { dg-final { scan-assembler-not "_ZNSt10shared_ptrIiEC2Ev" } }
 // { dg-final { scan-assembler-not "_ZNSt10shared_ptrIiEC2EDn" } }
 
diff --git a/libstdc++-v3/testsuite/20_util/unique_ptr/cons/constexpr.cc b/libstdc++-v3/testsuite/20_util/unique_ptr/cons/constexpr.cc
index 4d6ae77..f118415 100644
--- a/libstdc++-v3/testsuite/20_util/unique_ptr/cons/constexpr.cc
+++ b/libstdc++-v3/testsuite/20_util/unique_ptr/cons/constexpr.cc
@@ -1,5 +1,5 @@
 // { dg-do compile }
-// { dg-options "-std=gnu++11 -fno-inline -save-temps -g0" }
+// { dg-options "-std=gnu++11 -fno-inline -g0" }
 // { dg-final { scan-assembler-not "_ZNSt10unique_ptrIiSt14default_deleteIiEEC2Ev" } }
 // { dg-final { scan-assembler-not "_ZNSt10unique_ptrIiSt14default_deleteIiEEC2EDn" } }
 
diff --git a/libstdc++-v3/testsuite/20_util/weak_ptr/cons/constexpr.cc b/libstdc++-v3/testsuite/20_util/weak_ptr/cons/constexpr.cc
index b5eff55..6b77e9b 100644
--- a/libstdc++-v3/testsuite/20_util/weak_ptr/cons/constexpr.cc
+++ b/libstdc++-v3/testsuite/20_util/weak_ptr/cons/constexpr.cc
@@ -1,5 +1,5 @@
 // { dg-do compile }
-// { dg-options "-std=gnu++11 -fno-inline -save-temps -g0" }
+// { dg-options "-std=gnu++11 -fno-inline -g0" }
 // { dg-final { scan-assembler-not "_ZNSt8weak_ptrIiEC2Ev" } }
 
 // Copyright (C) 2010-2015 Free Software Foundation, Inc.
diff --git a/libstdc++-v3/testsuite/30_threads/future/cons/constexpr.cc b/libstdc++-v3/testsuite/30_threads/future/cons/constexpr.cc
index 0ec5fda..11945ad 100644
--- a/libstdc++-v3/testsuite/30_threads/future/cons/constexpr.cc
+++ b/libstdc++-v3/testsuite/30_threads/future/cons/constexpr.cc
@@ -1,5 +1,5 @@
 // { dg-do compile }
-// { dg-options "-std=gnu++11 -fno-inline -save-temps -g0" }
+// { dg-options "-std=gnu++11 -fno-inline -g0" }
 // { dg-require-cstdint "" }
 // { dg-require-gthreads "" }
 // { dg-require-atomic-builtins "" }
diff --git a/libstdc++-v3/testsuite/30_threads/shared_future/cons/constexpr.cc b/libstdc++-v3/testsuite/30_threads/shared_future/cons/constexpr.cc
index 11826e1..eebf797 100644
--- a/libstdc++-v3/testsuite/30_threads/shared_future/cons/constexpr.cc
+++ b/libstdc++-v3/testsuite/30_threads/shared_future/cons/constexpr.cc
@@ -1,5 +1,5 @@
 // { dg-do compile }
-// { dg-options "-std=gnu++11 -fno-inline -save-temps -g0" }
+// { dg-options "-std=gnu++11 -fno-inline -g0" }
 // { dg-require-cstdint "" }
 // { dg-require-gthreads "" }
 // { dg-require-atomic-builtins "" }


Re: [PATCH 3/4] Add libgomp plugin for Intel MIC

2015-08-04 Thread David Malcolm
On Mon, 2015-08-03 at 13:23 +0300, Maxim Blumental wrote:
 Could you probably review the patch, please?

Sorry, I'm not the best person to review the patch: Jakub CCed me for my
knowledge of python, so I ported his script to work with both python 2
and 3, and it ought to work with early python 2 versions (or be easily
fixable).

It looks like you're using the resulting python script I wrote.  Other
than that, I don't have reviewer-level expertise in the domains of the
rest of the patch (e.g. Intel MIC, and or the build system).

 2015-07-28 18:42 GMT+03:00 Maxim Blumental bvm...@gmail.com:
   Applied the idea with python script alternative. Review, please.
 
  2015-07-24 17:18 GMT+03:00 David Malcolm dmalc...@redhat.com:
  On Fri, 2015-07-24 at 10:01 +0200, Jakub Jelinek wrote:
  #!/usr/bin/python
  import sys
  with open(sys.argv[1],"rb") as f:
      nextblock = f.read(12)
      while 1:
          block = nextblock
          nextblock = f.read(12)
          if block == "":
              break
          str = ""
          for ch in block:
              if str == "":
                  str = "  "
              else:
                  str += ", "
              if ord(ch) < 10:
                  str += "0x0" + chr(ord('0')+ord(ch))
              elif ord(ch) < 16:
                  str += "0x0" + chr(ord('a')+ord(ch)-10)
              else:
                  str += hex(ord(ch))
          if nextblock != "":
              str += ","
          print str
 
  python ./xxd.py $< > $@
  does the same thing as
  cat $< | xxd -include > $@
  (CCing David as python expert, my python knowledge is limited and
  15 years old, not sure how portable this is (python 2 vs. python 3,
  and
  even python 2 minimal versions)).
 
  It doesn't work with Python 3 for various reasons (print syntax, and
  str vs bytes issues).
 
  I'm attaching a version which works with both Python 2 and Python 3
  (2.7.5 and 3.3.2 were the versions I tried).
 
  It ought to work with much older python 2 versions (as your script
  appears to), but I don't have them handy.
 
  Presumably it would need a license header and some descriptive comments.
 
  (snip)
 
  Dave




Re: [RFC] [Patch]: Try and vectorize with shift for mult expr with power 2 integer constant.

2015-08-04 Thread Richard Sandiford
Richard Biener richard.guent...@gmail.com writes:
 On August 4, 2015 4:28:26 PM GMT+02:00, Richard Sandiford
 richard.sandif...@arm.com wrote:
Richard Biener richard.guent...@gmail.com writes:
 On Tue, Aug 4, 2015 at 4:21 PM, Richard Biener
 richard.guent...@gmail.com wrote:
 On Tue, Aug 4, 2015 at 4:15 PM, Richard Sandiford
 richard.sandif...@arm.com wrote:
 Richard Biener richard.guent...@gmail.com writes:
 in fact the first if () looks redundant if you simply put an else
 return NULL
 after a else if (wi::exact_log2 (wi::neg (oprnd1)) != -1)

 Note that the issue with INT_MIN is that wi::neg (INT_MIN) is
INT_MIN
 again, but it seems that wi::exact_log2 returns -1 in that case so
you
 are fine (and in fact not handling this case).

 Are you sure it returns -1 for INT_MIN?  It isn't supposed to,
assuming
 INT_MIN is shorthand for minimum value for a signed type. 
wide_ints
 aren't signed, so INT_MIN is indistinguishable from an unsigned
 1<<(prec-1).

 No, not sure.  I spotted

   /* Reject cases where there are implicit -1 blocks above HIGH.  */
   if (x.len * HOST_BITS_PER_WIDE_INT < x.precision && x.sign_mask () < 0)
     return -1;

 and thought that would catch it.  I mean the tree value is negative
so
 exact_log2 must see it is a negative value.

That's handling the compressed format, e.g.:

  {1 << 63}

as a 64-bit short-hand for a 256-bit:

  {1 << 63,-1,-1,-1}

In this case more than one of the low x.precision bits are known to be
set.

 So you are saying exact_log2 is really exact_log2u?

Not sure what you mean, sorry.  All I meant was that a number like:

  0xffffffff_ffffffff_ffffffff_80000000

(using 32-bit rather than 64-bit elements for brevity) is stored as
a single {0x80000000}, with the upper 3 elements being implicit.  And:

  exact_log2 (0xffffffff_ffffffff_ffffffff_80000000)

is obviously -1.  That's the case that the code above is handling.
If there are implicit all-1 blocks, the number is not a power of 2.
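
As a self-contained sketch of that argument (simplified types and made-up
field names, not the real wide-int API):

  #include <stdint.h>

  struct wide_val {
    uint64_t blocks[4];  /* explicit blocks, least significant first */
    int len;             /* number of explicit 64-bit blocks */
    int precision;       /* total precision in bits */
  };

  /* Blocks above LEN are implicit sign extensions of the top explicit
     block.  Implicit -1 blocks mean more than one bit is set, so the
     value cannot be an exact power of 2.  */
  static int
  exact_log2_sketch (const struct wide_val *x)
  {
    if (x->len * 64 < x->precision && (int64_t) x->blocks[x->len - 1] < 0)
      return -1;
    int pos = -1;
    for (int i = 0; i < x->len; i++)
      for (int b = 0; b < 64; b++)
        if (x->blocks[i] & ((uint64_t) 1 << b))
          {
            if (pos >= 0)
              return -1;  /* more than one explicit bit set */
            pos = i * 64 + b;
          }
    return pos;
  }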

Thanks,
Richard



RE: [RFC] [Patch]: Try and vectorize with shift for mult expr with power 2 integer constant.

2015-08-04 Thread Kumar, Venkataramanan
Hi Richard,


 -Original Message-
 From: Richard Biener [mailto:richard.guent...@gmail.com]
 Sent: Tuesday, August 04, 2015 4:07 PM
 To: Kumar, Venkataramanan
 Cc: Jeff Law; Jakub Jelinek; gcc-patches@gcc.gnu.org
 Subject: Re: [RFC] [Patch]: Try and vectorize with shift for mult expr with
 power 2 integer constant.
 
 On Tue, Aug 4, 2015 at 10:52 AM, Kumar, Venkataramanan
 venkataramanan.ku...@amd.com wrote:
  Hi Jeff,
 
  -Original Message-
  From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
  ow...@gcc.gnu.org] On Behalf Of Jeff Law
  Sent: Monday, August 03, 2015 11:42 PM
  To: Kumar, Venkataramanan; Jakub Jelinek
  Cc: Richard Beiner (richard.guent...@gmail.com);
  gcc-patches@gcc.gnu.org
  Subject: Re: [RFC] [Patch]: Try and vectorize with shift for mult
  expr with power 2 integer constant.
 
  On 08/02/2015 05:03 AM, Kumar, Venkataramanan wrote:
   Hi Jakub,
  
   Thank you for reviewing the patch.
  
   I have incorporated your comments in the attached patch.
  Note Jakub is on PTO for the next 3 weeks.
 
   Thank you for this information.
 
 
 
  
  
  
   vectorize_mults_via_shift.diff.txt
  
  
   diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
   b/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
  Jakub would probably like more testcases :-)
 
  The most obvious thing to test would be other shift factors.
 
  A negative test to verify we don't try to turn a multiply by
  non-constant or multiply by a constant that is not a power of 2 into 
  shifts.
 
  I have added negative test in the attached patch.
 
 
 
  [ Would it make sense, for example, to turn a multiply by 3 into a
  shift-add sequence?  As Jakub said, choose_mult_variant can be your
  friend. ]
 
  Yes I will do that in a follow up patch.
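
For reference, the shift-add form referred to is the usual strength
reduction; a generic scalar sketch, not the follow-up patch:

  unsigned int
  mul3 (unsigned int x)
  {
    return (x << 1) + x;  /* x * 3 as shift-add */
  }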
 
  The new change log becomes
 
  gcc/ChangeLog
  2015-08-04  Venkataramanan Kumar
 venkataramanan.ku...@amd.com
   * tree-vect-patterns.c (vect_recog_mult_pattern): New function for
 vectorizing
  multiplication patterns.
   * tree-vectorizer.h: Adjust the number of patterns.
 
  gcc/testsuite/ChangeLog
  2015-08-04  Venkataramanan Kumar
 venkataramanan.ku...@amd.com
   * gcc.dg/vect/vect-mult-pattern-1.c: New
  * gcc.dg/vect/vect-mult-pattern-2.c: New
 
  Bootstrapped and reg tested on aarch64-unknown-linux-gnu.
 
  Ok for trunk ?
 
 +  if (TREE_CODE (oprnd0) != SSA_NAME
 +  || TREE_CODE (oprnd1) != INTEGER_CST
 +  || TREE_CODE (itype) != INTEGER_TYPE
 
 INTEGRAL_TYPE_P (itype)
 
 +  optab = optab_for_tree_code (LSHIFT_EXPR, vectype, optab_vector);
 +  if (!optab
 +      || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
 +   return NULL;
 +
 +
 
 indent of the return stmt looks wrong
 
 +  /* Handle constant operands that are postive or negative powers of 2.  */
 +  if ( wi::exact_log2 (oprnd1) != -1  ||
 +       wi::exact_log2 (wi::neg (oprnd1)) != -1)
 
 no space after (, || goes to the next line.
 
 +{
 +  tree shift;
 +
 +  if (wi::exact_log2 (oprnd1) != -1)
 
 please cache wi::exact_log2
 
 in fact the first if () looks redundant if you simply put an else return NULL
 after a else if (wi::exact_log2 (wi::neg (oprnd1)) != -1)
 
 Note that the issue with INT_MIN is that wi::neg (INT_MIN) is INT_MIN
 again, but it seems that wi::exact_log2 returns -1 in that case so you are 
 fine
 (and in fact not handling this case).
 

I have updated your review comments in the attached patch. 

For the INT_MIN case, I am getting vectorized output with the patch.  I 
believe x86_64 also vectorizes, but does not negate the results.

#include <limits.h>
unsigned long int  __attribute__ ((aligned (64))) arr[100];

int i;
#if 1
void test_vector_shifts()
{
for(i=0; i<=99;i++)
arr[i]=arr[i] * INT_MIN;
}
#endif

void test_vectorshift_via_mul()
{
for(i=0; i<=99;i++)
arr[i]=arr[i]*(-INT_MIN);

}

Before 
-
ldr x1, [x0]
neg x1, x1, lsl 31
str x1, [x0], 8
cmp x0, x2

After 
---
ldr q0, [x0]
shl v0.2d, v0.2d, 31
neg v0.2d, v0.2d
str q0, [x0], 16
cmp x1, x0

is this fine ?  

  Thanks,
 Richard.
 
 
 
 
   @@ -2147,6 +2152,140 @@ vect_recog_vector_vector_shift_pattern
  (vec<gimple> *stmts,
   return pattern_stmt;
 }
  
   +/* Detect multiplication by constant which are postive or
   +negatives of power 2,
  s/postive/positive/
 
 
  Jeff
 
  Regards,
  Venkat.
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-1.c
new file mode 100644
index 000..764d0e3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mult-pattern-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+
+unsigned  long int __attribute__ ((aligned (64)))arr[100];
+int i;
+
+void test_for_vectorshifts_via_mul_with_power2_const ()
+{
+  for (i=0; i<=99; 

Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread H.J. Lu
On Tue, Aug 4, 2015 at 10:16 AM, Mike Stump mikest...@comcast.net wrote:
 On Aug 4, 2015, at 8:44 AM, H.J. Lu hjl.to...@gmail.com wrote:
 On Tue, Aug 4, 2015 at 8:40 AM, Mike Stump mikest...@comcast.net wrote:
 On Aug 4, 2015, at 5:30 AM, H.J. Lu hjl.to...@gmail.com wrote:
 Where does this feature belong?

 I prefer the middle end.

 Any comments on my middle-end patch?

 So, if the answer is the same as frame_address (0), why not have the fallback 
 just expand to that?  Then, one can use this builtin everywhere that frame 
 address is used today.  People that want a faster, tighter port can then 
 implement the hook and achieve higher performance.

The motivation of __builtin_stack_top is that frame_address requires a
frame pointer register, which isn't desirable for x86.  __builtin_stack_top
doesn't require a frame pointer register.

-- 
H.J.


Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread Segher Boessenkool
On Tue, Aug 04, 2015 at 10:28:00AM -0700, H.J. Lu wrote:
  Any comments on my middle-end patch?
 
  So, if the answer is the same as frame_address (0), why not have the 
  fallback just expand to that?  Then, one can use this builtin everywhere 
  that frame address is used today.  People that want a faster, tighter port 
  can then implement the hook and achieve higher performance.
 
 The motivation of __builtin_stack_top is that frame_address requires a
 frame pointer register, which isn't desirable for x86.  __builtin_stack_top
 doesn't require a frame pointer register.

If the target just returns frame_pointer_rtx from INITIAL_FRAME_ADDRESS_RTX,
 you don't get crtl->accesses_prior_frames set either, and as far as I can
see everything works fine?  For __builtin_frame_address(0).

You might have a reason why you want the entry stack address instead of the
frame address, but you didn't really explain I think?  Or I missed it.


Segher


Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread H.J. Lu
On Tue, Aug 4, 2015 at 11:50 AM, H.J. Lu hjl.to...@gmail.com wrote:
 On Tue, Aug 4, 2015 at 10:43 AM, Segher Boessenkool
 seg...@kernel.crashing.org wrote:
 On Tue, Aug 04, 2015 at 10:28:00AM -0700, H.J. Lu wrote:
  Any comments on my middle-end patch?
 
  So, if the answer is the same as frame_address (0), why not have the 
  fallback just expand to that?  Then, one can use this builtin everywhere 
  that frame address is used today.  People that want a faster, tighter 
  port can then implement the hook and achieve higher performance.

 The motivation of __builtin_stack_top is that frame_address requires a
 frame pointer register, which isn't desirable for x86.  __builtin_stack_top
 doesn't require a frame pointer register.

 If the target just returns frame_pointer_rtx from INITIAL_FRAME_ADDRESS_RTX,
 you don't get crtl-accesses_prior_frames set either, and as far as I can
 see everything works fine?  For __builtin_frame_address(0).

 You might have a reason why you want the entry stack address instead of the
 frame address, but you didn't really explain I think?  Or I missed it.


 expand_builtin_return_addr sets

 crtl->accesses_prior_frames = 1;

 for __builtin_frame_address, which requires a frame pointer register.
 __builtin_stack_top doesn't set crtl->accesses_prior_frames and frame
 pointer register isn't required.


BTW, x86 doesn't define INITIAL_FRAME_ADDRESS_RTX.

-- 
H.J.


Re: [RFC] COMDAT Safe Module Level Multi versioning

2015-08-04 Thread Sriraman Tallam
On Tue, Jun 16, 2015 at 4:22 PM, Sriraman Tallam tmsri...@google.com wrote:
 On Tue, May 19, 2015 at 9:11 AM, Xinliang David Li davi...@google.com wrote:

 Hm.  But which options are unsafe?  Also wouldn't it be better to simply
 _not_ have unsafe options produce comdats but always make local clones
 for them (thus emit the comdat with unsafe flags dropped)?

 Always localize comdat functions may lead to text size increase. It
 does not work if the comdat function is a virtual function for
 instance.

 Based on Richard's suggestion, I have a patch to localize comdat
 functions which seems like a very effective solution to this problem.
 The text size increase is limited to the extra comdat copies generated
 for the specialized modules (modules with unsafe options) which is
 usually only a few.   Since -fweak does something similar for
 functions,  I have called the new option -fweak-comdat-functions.
 This does not apply to virtual comdat functions as their addresses can
 always be leaked via the vtable. Using this flag with virtual
 functions generates a warning.

 To summarize, this is the intended usage of this option. Modules which
 use unsafe code options, like -misa for multiversioning, to generate
 code that is meant to run only on a subset of CPUs can generate
 comdats with specialized instructions which when picked by the linker
 can get run unconditionally causing SIGILL on unsupported platforms.
 This flag hides these comdats to be local to these modules and not
 make them available publicly,  with the caveat that it does not apply
 to virtual comdats.

 Could you please review?

Ping.  This patch uses Richard's suggestion to localize comdat
functions with option -fno-weak-comdat-functions.  Comments?
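
To make the hazard concrete, a generic sketch (not the patch's testcase):

  // inline.h: included by two translation units, one compiled with
  // unsafe options (say -mavx2), one without.  Both emit f() as a
  // comdat; the linker keeps one copy arbitrarily, so the specialized
  // body can survive and later run on a CPU lacking the feature,
  // raising SIGILL.
  inline int f (int x) { return x * 3; }

With -fno-weak-comdat-functions the module built with unsafe options
keeps its copy of f() local instead of exporting the comdat.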

* c-family/c.opt (fweak-comdat-functions): New option.
* cp/decl2.c (comdat_linkage): Implement new option.  Warn when
virtual comdat functions are seen.
* doc/invoke.texi: Document new option.
* testsuite/g++.dg/no-weak-comdat-functions-1.C: New test.



 * c-family/c.opt (fweak-comdat-functions): New option.
 * cp/decl2.c (comdat_linkage): Implement new option.  Warn when
 virtual comdat functions are seen.
 * doc/invoke.texi: Document new option.
 * testsuite/g++.dg/no-weak-comdat-functions-1.C: New test.


 Thanks
 Sri



 David



 Richard.


 Thanks
 Sri
* c-family/c.opt (fweak-comdat-functions): New option.
* cp/decl2.c (comdat_linkage): Implement new option.  Warn when
virtual comdat functions are seen.
* doc/invoke.texi: Document new option.
* testsuite/g++.dg/no-weak-comdat-functions-1.C: New test.

Index: c-family/c.opt
===
--- c-family/c.opt  (revision 224486)
+++ c-family/c.opt  (working copy)
@@ -1236,6 +1236,14 @@ fweak
 C++ ObjC++ Var(flag_weak) Init(1)
 Emit common-like symbols as weak symbols
 
+fweak-comdat-functions
+C++ Var(flag_weak_comdat_functions) Init(1)
+Specific to comdat functions (-fno-weak-comdat-functions : Localize Comdat
+Functions).
+With -fno-weak-comdat-functions, virtual comdat functions are still linked as
+weak functions.  With -fno-weak-comdat-functions, the address of the comdat
+functions that are localized will be unique and this can cause unintended
+behavior when addresses of comdat functions are used.
+
 fwide-exec-charset=
 C ObjC C++ ObjC++ Joined RejectNegative
 -fwide-exec-charset=cset Convert all wide strings and character 
constants to character set cset
Index: cp/decl2.c
===
--- cp/decl2.c  (revision 224486)
+++ cp/decl2.c  (working copy)
@@ -1702,8 +1702,19 @@ mark_vtable_entries (tree decl)
 void
 comdat_linkage (tree decl)
 {
-  if (flag_weak)
-    make_decl_one_only (decl, cxx_comdat_group (decl));
+  if (flag_weak
+      && (flag_weak_comdat_functions
+	  || TREE_CODE (decl) != FUNCTION_DECL
+	  || DECL_VIRTUAL_P (decl)))
+    {
+      make_decl_one_only (decl, cxx_comdat_group (decl));
+      if (TREE_CODE (decl) == FUNCTION_DECL
+	  && DECL_VIRTUAL_P (decl)
+	  && !flag_weak_comdat_functions)
+	warning_at (DECL_SOURCE_LOCATION (decl), 0,
+		    "fno-weak-comdat-functions: Comdat linkage of virtual "
+		    "function %q#D preserved.");
+    }
   else if (TREE_CODE (decl) == FUNCTION_DECL
	   || (VAR_P (decl) && DECL_ARTIFICIAL (decl)))
 /* We can just emit function and compiler-generated variables
Index: doc/invoke.texi
===
--- doc/invoke.texi (revision 224486)
+++ doc/invoke.texi (working copy)
@@ -189,7 +189,7 @@ in the following sections.
 -fno-pretty-templates @gol
 -frepo  -fno-rtti  -fstats  -ftemplate-backtrace-limit=@var{n} @gol
 -ftemplate-depth=@var{n} @gol
--fno-threadsafe-statics -fuse-cxa-atexit  -fno-weak  -nostdinc++ @gol
+-fno-threadsafe-statics -fuse-cxa-atexit  -fno-weak -fno-weak-comdat-functions 
+-nostdinc++ @gol
 

Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread H.J. Lu
On Tue, Aug 4, 2015 at 12:29 PM, Segher Boessenkool
seg...@kernel.crashing.org wrote:
 On Tue, Aug 04, 2015 at 11:50:00AM -0700, H.J. Lu wrote:
  The motivation of __builtin_stack_top is that frame_address requires a
  frame pointer register, which isn't desirable for x86.  
  __builtin_stack_top
  doesn't require a frame pointer register.
 
  If the target just returns frame_pointer_rtx from 
  INITIAL_FRAME_ADDRESS_RTX,
  you don't get crtl-accesses_prior_frames set either, and as far as I can
  see everything works fine?  For __builtin_frame_address(0).
 
  You might have a reason why you want the entry stack address instead of the
  frame address, but you didn't really explain I think?  Or I missed it.
 

 expand_builtin_return_addr sets

 crtl->accesses_prior_frames = 1;

 for __builtin_frame_address, which requires a frame pointer register.
 __builtin_stack_top doesn't set crtl->accesses_prior_frames and frame
 pointer register isn't required.

 Not if you have INITIAL_FRAME_ADDRESS_RTX.  I don't see why the generic code
 cannot just use frame_pointer_rtx (instead of hard_frame_pointer_rtx) for
 a count of 0; but making it target-specific is certainly more conservative.

 You say i386 doesn't have that target macro defined currently.  Yes I know;
 so change that?  Or change the generic code, but that is much more testing.

There is another issue with x86, maybe other targets.  You
can't get the real stack top when stack is realigned and
-maccumulate-outgoing-args isn't used since ix86_expand_prologue
will create and return another stack frame for
__builtin_frame_address and __builtin_return_address.
It will be wrong for __builtin_stack_top, which should
return the real stack address.

-- 
H.J.


Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread H.J. Lu
On Tue, Aug 4, 2015 at 1:45 PM, Segher Boessenkool
seg...@kernel.crashing.org wrote:
 On Tue, Aug 04, 2015 at 01:00:32PM -0700, H.J. Lu wrote:
 There is another issue with x86, maybe other targets.  You
 can't get the real stack top when stack is realigned and
 -maccumulate-outgoing-args isn't used since ix86_expand_prologue
 will create and return another stack frame for
 __builtin_frame_address and __builtin_return_address.
 It will be wrong for __builtin_stack_top, which should
 return the real stack address.

 That's why I asked:

   You might have a reason why you want the entry stack address instead of 
   the
   frame address, but you didn't really explain I think?  Or I missed it.

 What would a C program do with this, that it cannot do with the frame
 address, that would be useful and cannot be much better done in straight
 assembler?  Do you actually want to expose the argument pointer, maybe?


Yes, we want to use the argument pointer as shown in testcases
included in my patch.


-- 
H.J.


Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread Segher Boessenkool
On Tue, Aug 04, 2015 at 01:00:32PM -0700, H.J. Lu wrote:
 There is another issue with x86, maybe other targets.  You
 can't get the real stack top when stack is realigned and
 -maccumulate-outgoing-args isn't used since ix86_expand_prologue
 will create and return another stack frame for
 __builtin_frame_address and __builtin_return_address.
 It will be wrong for __builtin_stack_top, which should
 return the real stack address.

That's why I asked:

   You might have a reason why you want the entry stack address instead of 
   the
   frame address, but you didn't really explain I think?  Or I missed it.

What would a C program do with this, that it cannot do with the frame
address, that would be useful and cannot be much better done in straight
assembler?  Do you actually want to expose the argument pointer, maybe?


Segher


Re: [PATCH] Add __builtin_stack_top

2015-08-04 Thread Segher Boessenkool
On Tue, Aug 04, 2015 at 11:50:00AM -0700, H.J. Lu wrote:
  The motivation of __builtin_stack_top is that frame_address requires a
  frame pointer register, which isn't desirable for x86.  __builtin_stack_top
  doesn't require a frame pointer register.
 
  If the target just returns frame_pointer_rtx from INITIAL_FRAME_ADDRESS_RTX,
  you don't get crtl-accesses_prior_frames set either, and as far as I can
  see everything works fine?  For __builtin_frame_address(0).
 
  You might have a reason why you want the entry stack address instead of the
  frame address, but you didn't really explain I think?  Or I missed it.
 
 
 expand_builtin_return_addr sets
 
 crtl->accesses_prior_frames = 1;
 
 for __builtin_frame_address, which requires a frame pointer register.
 __builtin_stack_top doesn't set crtl->accesses_prior_frames and frame
 pointer register isn't required.

Not if you have INITIAL_FRAME_ADDRESS_RTX.  I don't see why the generic code
cannot just use frame_pointer_rtx (instead of hard_frame_pointer_rtx) for
a count of 0; but making it target-specific is certainly more conservative.

You say i386 doesn't have that target macro defined currently.  Yes I know;
so change that?  Or change the generic code, but that is much more testing.
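
Concretely, that amounts to something like this hypothetical i386.h
fragment (a sketch of the suggestion, not a committed patch):

  /* Let __builtin_frame_address (0) expand to the soft frame pointer,
     so no hard frame pointer is forced.  */
  #define INITIAL_FRAME_ADDRESS_RTX frame_pointer_rtx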


Segher


Re: [PING] Re: [PATCH] c/66516 - missing diagnostic on taking the address of a builtin function

2015-08-04 Thread Jeff Law

On 08/04/2015 09:04 AM, Jason Merrill wrote:


This is largely historical baggage, I think, from days where computers
had less memory and we were trying to do as much processing as possible
immediately.
Right.  Also note the early folding was from a time before we had the 
gimple optimization pipeline -- the only significant optimizations we 
did on trees was the (excessive) fold-const.c stuff.





The c++-delayed-folding branch delays folding the ?: expression until
the end of the function, at which point we can see better what context
the function is being used in, which could simplify your patch.
Right.   And that's a design direction we want to take with folding in 
general -- delay it until the transition to gimple/ssa.  That way what 
we get out of the front-ends looks much more like the original source.


jeff


Re: [RFC] [Patch]: Try and vectorize with shift for mult expr with power 2 integer constant.

2015-08-04 Thread Richard Biener
On August 4, 2015 4:28:26 PM GMT+02:00, Richard Sandiford 
richard.sandif...@arm.com wrote:
Richard Biener richard.guent...@gmail.com writes:
 On Tue, Aug 4, 2015 at 4:21 PM, Richard Biener
 richard.guent...@gmail.com wrote:
 On Tue, Aug 4, 2015 at 4:15 PM, Richard Sandiford
 richard.sandif...@arm.com wrote:
 Richard Biener richard.guent...@gmail.com writes:
 On Tue, Aug 4, 2015 at 10:52 AM, Kumar, Venkataramanan
 venkataramanan.ku...@amd.com wrote:
 Hi Jeff,

 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Jeff Law
 Sent: Monday, August 03, 2015 11:42 PM
 To: Kumar, Venkataramanan; Jakub Jelinek
 Cc: Richard Beiner (richard.guent...@gmail.com);
gcc-patches@gcc.gnu.org
 Subject: Re: [RFC] [Patch]: Try and vectorize with shift for
mult
 expr with
 power 2 integer constant.

 On 08/02/2015 05:03 AM, Kumar, Venkataramanan wrote:
  Hi Jakub,
 
  Thank you for reviewing the patch.
 
  I have incorporated your comments in the attached patch.
 Note Jakub is on PTO for the next 3 weeks.

  Thank you for this information.



 
 
 
  vectorize_mults_via_shift.diff.txt
 
 
  diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
  b/gcc/testsuite/gcc.dg/vect/vect-mult-patterns.c
 Jakub would probably like more testcases :-)

 The most obvious thing to test would be other shift factors.

 A negative test to verify we don't try to turn a multiply by
 non-constant or
 multiply by a constant that is not a power of 2 into shifts.

 I have added negative test in the attached patch.



 [ Would it make sense, for example, to turn a multiply by 3 into
a
 shift-add
 sequence?  As Jakub said, choose_mult_variant can be your
friend. ]

 Yes I will do that in a follow up patch.

 The new change log becomes

 gcc/ChangeLog
 2015-08-04  Venkataramanan Kumar  venkataramanan.ku...@amd.com
   * tree-vect-patterns.c (vect_recog_mult_pattern): New function
   for vectorizing multiplication patterns.
  * tree-vectorizer.h: Adjust the number of patterns.

 gcc/testsuite/ChangeLog
 2015-08-04  Venkataramanan Kumar  venkataramanan.ku...@amd.com
  * gcc.dg/vect/vect-mult-pattern-1.c: New
 * gcc.dg/vect/vect-mult-pattern-2.c: New

 Bootstrapped and reg tested on aarch64-unknown-linux-gnu.

 Ok for trunk ?

 +  if (TREE_CODE (oprnd0) != SSA_NAME
 +  || TREE_CODE (oprnd1) != INTEGER_CST
 +  || TREE_CODE (itype) != INTEGER_TYPE

 INTEGRAL_TYPE_P (itype)

 +  optab = optab_for_tree_code (LSHIFT_EXPR, vectype, optab_vector);
 +  if (!optab
 +  || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
 +   return NULL;
 +

 indent of the return stmt looks wrong

 +  /* Handle constant operands that are postive or negative powers of 2.  */
 +  if ( wi::exact_log2 (oprnd1) != -1  ||
 +   wi::exact_log2 (wi::neg (oprnd1)) != -1)

 no space after (, || goes to the next line.

 +{
 +  tree shift;
 +
 +  if (wi::exact_log2 (oprnd1) != -1)

 please cache wi::exact_log2

 in fact the first if () looks redundant if you simply put an else
 return NULL
 after a else if (wi::exact_log2 (wi::neg (oprnd1)) != -1)

 Note that the issue with INT_MIN is that wi::neg (INT_MIN) is INT_MIN
 again, but it seems that wi::exact_log2 returns -1 in that case so you
 are fine (and in fact not handling this case).

 Are you sure it returns -1 for INT_MIN?  It isn't supposed to, assuming
 INT_MIN is shorthand for minimum value for a signed type.  wide_ints
 aren't signed, so INT_MIN is indistinguishable from an unsigned
 1 << (prec-1).
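
A standalone illustration of that point (my example, not wide-int
internals); on a target with 32-bit int it prints 31:

  #include <stdio.h>
  #include <limits.h>

  int
  main (void)
  {
    /* The INT_MIN bit pattern is 1 << 31 for 32-bit int, i.e. an
       exact power of two when read as an unsigned value.  */
    unsigned u = (unsigned) INT_MIN;
    printf ("%d\n", __builtin_ctz (u));
    return 0;
  }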

 No, not sure.  I spotted

   /* Reject cases where there are implicit -1 blocks above HIGH.  */
   if (x.len * HOST_BITS_PER_WIDE_INT < x.precision && x.sign_mask () < 0)
     return -1;

 and thought that would catch it.  I mean the tree value is negative so
 exact_log2 must see it is a negative value.

That's handling the compressed format, e.g.:

  {1 << 63}

as a 64-bit short-hand for a 256-bit:

  {1 << 63, -1, -1, -1}

In this case more than one of the low x.precision bits are known to be
set.

So you are saying exact_log2 is really exact_log2u?

 Now re-sent with Richard's company disclaimer stripped...

Doh.  Sent via the right channels this time...

Thanks,
Richard




[gomp4] optimize launch dimensions

2015-08-04 Thread Nathan Sidwell
I've committed this to gomp4 branch.  It optimizes the new GOACC_DIM_SIZE and
GOACC_DIM_POS builtins for constant dimensions.  In addition:


*) added a target-specific dimension validation and defaulting hook.  Provided a 
ptx implementation.


*) Made GOACC_DIM_POS pure, to allow some optimization with it (but not 
migration across fork/join)


*) Delete fork/join markers on the host.

This uncovers a defect in the invocation of the device compiler.  We fail to
propagate -fno-diagnostics-show-caret and similar options to it, leading to
the two tests I altered failing with unexpected diagnostics.  I'll be fixing
that shortly.
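
As a rough illustration of the dimension optimization (my example, not
from the patch): given constant launch dimensions such as

  void
  vadd (int n, int *restrict a, int *restrict b)
  {
  #pragma acc parallel loop num_workers (1) vector_length (32)
    for (int i = 0; i < n; i++)
      a[i] += b[i];
  }

every GOACC_DIM_SIZE call for the vector axis can fold to the constant
32, and every GOACC_DIM_POS call for the worker axis to 0, since a
dimension of size 1 only has position 0.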


nathan
2015-08-04  Nathan Sidwell  nat...@codesourcery.com

	gcc/
	* doc/tm.texi.in (TARGET_GOACC_VALIDATE_DIMS): Add hook.
	* doc/tm.texi: Regenerated.
	* target.def (TARGET_GOACC): New hook prefix.
	(validate_dims): New.
	* targhooks.h (default_goacc_validate_dims): Declare.
	* internal-fn.def: Add comments.
	(GOACC_DIM_POS): Make pure.
	* config/nvptx/nvptx.c (nvptx_validate_dims): New.
	(TARGET_GOACC_VALIDATE_DIMS): Override.
	* omp-low.h (set_oacc_fn_attrib): Leave default dims as NULL.
	(oacc_xform_dim): New.
	(execute_oacc_transform): Process launch dimensions. Optimize
	DIM_SIZE and DIM_POS.  Delete FORK & JOIN on host.
	(default_oacc_validate_dims): New.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/routine-1.c: Add warning.
	* testsuite/libgomp.oacc-c-c++-common/routine-2.c: Add warning.

Index: gcc/internal-fn.def
===================================================================
--- gcc/internal-fn.def	(revision 226595)
+++ gcc/internal-fn.def	(working copy)
@@ -64,7 +64,22 @@ DEF_INTERNAL_FN (MUL_OVERFLOW, ECF_CONST
 DEF_INTERNAL_FN (TSAN_FUNC_EXIT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (VA_ARG, ECF_NOTHROW | ECF_LEAF, NULL)
 DEF_INTERNAL_FN (GOACC_DATA_END_WITH_ARG, ECF_NOTHROW, ".r")
+
+/* FORK and JOIN mark the points at which partitioned execution is
+   entered or exited.  We arrange for these two functions to be
+   unduplicable and uncombinable in order to preserve the SESE CFG
+   property of partitioned loops.  These are non-const functions to prevent
+   optimizations migrating memory accesses across a partition change
+   boundary.  They take a single INTEGER_CST
+   argument and return nothing.  */
 DEF_INTERNAL_FN (GOACC_FORK, ECF_NOTHROW | ECF_LEAF, ".")
 DEF_INTERNAL_FN (GOACC_JOIN, ECF_NOTHROW | ECF_LEAF, ".")
+
+/* DIM_SIZE and DIM_POS return the size of a particular compute
+   dimension and the executing thread's position within that
+   dimension.  DIM_POS is pure (and not const) so that it isn't
+   thought to clobber memory and can be gcse'd within a single
+   parallel region, but not across FORK/JOIN boundaries.  They take a
+   single INTEGER_CST argument.  */
 DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".")
-DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_NOTHROW | ECF_LEAF, ".")
+DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".")
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 226595)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -3524,6 +3524,57 @@ nvptx_expand_builtin (tree exp, rtx targ
   return d->expander (exp, target, mode, ignore);
 }
 
+/* Validate compute dimensions, fill in defaults.  */
+
+static tree
+nvptx_validate_dims (tree decl, tree dims)
+{
+  tree adims[GOMP_DIM_MAX];
+  unsigned ix;
+  bool changed = false;
+  tree pos = dims;
+
+  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
+{
+  adims[ix] = TREE_VALUE (pos);
+  pos = TREE_CHAIN (pos);
+}
+  /* Define vector size for known hardware.  */
+#define PTX_VECTOR_LENGTH 32
+  /* If the worker size is not 1, the vector size must be 32.  If
+ the vector size is not 1, it must be 32.  */
+  if ((adims[GOMP_DIM_WORKER]
+       && TREE_INT_CST_LOW (adims[GOMP_DIM_WORKER]) != 1)
+      || (adims[GOMP_DIM_VECTOR]
+	  && TREE_INT_CST_LOW (adims[GOMP_DIM_VECTOR]) != 1))
+{
+  if (!adims[GOMP_DIM_VECTOR]
+	  || TREE_INT_CST_LOW (adims[GOMP_DIM_VECTOR]) != PTX_VECTOR_LENGTH)
+	{
+	  tree use = build_int_cst (integer_type_node, PTX_VECTOR_LENGTH);
+	  if (adims[GOMP_DIM_VECTOR])
+	warning_at (DECL_SOURCE_LOCATION (decl), 0,
+			TREE_INT_CST_LOW (adims[GOMP_DIM_VECTOR])
+			? "using vector_length (%E), ignoring %E"
+			: "using vector_length (%E), ignoring runtime setting",
+			use, adims[GOMP_DIM_VECTOR]);
+	  adims[GOMP_DIM_VECTOR] = use;
+	}
+}
+
+  /* Set defaults.  */
+  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
+if (!adims[ix])
+  adims[ix] = integer_one_node;
+
+  /* Write results.  */
+  pos = dims;
+  for (ix = 0; ix != GOMP_DIM_MAX; ix++, pos = TREE_CHAIN (pos))
+TREE_VALUE (pos) = adims[ix];
+  
+  return dims;
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -3618,6 +3669,9 @@ nvptx_expand_builtin (tree exp, rtx targ
 #undef  TARGET_BUILTIN_DECL
 #define 

Go patch committed: Verify pointer's underlying type

2015-08-04 Thread Ian Lance Taylor
This patch from Chris Manghane ensures that when verifying a pointer
type, we verify the underlying type.  This fixes
https://golang.org/issue/11547 .  Bootstrapped and ran Go testsuite on
x86_64-unknown-linux-gnu.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===================================================================
--- gcc/go/gofrontend/MERGE (revision 226596)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-df080adb06f0e423820f3f6b9604b0c1093ff20a
+6fb7c3509a4eda7d2403900981b53029d6727037
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: gcc/go/gofrontend/types.h
===================================================================
--- gcc/go/gofrontend/types.h   (revision 226510)
+++ gcc/go/gofrontend/types.h   (working copy)
@@ -2033,6 +2033,10 @@ class Pointer_type : public Type
   do_traverse(Traverse*);
 
   bool
+  do_verify()
  { return this->to_type_->verify(); }
+
+  bool
   do_has_pointer() const
   { return true; }