[PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Tue, Dec 31, 2013 at 11:30:12AM +0100, Richard Biener wrote:
 Meanwhile your patch is ok.

As discussed in the PR, the patch wasn't sufficient, __cxa_pure_virtual
can appear in the vtable and it has pretty much the same properties
as __builtin_unreachable (void return value, no arguments, noreturn,
DCE or cfg cleanup being able to remove anything after it because of
noreturn).

Additionally, the patch attempts to punt on invalid type changes
(ODR violations?, apparently none appear during bootstrap/regtest as
verified by additional logging added there) or inplace changes of
gimple_call_flags that would require some cleanups caller isn't expected to
do (again, doesn't happen during bootstrap/regtest).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  ja...@redhat.com

PR tree-optimization/59622
* gimple-fold.c: Include calls.h.
(gimple_fold_call): Fix a typo in message.  Handle
__cxa_pure_virtual similarly to __builtin_unreachable.  For
inplace punt if gimple_call_flags would change.  Verify
that lhs and argument types are compatible.

* g++.dg/opt/pr59622-2.C: New test.

--- gcc/gimple-fold.c.jj2013-12-31 12:56:59.0 +0100
+++ gcc/gimple-fold.c   2014-01-02 18:33:51.207921734 +0100
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.
 #include gimple-pretty-print.h
 #include tree-ssa-address.h
 #include langhooks.h
+#include calls.h
 
 /* Return true when DECL can be referenced from current unit.
FROM_DECL (if non-null) specify constructor of variable DECL was taken from.
@@ -1167,7 +1168,7 @@ gimple_fold_call (gimple_stmt_iterator *
  (OBJ_TYPE_REF_EXPR 
(callee)
{
  fprintf (dump_file,
-  Type inheritnace inconsistent devirtualization of );
+  Type inheritance inconsistent devirtualization of );
  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
  fprintf (dump_file,  to );
  print_generic_expr (dump_file, callee, TDF_SLIM);
@@ -1184,18 +1185,74 @@ gimple_fold_call (gimple_stmt_iterator *
= possible_polymorphic_call_targets (callee, final);
  if (final  targets.length () = 1)
{
+ tree fndecl;
+ int call_flags = gimple_call_flags (stmt), new_flags;
  if (targets.length () == 1)
+   fndecl = targets[0]-decl;
+ else
+   fndecl = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
+ new_flags = flags_from_decl_or_type (fndecl);
+ if (inplace)
{
- gimple_call_set_fndecl (stmt, targets[0]-decl);
- changed = true;
+ /* For inplace, avoid changes of call flags that
+might require cfg cleanups, EH cleanups, removal
+of vdef/vuses etc.  */
+ if ((call_flags ^ new_flags)
+  (ECF_NORETURN | ECF_NOTHROW | ECF_NOVOPS))
+   final = false;
+ else if ((call_flags  ECF_NOVOPS) == 0)
+   {
+ if ((call_flags ^ new_flags)  ECF_CONST)
+   final = false;
+ else if (((call_flags  (ECF_PURE | ECF_CONST
+  | ECF_NORETURN)) == 0)
+  ^ ((new_flags  (ECF_PURE | ECF_CONST
+   | ECF_NORETURN)) == 0))
+   final = false;
+   }
}
- else if (!inplace)
+ if (final)
{
- tree fndecl = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
- gimple new_stmt = gimple_build_call (fndecl, 0);
- gimple_set_location (new_stmt, gimple_location (stmt));
- gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
- return true;
+ tree fntype = gimple_call_fntype (stmt);
+ tree dtype = TREE_TYPE (fndecl);
+ if (gimple_call_lhs (stmt)
+  !useless_type_conversion_p (TREE_TYPE (fntype),
+TREE_TYPE (dtype)))
+   final = false;
+ else
+   {
+ tree t1, t2;
+ for (t1 = TYPE_ARG_TYPES (fntype),
+  t2 = TYPE_ARG_TYPES (dtype);
+  t1  t2;
+  t1 = TREE_CHAIN (t1), t2 = TREE_CHAIN (t2))
+   if (!useless_type_conversion_p (TREE_VALUE (t2),
+   TREE_VALUE (t1)))
+ break;
+ if (t1 || t2)
+   final = false;
+   }
+
+ /* 

[PATCH] Don't count asm goto for 4 branch limit optimization (PR target/59625)

2014-01-03 Thread Jakub Jelinek
Hi!

Linus complained about padding inserted in between asm goto.  In 4.8
even -mtune=generic performs ix86_avoid_jump_mispredicts, in 4.9 only e.g.
-mtune=atom and others, but not generic.
The issue is that assuming every asm goto must contain a jump is too
pessimistic, often asm goto (may) transfer control to the label(s) through
other means (e.g. on the side data structure with labels etc.), plus we
conservatively assume asm goto is 0 bytes long, so we were adding the
paddings pretty much always when multiple asm goto stmts appeared
consecutively.

This patch just ignores asm goto from the optimization.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 4.8
after a while?

2014-01-03  Jakub Jelinek  ja...@redhat.com

PR target/59625
* config/i386/i386.c (ix86_avoid_jump_mispredicts): Don't consider
asm goto as jump.

* gcc.target/i386/pr59625.c: New test.

--- gcc/config/i386/i386.c.jj   2014-01-01 00:45:15.0 +0100
+++ gcc/config/i386/i386.c  2014-01-02 14:44:07.980841529 +0100
@@ -38852,7 +38852,8 @@ ix86_avoid_jump_mispredicts (void)
  while (nbytes + max_skip = 16)
{
  start = NEXT_INSN (start);
- if (JUMP_P (start) || CALL_P (start))
+ if ((JUMP_P (start)  asm_noperands (PATTERN (start))  0)
+ || CALL_P (start))
njumps--, isjump = 1;
  else
isjump = 0;
@@ -38867,7 +38868,8 @@ ix86_avoid_jump_mispredicts (void)
   if (dump_file)
fprintf (dump_file, Insn %i estimated to %i bytes\n,
 INSN_UID (insn), min_size);
-  if (JUMP_P (insn) || CALL_P (insn))
+  if ((JUMP_P (insn)  asm_noperands (PATTERN (insn))  0)
+ || CALL_P (insn))
njumps++;
   else
continue;
@@ -38875,7 +38877,8 @@ ix86_avoid_jump_mispredicts (void)
   while (njumps  3)
{
  start = NEXT_INSN (start);
- if (JUMP_P (start) || CALL_P (start))
+ if ((JUMP_P (start)  asm_noperands (PATTERN (start))  0)
+ || CALL_P (start))
njumps--, isjump = 1;
  else
isjump = 0;
--- gcc/testsuite/gcc.target/i386/pr59625.c.jj  2014-01-02 17:17:45.611680569 
+0100
+++ gcc/testsuite/gcc.target/i386/pr59625.c 2014-01-02 17:17:37.0 
+0100
@@ -0,0 +1,36 @@
+/* PR target/59625 */
+/* { dg-do compile } */
+/* { dg-options -O2 -mtune=atom } */
+
+int
+foo (void)
+{
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  asm goto ( : : : : lab);
+  return 0;
+lab:
+  return 1;
+}
+
+/* Verify we don't consider asm goto as a jump for four jumps limit
+   optimization.  asm goto doesn't have to contain a jump at all,
+   the branching to labels can happen through different means.  */
+/* { dg-final { scan-assembler-not 
p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align
 } } */

Jakub


[PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Jakub Jelinek
Hi!

I've noticed that especially with the AVX512F introduction we use
GET_MODE_SIZE (MODEmode) quite heavily in the i386 *.md files, and
the problem with that is GET_MODE_SIZE isn't a compile time constant,
needs to read mode_size array (non-const) at runtime.

This patch attempts to decrease the enormous .text/.rodata growth from AVX512F
introduction a little bit and improve generated code, by making
GET_MODE_SIZE (MODEmode) compile time constants, thus all the
GET_MODE_SIZE (MODEmode) == 64 and similar tests can fold into
false/true.  The patch saves ~ 110KB of .text (both x86_64 and i686)
and ~ 50KB of .rodata (x86_64) or ~ 27KB of .rodata (i686).

The disadvantage is that the mode attribute duplicates insn-modes.c,
but I guess the listed modes aren't going to change their sizes ever
on this target, and if some mode isn't listed and used in some mode
iterator, one would get syntax errors from insn-*.[ch] and so it wouldn't be
hard to add it.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  ja...@redhat.com

* config/i386/i386.md (MODE_SIZE): New mode attribute.
(push splitter): Use P:MODE_SIZE instead of
GET_MODE_SIZE (P:MODEmode).
(lea splitter): Use MODE_SIZE instead of GET_MODE_SIZE (MODEmode).
(mov -1, reg peephole2): Likewise.
* config/i386/sse.md (*movmode_internal,
sse_storeussemodesuffixavxsizesuffix,
sse2_avx_avx512f_storedqumode, sse_andnotmode3,
*codemode3, *andnotmode3mask_name,
mask_codeforcodemode3mask_name): Likewise.
* config/i386/subst.md (mask_mode512bit_condition,
sd_mask_mode512bit_condition): Likewise.

--- gcc/config/i386/i386.md.jj  2013-12-27 19:24:33.0 +0100
+++ gcc/config/i386/i386.md 2014-01-02 20:08:16.911851795 +0100
@@ -914,6 +914,20 @@ (define_mode_iterator SWIM248 [(HI TARG
 (define_mode_iterator DWI [(DI !TARGET_64BIT)
   (TI TARGET_64BIT)])
 
+;; GET_MODE_SIZE for selected modes.  As GET_MODE_SIZE is not
+;; compile time constant, it is faster to use MODE_SIZE than
+;; GET_MODE_SIZE (MODEmode).  For XFmode which depends on
+;; command line options just use GET_MODE_SIZE macro.
+(define_mode_attr MODE_SIZE [(QI 1) (HI 2) (SI 4) (DI 8) (TI 16)
+(SF 4) (DF 8) (XF GET_MODE_SIZE (XFmode))
+(V16QI 16) (V32QI 32) (V64QI 64)
+(V8HI 16) (V16HI 32) (V32HI 64)
+(V4SI 16) (V8SI 32) (V16SI 64)
+(V2DI 16) (V4DI 32) (V8DI 64)
+(V1TI 16) (V2TI 32) (V4TI 64)
+(V2DF 16) (V4DF 32) (V8DF 64)
+(V4SF 16) (V8SF 32) (V16SF 64)])
+
 ;; Double word integer modes as mode attribute.
 (define_mode_attr DWI [(QI HI) (HI SI) (SI DI) (DI TI)])
 (define_mode_attr dwi [(QI hi) (HI si) (SI di) (DI ti)])
@@ -2734,7 +2748,7 @@ (define_split
   reload_completed
   [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
(set (mem:SF (reg:P SP_REG)) (match_dup 1))]
-  operands[2] = GEN_INT (-GET_MODE_SIZE (P:MODEmode));)
+  operands[2] = GEN_INT (-P:MODE_SIZE);)
 
 (define_split
   [(set (match_operand:SF 0 push_operand)
@@ -5770,7 +5784,7 @@ (define_split
   enum machine_mode mode = MODEmode;
   rtx pat;
 
-  if (GET_MODE_SIZE (mode)  GET_MODE_SIZE (SImode))
+  if (MODE_SIZE  GET_MODE_SIZE (SImode))
 { 
   mode = SImode; 
   operands[0] = gen_lowpart (mode, operands[0]);
@@ -17403,7 +17417,7 @@ (define_peephole2
   [(parallel [(set (match_dup 0) (const_int -1))
  (clobber (reg:CC FLAGS_REG))])]
 {
-  if (GET_MODE_SIZE (MODEmode)  GET_MODE_SIZE (SImode))
+  if (MODE_SIZE  GET_MODE_SIZE (SImode))
 operands[0] = gen_lowpart (SImode, operands[0]);
 })
 
--- gcc/config/i386/sse.md.jj   2014-01-01 00:52:56.0 +0100
+++ gcc/config/i386/sse.md  2014-01-02 20:11:49.723916127 +0100
@@ -669,7 +669,7 @@ (define_insn *movmode_internal
   /* There is no evex-encoded vmov* for sizes smaller than 64-bytes
 in avx512f, so we need to use workarounds, to access sse registers
 16-31, which are evex-only.  */
-  if (TARGET_AVX512F  GET_MODE_SIZE (MODEmode)  64
+  if (TARGET_AVX512F  MODE_SIZE  64
   ((REG_P (operands[0])
EXT_REX_SSE_REGNO_P (REGNO (operands[0])))
  || (REG_P (operands[1])
@@ -677,18 +677,18 @@ (define_insn *movmode_internal
{
  if (memory_operand (operands[0], MODEmode))
{
- if (GET_MODE_SIZE (MODEmode) == 32)
+ if (MODE_SIZE == 32)
return vextractshuffletype64x4\t{$0x0, %g1, %0|%0, %g1, 
0x0};
- else if (GET_MODE_SIZE (MODEmode) == 16)
+ else if (MODE_SIZE == 16)
return vextractshuffletype32x4\t{$0x0, %g1, %0|%0, %g1, 
0x0};
  else
gcc_unreachable ();
}
 

[PATCH] i?86 unaligned/aligned load improvement for AVX512F

2014-01-03 Thread Jakub Jelinek
Hi!

This is an attempt to port my recent
http://gcc.gnu.org/viewcvs?rev=204219root=gccview=rev
http://gcc.gnu.org/viewcvs?rev=205663root=gccview=rev
http://gcc.gnu.org/viewcvs?rev=206090root=gccview=rev
changes also to AVX512F.  The motivation is to get:

#include immintrin.h

__m512i
foo (void *x, void *y)
{
  __m512i a = _mm512_loadu_si512 (x);
  __m512i b = _mm512_loadu_si512 (y);
  return _mm512_add_epi32 (a, b);
}

use one of the unaligned memories directly as operand to the vpaddd
instruction.  The first hunk is needed so that we don't regress on say:

#include immintrin.h

__m512i z;

__m512i
foo (void *x, void *y, int k)
{
  __m512i a = _mm512_mask_loadu_epi32 (z, k, x);
  __m512i b = _mm512_mask_loadu_epi32 (z, k, y);
  return _mm512_add_epi32 (a, b);
}

__m512i
bar (void *x, void *y, int k)
{
  __m512i a = _mm512_maskz_loadu_epi32 (k, x);
  __m512i b = _mm512_maskz_loadu_epi32 (k, y);
  return _mm512_add_epi32 (a, b);
}

Does it matter which of vmovdqu32 vs. vmovdqu64 is used if no
masking/zeroing is performed (i.e. vmovdqu32 (%rax), %zmm0 vs.
vmovdqu64 (%rax), %zmm0) for performance reasons (i.e. isn't there some
reinterpretation penalty)?

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  ja...@redhat.com

* config/i386/sse.md (avx512f_loadmode_mask): Emit vmovup{s,d}
or vmovdqu* for misaligned_operand.
(sse_loadussemodesuffixavxsizesuffixmask_name,
sse2_avx_avx512f_loaddqumodemask_name): Handle mask_applied.
* config/i386/i386.c (ix86_expand_special_args_builtin): Set
aligned_mem for AVX512F masked aligned load and store builtins and for
non-temporal moves.

* gcc.target/i386/avx512f-vmovdqu32-1.c: Allow vmovdqu64 instead of
vmovdqu32.

--- gcc/config/i386/sse.md.jj   2014-01-02 20:11:49.0 +0100
+++ gcc/config/i386/sse.md  2014-01-02 21:59:06.706161064 +0100
@@ -786,8 +786,12 @@ (define_insn avx512f_loadmode_mask
 {
 case MODE_V8DF:
 case MODE_V16SF:
+  if (misaligned_operand (operands[1], MODEmode))
+   return vmovussemodesuffix\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1};
   return vmovassemodesuffix\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1};
 default:
+  if (misaligned_operand (operands[1], MODEmode))
+   return vmovdqussescalarsize\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1};
   return vmovdqassescalarsize\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1};
 }
 }
@@ -936,11 +940,14 @@ (define_expand sse_loadussemodesuffi
  false, still emit UNSPEC_LOADU insn to honor user's request for
  misaligned load.  */
   if (TARGET_AVX
-   misaligned_operand (operands[1], MODEmode)
-  /* FIXME: Revisit after AVX512F merge is completed.  */
-   !mask_applied)
+   misaligned_operand (operands[1], MODEmode))
 {
-  emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1]));
+  rtx src = operands[1];
+  if (mask_applied)
+   src = gen_rtx_VEC_MERGE (MODEmode, operands[1],
+operands[2 * mask_applied],
+operands[3 * mask_applied]);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], src));
   DONE;
 }
 })
@@ -1046,11 +1053,14 @@ (define_expand sse2_avx_avx512f_loadd
  false, still emit UNSPEC_LOADU insn to honor user's request for
  misaligned load.  */
   if (TARGET_AVX
-   misaligned_operand (operands[1], MODEmode)
-  /* FIXME: Revisit after AVX512F merge is completed.  */
-   !mask_applied)
+   misaligned_operand (operands[1], MODEmode))
 {
-  emit_insn (gen_rtx_SET (VOIDmode, operands[0], operands[1]));
+  rtx src = operands[1];
+  if (mask_applied)
+   src = gen_rtx_VEC_MERGE (MODEmode, operands[1],
+operands[2 * mask_applied],
+operands[3 * mask_applied]);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], src));
   DONE;
 }
 })
--- gcc/config/i386/i386.c.jj   2014-01-02 14:44:07.0 +0100
+++ gcc/config/i386/i386.c  2014-01-02 21:48:23.204400654 +0100
@@ -34407,6 +34408,9 @@ ix86_expand_special_args_builtin (const
case CODE_FOR_sse2_movntidi:
case CODE_FOR_sse_movntq:
case CODE_FOR_sse2_movntisi:
+   case CODE_FOR_avx512f_movntv16sf:
+   case CODE_FOR_avx512f_movntv8df:
+   case CODE_FOR_avx512f_movntv8di:
  aligned_mem = true;
  break;
default:
@@ -34431,6 +34435,24 @@ ix86_expand_special_args_builtin (const
   klass = load;
   memory = 0;
   break;
+case VOID_FTYPE_PV8DF_V8DF_QI:
+case VOID_FTYPE_PV16SF_V16SF_HI:
+case VOID_FTYPE_PV8DI_V8DI_QI:
+case VOID_FTYPE_PV16SI_V16SI_HI:
+  switch (icode)
+   {
+   /* These builtins and instructions require the memory
+  to be properly aligned.  */
+   case CODE_FOR_avx512f_storev16sf_mask:
+   case CODE_FOR_avx512f_storev16si_mask:
+   case 

Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 9:48 AM, Jakub Jelinek ja...@redhat.com wrote:

 I've noticed that especially with the AVX512F introduction we use
 GET_MODE_SIZE (MODEmode) quite heavily in the i386 *.md files, and
 the problem with that is GET_MODE_SIZE isn't a compile time constant,
 needs to read mode_size array (non-const) at runtime.

 This patch attempts to decrease the enormous .text/.rodata growth from AVX512F
 introduction a little bit and improve generated code, by making
 GET_MODE_SIZE (MODEmode) compile time constants, thus all the
 GET_MODE_SIZE (MODEmode) == 64 and similar tests can fold into
 false/true.  The patch saves ~ 110KB of .text (both x86_64 and i686)
 and ~ 50KB of .rodata (x86_64) or ~ 27KB of .rodata (i686).

 The disadvantage is that the mode attribute duplicates insn-modes.c,
 but I guess the listed modes aren't going to change their sizes ever
 on this target, and if some mode isn't listed and used in some mode
 iterator, one would get syntax errors from insn-*.[ch] and so it wouldn't be
 hard to add it.

This should be tolerable, at least for gains, shown above. OTOH, we
already have some attributes (e.g. ssescalarnum, ssescalarsize) that
use the same approach and the sky didn't fall down.

 Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

 2014-01-03  Jakub Jelinek  ja...@redhat.com

 * config/i386/i386.md (MODE_SIZE): New mode attribute.
 (push splitter): Use P:MODE_SIZE instead of
 GET_MODE_SIZE (P:MODEmode).
 (lea splitter): Use MODE_SIZE instead of GET_MODE_SIZE (MODEmode).
 (mov -1, reg peephole2): Likewise.
 * config/i386/sse.md (*movmode_internal,
 sse_storeussemodesuffixavxsizesuffix,
 sse2_avx_avx512f_storedqumode, sse_andnotmode3,
 *codemode3, *andnotmode3mask_name,
 mask_codeforcodemode3mask_name): Likewise.
 * config/i386/subst.md (mask_mode512bit_condition,
 sd_mask_mode512bit_condition): Likewise.

OK for mainline.

Thanks,
Uros.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
Jakub Jelinek ja...@redhat.com wrote:
On Tue, Dec 31, 2013 at 11:30:12AM +0100, Richard Biener wrote:
 Meanwhile your patch is ok.

As discussed in the PR, the patch wasn't sufficient, __cxa_pure_virtual
can appear in the vtable and it has pretty much the same properties
as __builtin_unreachable (void return value, no arguments, noreturn,
DCE or cfg cleanup being able to remove anything after it because of
noreturn).

Additionally, the patch attempts to punt on invalid type changes
(ODR violations?, apparently none appear during bootstrap/regtest as
verified by additional logging added there) or inplace changes of
gimple_call_flags that would require some cleanups caller isn't
expected to
do (again, doesn't happen during bootstrap/regtest).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

I'd rather do nothing in the !inplace case, it's rare enough to not care.

Also there is nothing wrong with wrong types et al - we have a perfect 
mechanism for dealing with this. Change the fndecl but not the fntype.

I'd like to see the code simplified with keeping the above in mind.

Thanks,
Richard.

2014-01-03  Jakub Jelinek  ja...@redhat.com

   PR tree-optimization/59622
   * gimple-fold.c: Include calls.h.
   (gimple_fold_call): Fix a typo in message.  Handle
   __cxa_pure_virtual similarly to __builtin_unreachable.  For
   inplace punt if gimple_call_flags would change.  Verify
   that lhs and argument types are compatible.

   * g++.dg/opt/pr59622-2.C: New test.

--- gcc/gimple-fold.c.jj   2013-12-31 12:56:59.0 +0100
+++ gcc/gimple-fold.c  2014-01-02 18:33:51.207921734 +0100
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.
 #include gimple-pretty-print.h
 #include tree-ssa-address.h
 #include langhooks.h
+#include calls.h
 
 /* Return true when DECL can be referenced from current unit.
FROM_DECL (if non-null) specify constructor of variable DECL was taken
from.
@@ -1167,7 +1168,7 @@ gimple_fold_call (gimple_stmt_iterator *
(OBJ_TYPE_REF_EXPR (callee)
   {
 fprintf (dump_file,
- Type inheritnace inconsistent devirtualization of );
+ Type inheritance inconsistent devirtualization of );
 print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
 fprintf (dump_file,  to );
 print_generic_expr (dump_file, callee, TDF_SLIM);
@@ -1184,18 +1185,74 @@ gimple_fold_call (gimple_stmt_iterator *
   = possible_polymorphic_call_targets (callee, final);
 if (final  targets.length () = 1)
   {
+tree fndecl;
+int call_flags = gimple_call_flags (stmt), new_flags;
 if (targets.length () == 1)
+  fndecl = targets[0]-decl;
+else
+  fndecl = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
+new_flags = flags_from_decl_or_type (fndecl);
+if (inplace)
   {
-gimple_call_set_fndecl (stmt, targets[0]-decl);
-changed = true;
+/* For inplace, avoid changes of call flags that
+   might require cfg cleanups, EH cleanups, removal
+   of vdef/vuses etc.  */
+if ((call_flags ^ new_flags)
+ (ECF_NORETURN | ECF_NOTHROW | ECF_NOVOPS))
+  final = false;
+else if ((call_flags  ECF_NOVOPS) == 0)
+  {
+if ((call_flags ^ new_flags)  ECF_CONST)
+  final = false;
+else if (((call_flags  (ECF_PURE | ECF_CONST
+ | ECF_NORETURN)) == 0)
+ ^ ((new_flags  (ECF_PURE | ECF_CONST
+  | ECF_NORETURN)) == 0))
+  final = false;
+  }
   }
-else if (!inplace)
+if (final)
   {
-tree fndecl = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
-gimple new_stmt = gimple_build_call (fndecl, 0);
-gimple_set_location (new_stmt, gimple_location (stmt));
-gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
-return true;
+tree fntype = gimple_call_fntype (stmt);
+tree dtype = TREE_TYPE (fndecl);
+if (gimple_call_lhs (stmt)
+ !useless_type_conversion_p (TREE_TYPE (fntype),
+   TREE_TYPE (dtype)))
+  final = false;
+else
+  {
+tree t1, t2;
+for (t1 = TYPE_ARG_TYPES (fntype),
+ t2 = TYPE_ARG_TYPES (dtype);
+ t1  t2;
+ t1 = TREE_CHAIN (t1), t2 = TREE_CHAIN (t2))
+  if 

Re: [Patch, Fortran] PR 59023: [4.9 regression] ICE in gfc_search_interface with BIND(C)

2014-01-03 Thread Janus Weil
In addition this patch fixes PR 59662.

Also: Ping!

Cheers,
Janus



2013/12/31 Janus Weil ja...@gcc.gnu.org:
 Hi all,

 ... and the reg-bashing continues: Here is a small patch to fix a
 bind(c) problem. It essentially prevents 'resolve_global_procedure' to
 be applied to bind(c) procedures.

 Regtested on x86_64-unknown-linux-gnu. Ok for trunk?

 Cheers,
 Janus



 2013-12-31  Janus Weil  ja...@gcc.gnu.org

 PR fortran/59023
 * resolve.c (resolve_global_procedure): Don't apply to c-binding
 procedures.
 (gfc_verify_binding_labels): Remove duplicate line.

 2013-12-31  Janus Weil  ja...@gcc.gnu.org

 PR fortran/59023
 * gfortran.dg/bind_c_procs_2.f90: New.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 10:26:44AM +0100, Richard Biener wrote:
 Jakub Jelinek ja...@redhat.com wrote:
 On Tue, Dec 31, 2013 at 11:30:12AM +0100, Richard Biener wrote:
  Meanwhile your patch is ok.
 
 As discussed in the PR, the patch wasn't sufficient, __cxa_pure_virtual
 can appear in the vtable and it has pretty much the same properties
 as __builtin_unreachable (void return value, no arguments, noreturn,
 DCE or cfg cleanup being able to remove anything after it because of
 noreturn).
 
 Additionally, the patch attempts to punt on invalid type changes
 (ODR violations?, apparently none appear during bootstrap/regtest as
 verified by additional logging added there) or inplace changes of
 gimple_call_flags that would require some cleanups caller isn't
 expected to
 do (again, doesn't happen during bootstrap/regtest).
 
 Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
 
 I'd rather do nothing in the !inplace case, it's rare enough to not care.

Ok.

 Also there is nothing wrong with wrong types et al - we have a perfect 
 mechanism for dealing with this. Change the fndecl but not the fntype.

Well, see PR59630.  The question is if having to handle it everywhere
is worth it.

Jakub


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
On Fri, 3 Jan 2014, Jakub Jelinek wrote:

 On Fri, Jan 03, 2014 at 10:26:44AM +0100, Richard Biener wrote:
  Jakub Jelinek ja...@redhat.com wrote:
  On Tue, Dec 31, 2013 at 11:30:12AM +0100, Richard Biener wrote:
   Meanwhile your patch is ok.
  
  As discussed in the PR, the patch wasn't sufficient, __cxa_pure_virtual
  can appear in the vtable and it has pretty much the same properties
  as __builtin_unreachable (void return value, no arguments, noreturn,
  DCE or cfg cleanup being able to remove anything after it because of
  noreturn).
  
  Additionally, the patch attempts to punt on invalid type changes
  (ODR violations?, apparently none appear during bootstrap/regtest as
  verified by additional logging added there) or inplace changes of
  gimple_call_flags that would require some cleanups caller isn't
  expected to
  do (again, doesn't happen during bootstrap/regtest).
  
  Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
  
  I'd rather do nothing in the !inplace case, it's rare enough to not care.
 
 Ok.
 
  Also there is nothing wrong with wrong types et al - we have a perfect 
  mechanism for dealing with this. Change the fndecl but not the fntype.
 
 Well, see PR59630.  The question is if having to handle it everywhere
 is worth it.

Well, this case happens because we go back to GENERIC which doesn't
have this feature.  So "everywhere" is somewhat a broad stmt.
It's easy to guard the builtins.c folding with a compatibility
check of fntype and fndecl type.

Richard.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 11:01:13AM +0100, Richard Biener wrote:
  Well, see PR59630.  The question is if having to handle it everywhere
  is worth it.
 
 Well, this case happens because we go back to GENERIC which doesn't
 have this feature.  So "everywhere" is somewhat a broad stmt.
 It's easy to guard the builtins.c folding with a compatibility
 check of fntype and fndecl type.

Well, clearly the inliner has similar issue, I doubt e.g. IPA cloning
handles it right, there is just one gimple_call_fntype call in all of ipa*.c
(which surprises me) and that ignores it if there is a decl:
  tree type = (e-callee
   ? TREE_TYPE (e-callee-decl)
   : gimple_call_fntype (e-call_stmt));
so if there is a mismatch between TREE_TYPE (e-callee-decl) and
gimple_call_fntype, it will happily look at the decl type.
So I'd say it is just a matter of adding more (invalid) testcases.

Jakub


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
On Fri, 3 Jan 2014, Jakub Jelinek wrote:

 On Fri, Jan 03, 2014 at 11:01:13AM +0100, Richard Biener wrote:
   Well, see PR59630.  The question is if having to handle it everywhere
   is worth it.
  
  Well, this case happens because we go back to GENERIC which doesn't
  have this feature.  So "everywhere" is somewhat a broad stmt.
  It's easy to guard the builtins.c folding with a compatibility
  check of fntype and fndecl type.
 
 Well, clearly the inliner has similar issue, I doubt e.g. IPA cloning
 handles it right, there is just one gimple_call_fntype call in all of ipa*.c
 (which surprises me) and that ignores it if there is a decl:
   tree type = (e-callee
? TREE_TYPE (e-callee-decl)
: gimple_call_fntype (e-call_stmt));
 so if there is a mismatch between TREE_TYPE (e-callee-decl) and
 gimple_call_fntype, it will happily look at the decl type.
 So I'd say it is just a matter of adding more (invalid) testcases.

Of course we have to fix al the bugs.  The inliner has a
compatibility check in some of it's can-inline predicates.
ipa_get_callee_param_type looks correct if it really wants
to get at the param type of the _callee_ (not the type
the caller uses!)

static tree 
ipa_get_callee_param_type (struct cgraph_edge *e, int i)
{   
  int n;
  tree type = (e-callee
   ? TREE_TYPE (e-callee-decl)
   : gimple_call_fntype (e-call_stmt));
  tree t = TYPE_ARG_TYPES (type);  

so if there is a decl then use its type signature, otherwise
(indirect calls) use the caller signature (and hope it matches
the callee...).  That it later falls back to looking at
DECL_ARGUMENTS is odd (probably a FE issue where we have a
fndecl with a bogus type?)

Richard.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 11:15:32AM +0100, Richard Biener wrote:
 so if there is a decl then use its type signature, otherwise
 (indirect calls) use the caller signature (and hope it matches
 the callee...).  That it later falls back to looking at
 DECL_ARGUMENTS is odd (probably a FE issue where we have a
 fndecl with a bogus type?)

For KR C non-prototyped functions
foo (x, y)
  int x, y;
{
  return x + y;
}
I think TYPE_ARG_TYPES is NULL (?) and DECL_ARGUMENTS is the only way to get
at the types.

Jakub


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
On Fri, 3 Jan 2014, Jakub Jelinek wrote:

 On Fri, Jan 03, 2014 at 11:15:32AM +0100, Richard Biener wrote:
  so if there is a decl then use its type signature, otherwise
  (indirect calls) use the caller signature (and hope it matches
  the callee...).  That it later falls back to looking at
  DECL_ARGUMENTS is odd (probably a FE issue where we have a
  fndecl with a bogus type?)
 
 For KR C non-prototyped functions
 foo (x, y)
   int x, y;
 {
   return x + y;
 }
 I think TYPE_ARG_TYPES is NULL (?) and DECL_ARGUMENTS is the only way to get
 at the types.

Ah, indeed.  A C speciality that shouldn't survive GENERICization if
that's really the case.

I wonder how

int main()
{
  return (*(foo))(0, 0);
}

works though.  Or

 __auto_type x = foo;

(substitute for that new C __auto thing).

The former probably because we fold away the indirection very early.

Richard.


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Richard Biener
On Fri, 3 Jan 2014, Richard Biener wrote:

 On Fri, 3 Jan 2014, Jakub Jelinek wrote:
 
  On Fri, Jan 03, 2014 at 11:15:32AM +0100, Richard Biener wrote:
   so if there is a decl then use its type signature, otherwise
   (indirect calls) use the caller signature (and hope it matches
   the callee...).  That it later falls back to looking at
   DECL_ARGUMENTS is odd (probably a FE issue where we have a
   fndecl with a bogus type?)
  
  For KR C non-prototyped functions
  foo (x, y)
int x, y;
  {
return x + y;
  }
  I think TYPE_ARG_TYPES is NULL (?) and DECL_ARGUMENTS is the only way to get
  at the types.
 
 Ah, indeed.  A C speciality that shouldn't survive GENERICization if
 that's really the case.
 
 I wonder how
 
 int main()
 {
   return (*(foo))(0, 0);
 }
 
 works though.  Or
 
  __auto_type x = foo;
 
 (substitute for that new C __auto thing).
 
 The former probably because we fold away the indirection very early.

foo (x, y)
   int x, y;
{
return x + y;
}

int main()
{
  __auto_type f = foo;
  return f(0,0);
}

works just fine for me but shows

  int (*T391) () f;   
  int _4;   

  bb 2:   
  f_1 = foo;
  _4 = f_1 (0, 0);

so I suppose the fntype is not NULL but int (*)().

Richard.


[PATCH] Fix slpeel_update_phi_nodes_for_guard1 ICE (PR tree-optimization/59519)

2014-01-03 Thread Jakub Jelinek
Hi!

Since the r205959 SCEV changes for peeled chrec, apparently we can end up
with multiple PHIs on the to be vectorized loop that have the same arguments
(both on preheader and latch edges).  slpeel_update_phi_nodes_for_guard1
doesn't like that, it asserts that for the argument from the latch edge
we set_current_def only once, when it is shared by more than one PHI,
we would actually be trying to set it more than once.

What we create is just multiple PHIs on the *new_exit_bb that look like
_NN = PHI <loop_arg(X)>
but as they necessarily have the same value, IMHO it shouldn't matter
which one of those we record through set_current_def.

I've tried to trace where we'd call get_current_def on the loop_arg
besides the get_current_def call changed in the patch, but saw none even
on
struct S { int f0; } d;
int a[8] = { 0 }, b, c, e, f;

void
foo (void)
{
  for (; e < 1; e++)
{
  for (b = 0; b < 7; b++)
{
  c |= (a[b + 1] != 0);
  if (d.f0)
break;
}
  f += b;
}
}
where the value is actually used after the loop, in all cases the generated
IL looks sane.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2014-01-03  Jakub Jelinek  ja...@redhat.com

PR tree-optimization/59519
* tree-vect-loop-manip.c (slpeel_update_phi_nodes_for_guard1): Don't
ICE if get_current_def (current_new_name) is already non-NULL, as long
as it is a phi result of some other phi in *new_exit_bb that has
the same argument.

* gcc.dg/vect/pr59519-1.c: New test.
* gcc.dg/vect/pr59519-2.c: New test.

--- gcc/tree-vect-loop-manip.c.jj   2013-12-10 12:43:21.0 +0100
+++ gcc/tree-vect-loop-manip.c  2014-01-02 16:29:21.983645769 +0100
@@ -483,7 +483,18 @@ slpeel_update_phi_nodes_for_guard1 (edge
  if (!current_new_name)
continue;
 }
-  gcc_assert (get_current_def (current_new_name) == NULL_TREE);
+  tree new_name = get_current_def (current_new_name);
+  /* Because of peeled_chrec optimization it is possible that we have
+set this earlier.  Verify the PHI has the same value.  */
+  if (new_name)
+   {
+ gimple phi = SSA_NAME_DEF_STMT (new_name);
+ gcc_assert (gimple_code (phi) == GIMPLE_PHI
+	      && gimple_bb (phi) == *new_exit_bb
+	      && (PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop))
+		  == loop_arg));
+ continue;
+   }
 
   set_current_def (current_new_name, PHI_RESULT (new_phi));
 }
--- gcc/testsuite/gcc.dg/vect/pr59519-1.c.jj2014-01-02 16:39:07.743647669 
+0100
+++ gcc/testsuite/gcc.dg/vect/pr59519-1.c   2014-01-02 16:40:22.414276947 
+0100
@@ -0,0 +1,19 @@
+/* PR tree-optimization/59519 */
+/* { dg-do compile } */
+/* { dg-additional-options -O3 } */
+
+int a, b, c, d;
+
+void
+foo (void)
+{
+  for (; d; d++)
+    for (b = 0; b < 14; b++)
+  {
+   c |= 1;
+   if (a)
+ break;
+  }
+}
+
+/* { dg-final { cleanup-tree-dump vect } } */
--- gcc/testsuite/gcc.dg/vect/pr59519-2.c.jj2014-01-02 16:39:10.726629480 
+0100
+++ gcc/testsuite/gcc.dg/vect/pr59519-2.c   2014-01-02 16:40:26.213249520 
+0100
@@ -0,0 +1,20 @@
+/* PR tree-optimization/59519 */
+/* { dg-do compile } */
+/* { dg-additional-options -O3 } */
+
+struct S { int f0; } d;
+int a[8] = { 0 }, b, c, e;
+
+void
+foo (void)
+{
+  for (; e < 1; e++)
+    for (b = 0; b < 7; b++)
+  {
+   c |= (a[b + 1] != 0);
+   if (d.f0)
+ break;
+  }
+}
+
+/* { dg-final { cleanup-tree-dump vect } } */

Jakub


Re: [PATCH] Fix devirtualization ICE (PR tree-optimization/59622, take 2)

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 11:24:53AM +0100, Richard Biener wrote:
 works just fine for me but shows
 
   int (*T391) () f; 
   
   int _4; 
   
   
   
   bb 2: 
   
   f_1 = foo;  
   
   _4 = f_1 (0, 0);
 
 so I suppose the fntype is not NULL but int (*)().

Sure, I said TYPE_ARG_TYPES is NULL, which is the () there.

Jakub


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Thu, Jan 2, 2014 at 11:18 PM, Eric Botcazou ebotca...@adacore.com wrote:
 Note that it has unexpected side-effects: previously, in 32-bit mode,
 256-bit aggregate objects would have been given 256-bit alignment; now,
 they will fall back to default alignment, for example 32-bit only.

 In case this wasn't clear enough, just compile in 32-bit mode:

 int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8};

It looks to me that we don't need to adjust anything with max_align.
Using following patch:

--cut here--
Index: i386.c
===
--- i386.c  (revision 206311)
+++ i386.c  (working copy)
@@ -26465,6 +26465,7 @@
 int
 ix86_data_alignment (tree type, int align, bool opt)
 {
+#if 0
   int max_align = optimize_size ? BITS_PER_WORD
: MIN (512, MAX_OFILE_ALIGNMENT);

@@ -26476,6 +26477,7 @@
  || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
align  max_align)
 align = max_align;
+#endif

   /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
  to 16byte boundary.  */
--cut here--

and following testcase:

-- cut here--
float a[8] = { 1, 2, 3, 4, 5, 6, 7, 8};

extern float z[8];

void t (void)
{
  int i;

  for (i = 0; i < 8; i++)
z[i] = z[i] + a[i];
}
--cut here--

When compiled with -m32 -mavx, we get:

.align 32
.type   a, @object
.size   a, 32
a:

so, the alignment was already raised elsewhere. We get .align 16 for
-msse -m32 when vectorizing.

without -msse (and consequently without vectorizing), we get for -m32:

.align 4
.type   a, @object
.size   a, 32
a:

which corresponds to 32bit ABI rules (we still get .align16 for 64bit ABI).

What bothers me in this testcase is (unrelated) alignment of z[8]
array. Even for 64bit targets, we get:

#(insn:TI 6 5 8 2 (set (reg:V4SF 21 xmm0 [orig:90 vect__4.5 ] [90])
#(unspec:V4SF [
#(mem/c:V4SF (reg/f:DI 0 ax [89]) [2 MEM[(float
*)z]+0 S16 A32])
#] UNSPEC_LOADU)) al.c:10 1195 {*sse_loadups}
# (nil))
movups  (%rax), %xmm0   # 6 *sse_loadups[length = 3]

ABI guarantees 16 byte alignment of z[8], but we fail to communicate
this to the compiler.

Uros.


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Eric Botcazou
 When compiled with -m32 -mavx, we get:
 
 .align 32
 .type   a, @object
 .size   a, 32
 a:
 
 so, the alignment was already raised elsewhere. We get .align 16 for
 -msse -m32 when vectorizing.
 
 without -msse (and consequently without vectorizing), we get for -m32:
 
 .align 4
 .type   a, @object
 .size   a, 32
 a:
 
 which corresponds to 32bit ABI rules (we still get .align16 for 64bit ABI).

Yes, but the issue is that it was 32 before so the alignment decrease is weird.

-- 
Eric Botcazou


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 12:20 PM, Eric Botcazou ebotca...@adacore.com wrote:
 When compiled with -m32 -mavx, we get:

 .align 32
 .type   a, @object
 .size   a, 32
 a:

 so, the alignment was already raised elsewhere. We get .align 16 for
 -msse -m32 when vectorizing.

 without -msse (and consequently without vectorizing), we get for -m32:

 .align 4
 .type   a, @object
 .size   a, 32
 a:

 which corresponds to 32bit ABI rules (we still get .align16 for 64bit ABI).

 Yes, but the issue is that it was 32 before so the alignment decrease is 
 weird.

Yes, but this change is benign, and as shown, I don't think we need
this functionality at all. The data from other TUs is accessed with a
4 byte alignment (on 32bit targets, and unfortunately also on 64bit
targets), and the alignment of local data is increased elsewhere.

I am testing a patch that removes max_align part from ix86_data_alignment.

Uros.


Re: Rb tree node recycling patch

2014-01-03 Thread Christopher Jefferson
Personally, I consider the testing insufficient, although the testing
was already insufficient! Note that this is my opinion, don't assume I
talk for all of libstdc++!

After I broke std::nth_element in 4.8.2 (woops), I am of the view that
the libstdc++ testsuite is defficient -- many code paths are never
tested. The nth_element breakage should really have been picked up,
about 1/3 of all random invocations of nth_element failed!

You could be inspired by the code I wrote for algorithms, it is in
testsuite/util/testsuite_containergen.h. The basic idea is I make sure
to generate special cases (size 0 containers, containers containing
only a single value repeated), and then a range of containers of
various different sizes.

I realise suggesting this is probably as much work as your patch, and
you shouldn't assume this is required, but I think it would improve
the testsuite. If you do go down this route, then be sure to reduce
your test's size for simulators. You can look at for example at my new
tests which feature:

// { dg-options -std=gnu++11 }
// { dg-options -std=gnu++11 -DSIMULATOR_TEST { target simulator } }

Which enables the macro SIMULATOR_TEST on simulators, where I do much
less testing (else the tester takes too long).


On 27 December 2013 18:30, François Dumont frs.dum...@gmail.com wrote:
 Hi

 Here is a patch to add recycling of Rb tree nodes when possible.

 I replaced the _Rb_tree::_M_move_assign with a move assignment operator
 to benefit from:
 - move of elements when the whole data structure cannot be moved
 - faster data structure cloning rather than full regeneration of the tree
 when _M_move_assign was failing

 Note that this patch contains also a cleanup of a useless template
 parameter _Is_pod_comparator on _Rb_tree_impl. If you want to apply it
 quickly for 4.9 do not hesitate.

 I haven't done any specific test for this feature, existing ones looks
 good enough to me. If you want me to add some I will when back from
 vacation. I am mostly submitting this patch to show you that I worked on it
 and you do not need to do it yourself.

 2013-12-27  François Dumont fdum...@gcc.gnu.org

 * include/bits/stl_tree.h (_Rb_tree_reuse_or_alloc_node): New.
 (_Rb_tree_alloc_node): Likewise.
 (_Rb_tree::_M_clone_node): Made template to take a node
 generator.
 (_Rb_tree_impl): Remove unused _Is_pod_comparator template
 value.
 (_Rb_tree::_M_move_assign): Replace by...
 (_Rb_tree::operator(_Rb_tree)): ...this.
 (_Rb_tree_impl::_M_reset): New.
 (_Rb_tree::_M_insert_): Add node generator parameter.
 (_Rb_tree::_M_copy): Add overload taking a node generator.
 (_Rb_tree::_M_insert_unique_): Add node generator parameter.
 (_Rb_tree::_M_insert_equal_): Add node generator parameter.
 (_Rb_tree::_M_assign_unique): New.
 (_Rb_tree::_M_assign_equal): New.
 (_Rb_tree): Adapt to use _Rb_tree_impl::_M_reset and reuse
 nodes as much as possible.
 * include/bits/stl_set.h (set::operator=(set): Adapt to use
 _Rb_tree move assignment operator.
 (set::operator=(initializer_list)): Adapt to use
 _Rb_tree::_M_assign_unique.
 * include/bits/stl_multiset.h
 (multiset::operator=(multiset)): Adapt to use
 _Rb_tree move assignment operator.
 (multiset::operator=(initializer_list)): Adapt to use
 _Rb_tree::_M_assign_equal.
 * include/bits/stl_map.h (map::operator=(map): Adapt to use
 _Rb_tree move assignment operator.
 (map::operator=(initializer_list)): Adapt to use
 _Rb_tree::_M_assign_unique.
 * include/bits/stl_multimap.h
 (multimap::operator=(multimap)): Adapt to use
 _Rb_tree move assignment operator.
 (multimap::operator=(initializer_list)): Adapt to use
 _Rb_tree::_M_assign_equal.

 Tested under Linux x86_64.

 Happy new year.

 François



Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 12:25:00PM +0100, Uros Bizjak wrote:
 I am testing a patch that removes max_align part from ix86_data_alignment.

That looks like unnecessary pessimization.  Note the hunk in question is
guarded with opt, which means it is an optimization rather than ABI issue,
it can increase alignment, but the compiler can only assume the increased
alignment if the symbol is not public or if it is public, but can't be
preempted by another TU's definition.  Even in that case it can be
worthwhile to increase the alignment, say if doing versioning for alignment,
or say just doing some AVX/AVX2/AVX512F version of memcpy/memset that can
handle it faster if it is sufficiently aligned by testing it at runtime.

So, IMHO we shouldn't drop it, just improve it.
Perhaps:

  int max_align = optimize_size ? BITS_PER_WORD
: MIN (512, MAX_OFILE_ALIGNMENT);

  if (opt
      && AGGREGATE_TYPE_P (type)
      && TYPE_SIZE (type)
      && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
      && align < max_align)
    {
      int this_align = 256;
      for (this_align = 256; this_align <= max_align; this_align *= 2)
	if (TREE_INT_CST_LOW (TYPE_SIZE (type)) < (unsigned) this_align
	    && !TREE_INT_CST_HIGH (TYPE_SIZE (type)))
	  break;
	else if (align < this_align)
	  align = this_align;
    }

which will handle both the 256 bit alignment for >= 256 bit objects,
512 bit alignment for >= 512 bit objects and will be prepared for the
future.

128 bit I think doesn't need to be handled, DATA_ALIGNMENT has been
using 256-bit test already since its introduction in 1998:
http://gcc.gnu.org/ml/gcc-bugs/1998-03/msg00011.html

Jakub


Re: [PATCH] Don't count asm goto for 4 branch limit optimization (PR target/59625)

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 9:37 AM, Jakub Jelinek ja...@redhat.com wrote:

 Linus complained about padding inserted in between asm goto.  In 4.8
 even -mtune=generic performs ix86_avoid_jump_mispredicts, in 4.9 only e.g.
 -mtune=atom and others, but not generic.
 The issue is that assuming every asm goto must contain a jump is too
 pessimistic, often asm goto (may) transfer control to the label(s) through
 other means (e.g. on the side data structure with labels etc.), plus we
 conservatively assume asm goto is 0 bytes long, so we were adding the
 paddings pretty much always when multiple asm goto stmts appeared
 consecutively.

 This patch just ignores asm goto from the optimization.

 Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 4.8
 after a while?

 2014-01-03  Jakub Jelinek  ja...@redhat.com

 PR target/59625
 * config/i386/i386.c (ix86_avoid_jump_mispredicts): Don't consider
 asm goto as jump.

 * gcc.target/i386/pr59625.c: New test.

OK everywhere, but please add a short comment about asm goto handling

 +/* Verify we don't consider asm goto as a jump for four jumps limit
 +   optimization.  asm goto doesn't have to contain a jump at all,
 +   the branching to labels can happen through different means.  */
 +/* { dg-final { scan-assembler-not 
 p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align\[^\n\r\]*\[\n\r]*\[^\n\r\]*p2align
  } } */

This RE can probably use match count, {n}, as e.g. in g++.dg/abi/mangle33.C

Thanks,
Uros.


[PATCH] Fix typo in docs

2014-01-03 Thread Marek Polacek
__builtin_{FILE,FUNCTION} had wrong return types, thus fixed.
Tested by make html and links -g.  Ok for trunk?

2014-01-03  Marek Polacek  pola...@redhat.com

PR other/59661
* doc/extend.texi: Fix the return value of __builtin_FUNCTION and
__builtin_FILE.

--- gcc/doc/extend.texi.mp  2014-01-03 12:32:13.366420582 +0100
+++ gcc/doc/extend.texi 2014-01-03 13:02:03.723889259 +0100
@@ -8728,12 +8728,12 @@ This function is the equivalent to the p
 macro and returns the line number of the invocation of the built-in.
 @end deftypefn
 
-@deftypefn {Built-in Function} int __builtin_FUNCTION ()
+@deftypefn {Built-in Function} {const char *} __builtin_FUNCTION ()
 This function is the equivalent to the preprocessor @code{__FUNCTION__}
 macro and returns the function name the invocation of the built-in is in.
 @end deftypefn
 
-@deftypefn {Built-in Function} int __builtin_FILE ()
+@deftypefn {Built-in Function} {const char *} __builtin_FILE ()
 This function is the equivalent to the preprocessor @code{__FILE__}
 macro and returns the file name the invocation of the built-in is in.
 @end deftypefn

Marek


Re: [PATCH] Fix typo in docs

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 01:03:24PM +0100, Marek Polacek wrote:
 __builtin_{FILE,FUNCTION} had wrong return types, thus fixed.
 Tested by make html and links -g.  Ok for trunk?

Ok, thanks.

 2014-01-03  Marek Polacek  pola...@redhat.com
 
   PR other/59661
   * doc/extend.texi: Fix the return value of __builtin_FUNCTION and
   __builtin_FILE.
 
 --- gcc/doc/extend.texi.mp2014-01-03 12:32:13.366420582 +0100
 +++ gcc/doc/extend.texi   2014-01-03 13:02:03.723889259 +0100
 @@ -8728,12 +8728,12 @@ This function is the equivalent to the p
  macro and returns the line number of the invocation of the built-in.
  @end deftypefn
  
 -@deftypefn {Built-in Function} int __builtin_FUNCTION ()
 +@deftypefn {Built-in Function} {const char *} __builtin_FUNCTION ()
  This function is the equivalent to the preprocessor @code{__FUNCTION__}
  macro and returns the function name the invocation of the built-in is in.
  @end deftypefn
  
 -@deftypefn {Built-in Function} int __builtin_FILE ()
 +@deftypefn {Built-in Function} {const char *} __builtin_FILE ()
  This function is the equivalent to the preprocessor @code{__FILE__}
  macro and returns the file name the invocation of the built-in is in.
  @end deftypefn

Jakub


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 12:59 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Fri, Jan 03, 2014 at 12:25:00PM +0100, Uros Bizjak wrote:
 I am testing a patch that removes max_align part from ix86_data_alignment.

 That looks like unnecessary pessimization.  Note the hunk in question is
 guarded with opt, which means it is an optimization rather than ABI issue,
 it can increase alignment, but the compiler can only assume the increased
 alignment if the symbol is not public or if it is public, but can't be
 preempted by another TU's definition.  Even in that case it can be
 worthwhile to increase the alignment, say if doing versioning for alignment,
 or say just doing some AVX/AVX2/AVX512F version of memcpy/memset that can
 handle it faster if it is sufficiently aligned by testing it at runtime.

 So, IMHO we shouldn't drop it, just improve it.
 Perhaps:

   int max_align = optimize_size ? BITS_PER_WORD
 : MIN (512, MAX_OFILE_ALIGNMENT);

   if (opt
AGGREGATE_TYPE_P (type)
TYPE_SIZE (type)
TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
align  max_align)
 {
   int this_align = 256;
   for (this_align = 256; this_align = max_align; this_align *= 2)
 if (TREE_INT_CST_LOW (TYPE_SIZE (type))  (unsigned) this_align
  !TREE_INT_CST_HIGH (TYPE_SIZE (type)))
   break;
 else if (align  this_align)
   align = this_align;
 }

 which will handle both the 256 bit alignment for = 256 bit objects,
 512 bit alignment for = 512 bit objects and will be prepared for the
 future.

 128 bit I think doesn't need to be handled, DATA_ALIGNMENT has been
 using 256-bit test already since it's introduction in 1998:
 http://gcc.gnu.org/ml/gcc-bugs/1998-03/msg00011.html

Thanks for the pointer, there is indeed the recommendation in
optimization manual [1], section 3.6.4, where it is said:

--quote--
Misaligned data access can incur significant performance penalties.
This is particularly true for cache line
splits. The size of a cache line is 64 bytes in the Pentium 4 and
other recent Intel processors, including
processors based on Intel Core microarchitecture.
An access to data unaligned on 64-byte boundary leads to two memory
accesses and requires several
μops to be executed (instead of one). Accesses that span 64-byte
boundaries are likely to incur a large
performance penalty, the cost of each stall generally are greater on
machines with longer pipelines.

...

A 64-byte or greater data structure or array should be aligned so that
its base address is a multiple of 64.
Sorting data in decreasing size order is one heuristic for assisting
with natural alignment. As long as 16-
byte boundaries (and cache lines) are never crossed, natural alignment
is not strictly necessary (though
it is an easy way to enforce this).
--/quote--

So, this part has nothing to do with AVX512, but with cache line
width. And we do have a --param l1-cache-line-size=64, detected with
-march=native that could come handy here.

This part should be rewritten (and commented) with the information
above in mind.

[1] 
http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-optimization-manual.html

Uros.


Extend -fstack-protector-strong to cover calls with return slot

2014-01-03 Thread Florian Weimer
This patch fixes a loophole in the -fstack-protector-strong protection. 
 If a function call uses the return slot optimization, the caller needs 
stack protector instrumentation because the return slot is addressable.


Bootstrapped and regression-tested on x86_64-redhat-linux-gnu, with 
C/C++/Java enabled.  Okay for trunk?


--
Florian Weimer / Red Hat Product Security Team
gcc/

2014-01-03  Florian Weimer  fwei...@redhat.com

	* cfgexpand.c (call_with_return_slot_opt_p): New function.
	(expand_used_vars): Emit stack protector instrumentation in strong
	mode if call_with_return_slot_opt_p.

gcc/testsuite/

2014-01-03  Florian Weimer  fwei...@redhat.com

	* gcc.dg/fstack-protector-strong.c: Add coverage for named return
	values.
	* g++.dg/fstack-protector-strong.C: Likewise.

Index: gcc/cfgexpand.c
===
--- gcc/cfgexpand.c	(revision 206311)
+++ gcc/cfgexpand.c	(working copy)
@@ -1599,6 +1599,22 @@
   return 0;
 }
 
+/* Check if the basic block has a call which uses a return slot.  */
+
+static bool
+call_with_return_slot_opt_p (basic_block bb)
+{
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+   !gsi_end_p (gsi); gsi_next (gsi))
+{
+  gimple stmt = gsi_stmt (gsi);
+      if (gimple_code (stmt) == GIMPLE_CALL
+	  && gimple_call_return_slot_opt_p (stmt))
+	return true;
+}
+  return false;
+}
+
 /* Expand all variables used in the function.  */
 
 static rtx
@@ -1669,22 +1685,35 @@
   pointer_map_destroy (ssa_name_decls);
 
   if (flag_stack_protect == SPCT_FLAG_STRONG)
-FOR_EACH_LOCAL_DECL (cfun, i, var)
-  if (!is_global_var (var))
+{
+  FOR_EACH_LOCAL_DECL (cfun, i, var)
+	if (!is_global_var (var))
+	  {
+	tree var_type = TREE_TYPE (var);
+	/* Examine local referenced variables that have their
+	   addresses taken, contain an array, or are arrays.  */
+	if (TREE_CODE (var) == VAR_DECL
+	    && (TREE_CODE (var_type) == ARRAY_TYPE
+		|| TREE_ADDRESSABLE (var)
+		|| (RECORD_OR_UNION_TYPE_P (var_type)
+		    && record_or_union_type_has_array_p (var_type))))
+	  {
+		gen_stack_protect_signal = true;
+		break;
+	  }
+	  }
+  /* The return slot introduces addressable local variables.  */
+  if (!gen_stack_protect_signal)
 	{
-	  tree var_type = TREE_TYPE (var);
-	  /* Examine local referenced variables that have their addresses taken,
-	 contain an array, or are arrays.  */
-	  if (TREE_CODE (var) == VAR_DECL
-	   (TREE_CODE (var_type) == ARRAY_TYPE
-		  || TREE_ADDRESSABLE (var)
-		  || (RECORD_OR_UNION_TYPE_P (var_type)
-		   record_or_union_type_has_array_p (var_type
+	  basic_block bb;
+	  FOR_ALL_BB_FN (bb, cfun)
 	{
-	  gen_stack_protect_signal = true;
-	  break;
+	  gen_stack_protect_signal = call_with_return_slot_opt_p (bb);
+	  if (gen_stack_protect_signal)
+		break;
 	}
 	}
+}
 
   /* At this point all variables on the local_decls with TREE_USED
  set are not associated with any block scope.  Lay them out.  */
Index: gcc/testsuite/g++.dg/fstack-protector-strong.C
===
--- gcc/testsuite/g++.dg/fstack-protector-strong.C	(revision 206311)
+++ gcc/testsuite/g++.dg/fstack-protector-strong.C	(working copy)
@@ -32,4 +32,39 @@
   return global_func (a);
 }
 
-/* { dg-final { scan-assembler-times stack_chk_fail 2 } } */
+/* Frame addressed exposed through return slot. */
+
+struct B
+{
+  /* Discourage passing this struct in registers. */
+  int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10;
+};
+
+B global_func ();
+void noop ();
+
+int foo3 ()
+{
+  return global_func ().a1;
+}
+
+int foo4 ()
+{
+  try {
+noop ();
+return 0;
+  } catch (...) {
+return global_func ().a1;
+  }
+}
+
+int foo5 ()
+{
+  try {
+return global_func ().a1;
+  } catch (...) {
+return 0;
+  }
+}
+
+/* { dg-final { scan-assembler-times stack_chk_fail 5 } } */
Index: gcc/testsuite/gcc.dg/fstack-protector-strong.c
===
--- gcc/testsuite/gcc.dg/fstack-protector-strong.c	(revision 206311)
+++ gcc/testsuite/gcc.dg/fstack-protector-strong.c	(working copy)
@@ -131,4 +131,17 @@
   return bb.three;
 }
 
-/* { dg-final { scan-assembler-times stack_chk_fail 10 } } */
+struct B
+{
+  /* Discourage passing this struct in registers. */
+  int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10;
+};
+
+struct B global3 (void);
+
+int foo11 ()
+{
+  return global3 ().a1;
+}
+
+/* { dg-final { scan-assembler-times stack_chk_fail 11 } } */


Re: [PATCH][ARM] Add new cores to t-aprofile

2014-01-03 Thread Gerald Pfeifer
Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
This patch adds the recently introduced cores to the t-aprofile
multilib 
machinery. The values added are cortex-a15.cortex-a7, cortex-a12,
cortex-a57 and 
cortex-a57.cortex-a53.

Are you going to add this to the release notes (gcc-4.9/changes.html) or will 
this be part of an ARM mega patch later? For the record, I prefer incremental 
changes, but whatever works. 

Gerald



Re: Update GCC 4.9 changes.html

2014-01-03 Thread Gerald Pfeifer
H.J. Lu hjl.to...@gmail.com wrote:
This patch adds Intel microarchitecture changes to GCC 4.9
changes.html.
OK to install?

Okay, modulo the fix Bernhard noted. 
Thanks,
Gerald



Re: [ARM] add armv7ve support

2014-01-03 Thread Gerald Pfeifer
Renlin Li renlin...@arm.com wrote:
Hi all,

This patch will add armv7ve support to gcc. Armv7ve is basically a 
armv7-a architecture profile with Virtualization Extensions.

Mind adding this to the release notes?

Gerald



Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 1:27 PM, Uros Bizjak ubiz...@gmail.com wrote:

 I am testing a patch that removes max_align part from ix86_data_alignment.

 That looks like unnecessary pessimization.  Note the hunk in question is
 guarded with opt, which means it is an optimization rather than ABI issue,
 it can increase alignment, but the compiler can only assume the increased
 alignment if the symbol is not public or if it is public, but can't be
 preempted by another TU's definition.  Even in that case it can be
 worthwhile to increase the alignment, say if doing versioning for alignment,
 or say just doing some AVX/AVX2/AVX512F version of memcpy/memset that can
 handle it faster if it is sufficiently aligned by testing it at runtime.

 So, IMHO we shouldn't drop it, just improve it.
 Perhaps:

   int max_align = optimize_size ? BITS_PER_WORD
 : MIN (512, MAX_OFILE_ALIGNMENT);

   if (opt
AGGREGATE_TYPE_P (type)
TYPE_SIZE (type)
TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
align  max_align)
 {
   int this_align = 256;
   for (this_align = 256; this_align = max_align; this_align *= 2)
 if (TREE_INT_CST_LOW (TYPE_SIZE (type))  (unsigned) this_align
  !TREE_INT_CST_HIGH (TYPE_SIZE (type)))
   break;
 else if (align  this_align)
   align = this_align;
 }

 which will handle both the 256 bit alignment for = 256 bit objects,
 512 bit alignment for = 512 bit objects and will be prepared for the
 future.

 128 bit I think doesn't need to be handled, DATA_ALIGNMENT has been
 using 256-bit test already since it's introduction in 1998:
 http://gcc.gnu.org/ml/gcc-bugs/1998-03/msg00011.html

 Thanks for the pointer, there is indeed the recommendation in
 optimization manual [1], section 3.6.4, where it is said:

 --quote--
 Misaligned data access can incur significant performance penalties.
 This is particularly true for cache line
 splits. The size of a cache line is 64 bytes in the Pentium 4 and
 other recent Intel processors, including
 processors based on Intel Core microarchitecture.
 An access to data unaligned on 64-byte boundary leads to two memory
 accesses and requires several
 μops to be executed (instead of one). Accesses that span 64-byte
 boundaries are likely to incur a large
 performance penalty, the cost of each stall generally are greater on
 machines with longer pipelines.

 ...

 A 64-byte or greater data structure or array should be aligned so that
 its base address is a multiple of 64.
 Sorting data in decreasing size order is one heuristic for assisting
 with natural alignment. As long as 16-
 byte boundaries (and cache lines) are never crossed, natural alignment
 is not strictly necessary (though
 it is an easy way to enforce this).
 --/quote--

 So, this part has nothing to do with AVX512, but with cache line
 width. And we do have a --param l1-cache-line-size=64, detected with
 -march=native that could come handy here.

 This part should be rewritten (and commented) with the information
 above in mind.

Like in the patch below. Please note, that the block_tune setting for
the nocona is wrong, -march=native on my trusted old P4 returns:

--param l1-cache-size=16 --param l1-cache-line-size=64 --param
l2-cache-size=2048 -mtune=nocona

which is consistent with the above quote from manual.

2014-01-02  Uros Bizjak  ubiz...@gmail.com

* config/i386/i386.c (ix86_data_alignment): Calculate max_align
from prefetch_block tune setting.
(nocona_cost): Correct size of prefetch block to 64.

The patch was bootstrapped on x86_64-pc-linux-gnu and is currently in
regression testing. If there are no comments, I will commit it to
mainline and release branches after a couple of days.

Uros.
Index: i386.c
===
--- i386.c  (revision 206311)
+++ i386.c  (working copy)
@@ -1568,7 +1568,7 @@ struct processor_costs nocona_cost = {
   8,   /* MMX or SSE register to integer */
   8,   /* size of l1 cache.  */
   1024,/* size of l2 cache.  */
-  128, /* size of prefetch block */
+  64,  /* size of prefetch block */
   8,   /* number of parallel prefetches */
   1,   /* Branch cost */
   COSTS_N_INSNS (6),   /* cost of FADD and FSUB insns.  */
@@ -26465,9 +26465,21 @@ ix86_constant_alignment (tree exp, int align)
 int
 ix86_data_alignment (tree type, int align, bool opt)
 {
-  int max_align = optimize_size ? BITS_PER_WORD
-   : MIN (512, MAX_OFILE_ALIGNMENT);
+  /* Misaligned data access can incur significant performance penalties.
+ This is particularly true for cache line splits. The size of a cache
+ line is 64 bytes in the Pentium 4 and other 

Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 02:35:36PM +0100, Uros Bizjak wrote:
 Like in the patch below. Please note, that the block_tune setting for
 the nocona is wrong, -march=native on my trusted old P4 returns:
 
 --param l1-cache-size=16 --param l1-cache-line-size=64 --param
 l2-cache-size=2048 -mtune=nocona
 
 which is consistent with the above quote from manual.
 
 2014-01-02  Uros Bizjak  ubiz...@gmail.com
 
 * config/i386/i386.c (ix86_data_alignment): Calculate max_align
 from prefetch_block tune setting.
 (nocona_cost): Correct size of prefetch block to 64.
 
 The patch was bootstrapped on x86_64-pc-linux-gnu and is currently in
 regression testing. If there are no comments, I will commit it to
 mainline and release branches after a couple of days.

That still has the effect of not aligning (for most tunings) 32 to 63 bytes
long aggregates to 32 bytes, while previously they were aligned.  Forcing
aligning 32 byte long aggregates to 64 bytes would be overkill, 32 byte
alignment is just fine for those (and ensures it never crosses 64 byte
boundary), for 33 to 63 bytes perhaps using 64 bytes alignment wouldn't
be that bad, just wouldn't match what we have done before.

Jakub


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 2:43 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Fri, Jan 03, 2014 at 02:35:36PM +0100, Uros Bizjak wrote:
 Like in the patch below. Please note, that the block_tune setting for
 the nocona is wrong, -march=native on my trusted old P4 returns:

 --param l1-cache-size=16 --param l1-cache-line-size=64 --param
 l2-cache-size=2048 -mtune=nocona

 which is consistent with the above quote from manual.

 2014-01-02  Uros Bizjak  ubiz...@gmail.com

 * config/i386/i386.c (ix86_data_alignment): Calculate max_align
 from prefetch_block tune setting.
 (nocona_cost): Correct size of prefetch block to 64.

 The patch was bootstrapped on x86_64-pc-linux-gnu and is currently in
 regression testing. If there are no comments, I will commit it to
 mainline and release branches after a couple of days.

 That still has the effect of not aligning (for most tunings) 32 to 63 bytes
 long aggregates to 32 bytes, while previously they were aligned.  Forcing
 aligning 32 byte long aggregates to 64 bytes would be overkill, 32 byte
 alignment is just fine for those (and ensures it never crosses 64 byte
 boundary), for 33 to 63 bytes perhaps using 64 bytes alignment wouldn't
 be that bad, just wouldn't match what we have done before.

Please note that previous value was based on earlier (pre P4)
recommendation and it was appropriate for older chips with 32byte
cache line. The value should be updated long ago, when 64bit cache
lines were introduced, but was probably missed due to usage of magic
value without comment.

Ah, I see. My patch deals only with structures, larger than cache
line. As recommended in the "As long as 16-byte boundaries (and cache
lines) are never crossed, natural alignment is not strictly necessary
(though it is an easy way to enforce this)." part of the manual, we
should align smaller structures to 16 or 32 bytes.

Yes, I agree. Can you please merge your patch together with the proposed patch?

Thanks,
Uros.


[PATCH] m68k: properly handle double word push for all registers

2014-01-03 Thread Andreas Schwab
handle_move_double in m68k.c has a special case to handle movd
N(sp),-(sp), but this case can happen with any (address) register,
especially if a stack like variable is forced into a register (this
happened in gforth).  Bootstrapped on m68k-suse-linux and installed on
trunk and gcc-4.8 branch.

Andreas.

* config/m68k/m68k.c (handle_move_double): Handle pushes with
overlapping registers also for registers other than the stack
pointer.
---
 gcc/config/m68k/m68k.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/m68k/m68k.c b/gcc/config/m68k/m68k.c
index 25b8580..f20d071 100644
--- a/gcc/config/m68k/m68k.c
+++ b/gcc/config/m68k/m68k.c
@@ -3328,12 +3328,12 @@ handle_move_double (rtx operands[2],
latehalf[1] = adjust_address (operands[1], SImode, 0);
 }
 
-  /* If insn is effectively movd N(sp),-(sp) then we will do the
- high word first.  We should use the adjusted operand 1 (which is N+4(sp))
- for the low word as well, to compensate for the first decrement of sp.  */
+  /* If insn is effectively movd N(REG),-(REG) then we will do the high
+ word first.  We should use the adjusted operand 1 (which is N+4(REG))
+ for the low word as well, to compensate for the first decrement of
+ REG.  */
   if (optype0 == PUSHOP
-      && REGNO (XEXP (XEXP (operands[0], 0), 0)) == STACK_POINTER_REGNUM
-      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
+      && reg_overlap_mentioned_p (XEXP (XEXP (operands[0], 0), 0),
+				  operands[1]))
 operands[1] = middlehalf[1] = latehalf[1];
 
   /* For (set (reg:DI N) (mem:DI ... (reg:SI N) ...)),
-- 
1.8.5.2

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
And now for something completely different.


[PING] [PATCH] fixed pr59651 new test case

2014-01-03 Thread Bingfeng Mei
Updated: the patch passes aarch64 tests. Added missing ChangeLog entry for 
testsuite. OK to commit? 

Bingfeng
-Original Message-
From: Bingfeng Mei 
Sent: 02 January 2014 15:02
To: gcc-patches@gcc.gnu.org
Cc: 'tbela...@arm.com'
Subject: [PATCH] fixed pr59651  new test case

Hi, 
This patch fixes pr59651. The original regression for pr52943 only appears on 
AArch64 target. I constructed a similar test that also exposes bug on x86-64. 
The problem is that calculation of address range in alias versioning for loops 
with negative step is wrong during vectorization. For example, for a loop that 
access int a[3] - a[1], the old calculated address range is [a, a+12). It 
should be [a+4, a+16) instead.

Bootstrapped and passed testsuite on x86-64. Get confirmed by original reporter 
that regression is fixed on AArch64. OK to commit?

Thanks,
Bingfeng


patch_pr59651
Description: patch_pr59651


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 03:02:51PM +0100, Uros Bizjak wrote:
 Please note that previous value was based on earlier (pre P4)
 recommendation and it was appropriate for older chips with 32byte
 cache line. The value should be updated long ago, when 64bit cache
 lines were introduced, but was probably missed due to usage of magic
 value without comment.
 
 Ah, I see. My patch deals only with structures, larger than cache
 line. As recommended in As long as 16-byte boundaries (and cache
 lines) are never crossed, natural alignment is not strictly necessary
 (though it is an easy way to enforce this). part of the manual, we
 should align smaller structures to 16 or 32 bytes.
 
 Yes, I agree. Can you please merge your patch together with the proposed 
 patch?

How do we want to treat the 33-63 resp. 17-31 bytes long aggregates though?
32 byte long and 16 byte long aggregates can surely be aligned just to 32
resp. 16 bytes and never crosses 64 byte boundary then and doesn't waste
space in paddings unnecessarily (still opt thing, ABI can override),
but do we want to waste some extra bytes to ensure that 17-31 resp. 33-63
bytes long objects don't cross 64 byte boundaries by aligning those to 32
resp. 64 bytes, or do align them to 16 resp. 32 bytes instead?

Jakub


Re: [PING] [PATCH] fixed pr59651 new test case

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 02:07:03PM +, Bingfeng Mei wrote:
 This patch fixes pr59651.  The original regression for pr52943 only
 appears on AArch64 target.  I constructed a similar test that also exposes
 bug on x86-64.  The problem is that calculation of address range in alias
 versioning for loops with negative step is wrong during vectorization. 
 For example, for a loop that access int a[3] - a[1], the old calculated
 address range is [a, a+12).  It should be [a+4, a+16) instead.

--- tree-vect-loop-manip.c  (revision 206279)
+++ tree-vect-loop-manip.c  (working copy)
@@ -2241,12 +2241,26 @@ vect_create_cond_for_alias_checks (loop_
   tree seg_a_min = addr_base_a;
   tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
   if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
-   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
+   {
+	  seg_a_min =
+	    fold_build_pointer_plus (seg_a_max,
+				     TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))));
+	  seg_a_max =
+	    fold_build_pointer_plus (addr_base_a,
+				     TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))));
+   }

Too long lines, can you create a temporary tree var for TYPE_SIZE_UNIT (...); 
value?
Can you add a comment why you do this?

--- testsuite/gcc.dg/torture/pr59651.c  (revision 0)
+++ testsuite/gcc.dg/torture/pr59651.c  (revision 0)

Perhaps better would be to put this test into gcc.dg/vect/pr59651.c
(with the required /* { dg-final { cleanup-tree-dump vect } } */ ),
so that it is also tested say on i686-linux with -msse2 etc.
Or duplicate between those two places (the body of the test can be
included in one of those two from the other place using relative path).

@@ -0,0 +1,20 @@
+/* { dg-do run } */
+
+extern void abort (void);
+int a[] = { 0, 0, 0, 0, 0, 0, 0, 6 };
+
+int b;
+int
+main ()
+{
+  for (;;)
+{
+  b = 7;
+  for (; b; b -= 1)

Sounds like C-reduce weirdness, just write for (b = 7; b; --b)

+   a[b] = a[7] & 1;
+  break;
+}
+  if (a[1] != 0)
+abort ();
+  return 0;
+}

Ok with those changes.

Jakub


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 3:13 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Fri, Jan 03, 2014 at 03:02:51PM +0100, Uros Bizjak wrote:
 Please note that previous value was based on earlier (pre P4)
 recommendation and it was appropriate for older chips with 32byte
 cache line. The value should be updated long ago, when 64bit cache
 lines were introduced, but was probably missed due to usage of magic
 value without comment.

 Ah, I see. My patch deals only with structures, larger than cache
 line. As recommended in As long as 16-byte boundaries (and cache
 lines) are never crossed, natural alignment is not strictly necessary
 (though it is an easy way to enforce this). part of the manual, we
 should align smaller structures to 16 or 32 bytes.

 Yes, I agree. Can you please merge your patch together with the proposed 
 patch?

 How do we want to treat the 33-63 resp. 17-31 bytes long aggregates though?
 32 byte long and 16 byte long aggregates can surely be aligned just to 32
 resp. 16 bytes and never crosses 64 byte boundary then and doesn't waste
 space in paddings unnecessarily (still opt thing, ABI can override),
 but do we want to waste some extra bytes to ensure that 17-31 resp. 33-63
 bytes long objects don't cross 64 byte boundaries by aligning those to 32
 resp. 64 bytes, or do align them to 16 resp. 32 bytes instead?

Looking at significant performance penalties part of the above
recommendation, I'd say to align it to 32/64 byte boundaries.
Hopefully, the linker is able to put other data in the hole?

Uros.


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 03:35:11PM +0100, Uros Bizjak wrote:
 On Fri, Jan 3, 2014 at 3:13 PM, Jakub Jelinek ja...@redhat.com wrote:
  On Fri, Jan 03, 2014 at 03:02:51PM +0100, Uros Bizjak wrote:
  Please note that previous value was based on earlier (pre P4)
  recommendation and it was appropriate for older chips with 32byte
  cache line. The value should be updated long ago, when 64bit cache
  lines were introduced, but was probably missed due to usage of magic
  value without comment.
 
  Ah, I see. My patch deals only with structures, larger than cache
  line. As recommended in As long as 16-byte boundaries (and cache
  lines) are never crossed, natural alignment is not strictly necessary
  (though it is an easy way to enforce this). part of the manual, we
  should align smaller structures to 16 or 32 bytes.
 
  Yes, I agree. Can you please merge your patch together with the proposed 
  patch?
 
  How do we want to treat the 33-63 resp. 17-31 bytes long aggregates though?
  32 byte long and 16 byte long aggregates can surely be aligned just to 32
  resp. 16 bytes and never crosses 64 byte boundary then and doesn't waste
  space in paddings unnecessarily (still opt thing, ABI can override),
  but do we want to waste some extra bytes to ensure that 17-31 resp. 33-63
  bytes long objects don't cross 64 byte boundaries by aligning those to 32
  resp. 64 bytes, or do align them to 16 resp. 32 bytes instead?
 
 Looking at significant performance penalties part of the above
 recommendation, I'd say to align it to 32/64 byte boundaries.
 Hopefully, the linker is able to put other data in the hole?

Unless -fdata-sections linker doesn't affect this, unless it is about the
very first or very last object in the TU in the particular section.
GCC itself would need to (supposedly unless -fno-toplevel-reorder) attempt
to sort the varpool constants that are going to be emitted prior to emitting
them (compute what section each decl would be emitted to, and within each
section start with putting variable with biggest alignment first and then
try to pack them nicely).  Kind of similar to what is done for
-fsection-anchors, just don't emit everything as a single block, just sort
them in the varpool queue.

Jakub


Re: [ARM] Fix incorrect restore of FP registers in nested APCS frame

2014-01-03 Thread Richard Earnshaw
On 22/12/13 17:42, Eric Botcazou wrote:
 Hi,
 
 this is the last issue with nested APCS frames according to our testing.  
 When 
 the IP register needs to be preserved on entry and r3 isn't free and there are
 no arguments to push, the prologue creates a slot above the frame, so various 
 internal offsets need to be adjusted.  One has been missed, leading to:
 
   sub ip, fp, #20
   fldmfdd ip!, {d8}
   sub sp, fp, #16
   ldmfd   sp, {r3, fp, sp, pc}
 
 in the epilogue of the nested frame.  That's wrong because the difference 
 between the 2 immediates must be equal to the size of the saved FP registers.
 
 Tested on ARM/VxWorks (where it fixes several ACATS tests at -O2) and 
 ARM/EABI, OK for the mainline?
 
 
 2013-12-22  Eric Botcazou  ebotca...@adacore.com
 
   * config/arm/arm.c (arm_get_frame_offsets): Revamp long lines.
   (arm_expand_epilogue_apcs_frame): Take into account the number of bytes
   used to save the static chain register in the computation of the offset
   from which the FP registers need to be restored.
 
 
 2013-12-22  Eric Botcazou  ebotca...@adacore.com
 
   * gcc.target/arm/neon-nested-apcs.c: New test.
 
 

OK, modulo the 2 nits below.

/* In Thumb mode this is incorrect, but never used.  */
 -  offsets->frame = offsets->saved_args + (frame_pointer_needed ? 4 : 0) +
 -	arm_compute_static_chain_stack_bytes();
 +  offsets->frame
 +    = offsets->saved_args
 +      + arm_compute_static_chain_stack_bytes ()
 +      + (frame_pointer_needed ? 4 : 0);

There should be brackets around the entire expression when split across
multiple lines, so that auto-indentation is correctly preserved: ie

  offsets->frame
= (expression_in_brackets_when
   - over_multiple_lines);

/* Find the offset of the floating-point save area in the frame.  */
 -  floats_from_frame = offsets->saved_args - offsets->frame;
 +  floats_from_frame
 +    = offsets->saved_args
 +      + arm_compute_static_chain_stack_bytes ()
 +      - offsets->frame;
  

Same here.

R.



[C++] PR58950: warn for unused __builtin_shuffle result

2014-01-03 Thread Marc Glisse

Hello,

this is the piece of patch you commented on in the PR.

Bootstrap+testsuite on x86_64-unknown-linux-gnu.

2014-01-03  Marc Glisse  marc.gli...@inria.fr

PR c++/58950
gcc/cp/
* cvt.c (convert_to_void): Handle VEC_PERM_EXPR and VEC_COND_EXPR.
gcc/testsuite/
* g++.dg/pr58950.C: New file.

--
Marc GlisseIndex: gcc/cp/cvt.c
===
--- gcc/cp/cvt.c(revision 206313)
+++ gcc/cp/cvt.c(working copy)
@@ -1396,21 +1396,23 @@ convert_to_void (tree expr, impl_conv_vo
  code = TREE_CODE (e);
  tclass = TREE_CODE_CLASS (code);
 	  if ((tclass == tcc_comparison
 	       || tclass == tcc_unary
 	       || (tclass == tcc_binary
 		   && !(code == MODIFY_EXPR
 			|| code == INIT_EXPR
 			|| code == PREDECREMENT_EXPR
 			|| code == PREINCREMENT_EXPR
 			|| code == POSTDECREMENT_EXPR
-			|| code == POSTINCREMENT_EXPR)))
+			|| code == POSTINCREMENT_EXPR))
+	       || code == VEC_PERM_EXPR
+	       || code == VEC_COND_EXPR)
 	      && (complain & tf_warning))
 	    warning_at (loc, OPT_Wunused_value, "value computed is not used");
}
}
   expr = build1 (CONVERT_EXPR, void_type_node, expr);
 }
   if (! TREE_SIDE_EFFECTS (expr))
 expr = void_zero_node;
   return expr;
 }
Index: gcc/testsuite/g++.dg/pr58950.C
===
--- gcc/testsuite/g++.dg/pr58950.C  (revision 0)
+++ gcc/testsuite/g++.dg/pr58950.C  (working copy)
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options -Wall } */
+
+void f(){
+  int i __attribute__((vector_size(2*sizeof(int)))) = { 2, 3 };
+  __builtin_shuffle (i, i); /* { dg-warning "value computed is not used" } */
+  ++i?1:0; /* { dg-warning "value computed is not used" } */
+}

Property changes on: gcc/testsuite/g++.dg/pr58950.C
___
Added: svn:keywords
## -0,0 +1 ##
+Author Date Id Revision URL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property


[COMMITTED] [PATCH] fixed pr59651 new test case

2014-01-03 Thread Bingfeng Mei
Jakub, thanks. Committed with suggested changes.

Bingfeng

-Original Message-
From: Jakub Jelinek [mailto:ja...@redhat.com] 
Sent: 03 January 2014 14:26
To: Bingfeng Mei
Cc: gcc-patches@gcc.gnu.org; tbela...@arm.com
Subject: Re: [PING] [PATCH] fixed pr59651  new test case

On Fri, Jan 03, 2014 at 02:07:03PM +, Bingfeng Mei wrote:
 This patch fixes pr59651.  The original regression for pr52943 only
 appears on AArch64 target.  I constructed a similar test that also exposes
 bug on x86-64.  The problem is that calculation of address range in alias
 versioning for loops with negative step is wrong during vectorization. 
 For example, for a loop that access int a[3] - a[1], the old calculated
 address range is [a, a+12).  It should be [a+4, a+16) instead.

--- tree-vect-loop-manip.c  (revision 206279)
+++ tree-vect-loop-manip.c  (working copy)
@@ -2241,12 +2241,26 @@ vect_create_cond_for_alias_checks (loop_
   tree seg_a_min = addr_base_a;
   tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
   if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
-   seg_a_min = seg_a_max, seg_a_max = addr_base_a;
+   {
+	  seg_a_min =
+	    fold_build_pointer_plus (seg_a_max,
+				     TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))));
+	  seg_a_max =
+	    fold_build_pointer_plus (addr_base_a,
+				     TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr))));
+   }

Too long lines, can you create a temporary tree var for TYPE_SIZE_UNIT (...); 
value?
Can you add a comment why you do this?

--- testsuite/gcc.dg/torture/pr59651.c  (revision 0)
+++ testsuite/gcc.dg/torture/pr59651.c  (revision 0)

Perhaps better would be to put this test into gcc.dg/vect/pr59651.c
(with the required /* { dg-final { cleanup-tree-dump vect } } */ ),
so that it is also tested say on i686-linux with -msse2 etc.
Or duplicate between those two places (the body of the test can be
included in one of those two from the other place using relative path).

@@ -0,0 +1,20 @@
+/* { dg-do run } */
+
+extern void abort (void);
+int a[] = { 0, 0, 0, 0, 0, 0, 0, 6 };
+
+int b;
+int
+main ()
+{
+  for (;;)
+{
+  b = 7;
+  for (; b; b -= 1)

Sounds like C-reduce weirdness, just write for (b = 7; b; --b)

+   a[b] = a[7] & 1;
+  break;
+}
+  if (a[1] != 0)
+abort ();
+  return 0;
+}

Ok with those changes.

Jakub


[PING][GOMP4][PATCH] SIMD-enabled functions (formerly Elemental functions) for C++

2014-01-03 Thread Iyer, Balaji V
Hello Everyone,
Did anyone get a chance to look into this?

Thanks,

Balaji V. Iyer.

 -Original Message-
 From: Iyer, Balaji V
 Sent: Monday, December 23, 2013 11:51 PM
 To: gcc-patches@gcc.gnu.org
 Cc: Jakub Jelinek
 Subject: [PING][GOMP4][PATCH] SIMD-enabled functions (formerly
 Elemental functions) for C++
 
 Ping!
 
 -Balaji V. Iyer.
 
  -Original Message-
  From: Iyer, Balaji V
  Sent: Thursday, December 19, 2013 1:12 PM
  To: Jakub Jelinek
  Cc: 'Aldy Hernandez (al...@redhat.com)'; 'gcc-patches@gcc.gnu.org'
  Subject: RE: [GOMP4][PATCH] SIMD-enabled functions (formerly
 Elemental
  functions) for C++
 
  Hi Jakub,
  Attached, please find a fixed patch. I have answered your questions
  below. Is this OK for trunk?
 
  Here are the ChangeLog entries:
  Gcc/cp/ChangeLog
  2013-12-19  Balaji V. Iyer  balaji.v.i...@intel.com
 
  * parser.c (cp_parser_direct_declarator): When Cilk Plus is enabled
  see if there is an attribute after function decl.  If so, then
  parse them now.
  (cp_parser_late_return_type_opt): Handle parsing of Cilk Plus SIMD
  enabled function late parsing.
  (cp_parser_gnu_attribute_list): Parse all the tokens for the vector
  attribute for a SIMD-enabled function.
  (cp_parser_omp_all_clauses): Skip parsing to the end of pragma when
  the function is used by SIMD-enabled function (indicated by NULL
  pragma token).   Added 3 new clauses: PRAGMA_CILK_CLAUSE_MASK,
  PRAGMA_CILK_CLAUSE_NOMASK and
  PRAGMA_CILK_CLAUSE_VECTORLENGTH
  (cp_parser_cilk_simd_vectorlength): Modified this function to handle
  vectorlength clause in SIMD-enabled function and #pragma SIMD's
  vectorlength clause.  Added a new bool parameter to differentiate
  between the two.
  (cp_parser_cilk_simd_fn_vector_attrs): New function.
  (is_cilkplus_vector_p): Likewise.
  (cp_parser_late_parsing_elem_fn_info): Likewise.
  (cp_parser_omp_clause_name): Added a check for mask, nomask
  and vectorlength clauses when Cilk Plus is enabled.
  (cp_parser_omp_clause_linear): Added a new parameter of type bool
  and emit a sorry message when step size is a parameter.
  * parser.h (cp_parser::cilk_simd_fn_info): New field.
 
  Testsuite/ChangeLog
  2013-12-19  Balaji V. Iyer  balaji.v.i...@intel.com
 
  * g++.dg/cilk-plus/cilk-plus.exp: Called the C/C++ common tests for
  SIMD enabled function.
  * g++.dg/cilk-plus/ef_test.C: New test.
  * c-c++-common/cilk-plus/vlength_errors.c: Added new dg-error tags
  to differenciate C error messages from C++ ones.
 
  Thanks,
 
  Balaji V. Iyer.
 
   -Original Message-
   From: Jakub Jelinek [mailto:ja...@redhat.com]
   Sent: Thursday, December 19, 2013 2:23 AM
   To: Iyer, Balaji V
   Cc: 'Aldy Hernandez (al...@redhat.com)'; 'gcc-patches@gcc.gnu.org'
   Subject: Re: [GOMP4][PATCH] SIMD-enabled functions (formerly
  Elemental
   functions) for C++
  
   On Wed, Dec 18, 2013 at 11:36:04PM +, Iyer, Balaji V wrote:
--- a/gcc/cp/decl2.c
+++ b/gcc/cp/decl2.c
@@ -1124,6 +1124,10 @@ is_late_template_attribute (tree attr, tree
  decl)
is_attribute_p (omp declare simd, name))
 return true;
   
+  /* Ditto as above for Cilk Plus SIMD-enabled function attributes.
+ */  if (flag_enable_cilkplus  is_attribute_p (cilk simd
+ function,
   name))
+return true;
  
   Why?  It doesn't have any argument, why it should be processed late?
  
 
  Fixed.
 
@@ -17097,6 +17102,14 @@ cp_parser_direct_declarator (cp_parser*
parser,
   
  attrs = cp_parser_std_attribute_spec_seq (parser);
   
+ /* In here, we handle cases where attribute is used 
after
+the function declaration.  For example:
+void func (int x) __attribute__((vector(..)));  */
+ if (flag_enable_cilkplus
+  cp_lexer_next_token_is_keyword (parser-lexer,
+RID_ATTRIBUTE))
+   attrs = chainon (cp_parser_gnu_attributes_opt 
(parser),
+attrs);
  late_return = (cp_parser_late_return_type_opt
 (parser, declarator,
  memfn ? cv_quals : -1));
  
   Doesn't this change the grammar (for all attributes, not just Cilk+
   specific
   ones) just based on whether -fcilkplus has been specified or not?
  
 
  OK. Fixed this by making it parse tentatively (sort of similar to how
  you parse attributes after labels (line #9584))
 
@@ -17820,10 +17833,14 @@ cp_parser_late_return_type_opt
   (cp_parser* parser, cp_declarator *declarator,
  declarator
   

Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Joseph S. Myers
On Fri, 3 Jan 2014, Jakub Jelinek wrote:

 I've noticed that especially with the AVX512F introduction we use
 GET_MODE_SIZE (MODEmode) quite heavily in the i386 *.md files, and
 the problem with that is GET_MODE_SIZE isn't a compile time constant,
 needs to read mode_size array (non-const) at runtime.

It would seem better to me for genmodes to generate appropriate macro / 
inline function definitions that can be used by GET_MODE_SIZE (along the 
lines of (__builtin_constant_p (MODE) && !mode_size_variable (MODE) ? 
get_mode_size_constant (MODE) : (unsigned short) mode_size[MODE]), where 
mode_size_variable and get_mode_size_constant are always_inline functions 
generated by genmodes) - that way all targets are covered automatically, 
without needing such duplication of mode sizes.  (Of course such 
optimizations wouldn't apply for a first-stage compiler bootstrapped by a 
compiler not supporting __builtin_constant_p, but lack of optimization in 
the first stage of a bootstrap is not a particular concern.)

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: wide-int, tree-ssa

2014-01-03 Thread Kenneth Zadeck

On 11/26/2013 07:34 AM, Richard Biener wrote:

--- a/gcc/tree-ssa-ccp.c
+++ b/gcc/tree-ssa-ccp.c
@@ -98,6 +98,15 @@ along with GCC; see the file COPYING3.  If not see
 array CONST_VAL[i].VALUE.  That is fed into substitute_and_fold for
 final substitution and folding.

+   This algorithm uses wide-ints at the max precision of the target.
+   This means that, with one uninteresting exception, variables with
+   UNSIGNED types never go to VARYING because the bits above the
+   precision of the type of the variable are always zero.  The
+   uninteresting case is a variable of UNSIGNED type that has the
+   maximum precision of the target.  Such variables can go to VARYING,
+   but this causes no loss of infomation since these variables will
+   never be extended.
+

I don't understand this.  In CCP a SSA name drops to varying when
it's not constant.  How is that related to signedness or precision?!


Richi,

This is a bogosity that is inherited from the double-int code. Consider 
an unsigned int in double-int (or for that matter in widest-int).   It 
has a rep of a bunch of leading zeros followed by 32 bits of real 
stuff.Even if you know nothing about the value, it still has the 
upper bits of zero that you know are constant zeros.   Of course this is 
not true for signed numbers because the upper bits are smeared with the 
sign bits.


This was what i thought you were talking about when you argued many 
months ago when you said that infinite precision was more natural and 
that i would have trouble with a fixed precision based on the size of 
the type or mode and is the reason that tree-ssa-ccp uses widest int 
rather than wide-int.   I do actually believe that wide-int is more 
natural here.   However this bogosity does buy you a few constants.


My first cut at this pass used wide-int rather than widest int, but 
there are in fact constants that the pass finds that depend on this 
being done this way so to satisfy the bit for bit same code rule for 
values smaller than timode, so i left this the way that it was but 
decided to document it.


The proper way to do this is in fact to use wide-int, not widest-int but 
that would require major surgery to the code that converts values from 
one type to another.I plan to do this during the next stage 1.   At 
that time, i will also enhance the code to expand the sign bit if it is 
known for signed types.


Kenny


Re: [PATCH i386 5/8] [AVX-512] Extend vectorizer hooks.

2014-01-03 Thread Uros Bizjak
On Fri, Jan 3, 2014 at 3:02 PM, Uros Bizjak ubiz...@gmail.com wrote:

 Like in the patch below. Please note, that the block_tune setting for
 the nocona is wrong, -march=native on my trusted old P4 returns:

 --param l1-cache-size=16 --param l1-cache-line-size=64 --param
 l2-cache-size=2048 -mtune=nocona

 which is consistent with the above quote from manual.

 2014-01-02  Uros Bizjak  ubiz...@gmail.com

 * config/i386/i386.c (ix86_data_alignment): Calculate max_align
 from prefetch_block tune setting.
 (nocona_cost): Correct size of prefetch block to 64.

 The patch was bootstrapped on x86_64-pc-linux-gnu and is currently in
 regression testing. If there are no comments, I will commit it to
 mainline and release branches after a couple of days.

 That still has the effect of not aligning (for most tunings) 32 to 63 bytes
 long aggregates to 32 bytes, while previously they were aligned.  Forcing
 aligning 32 byte long aggregates to 64 bytes would be overkill, 32 byte
 alignment is just fine for those (and ensures it never crosses 64 byte
 boundary), for 33 to 63 bytes perhaps using 64 bytes alignment wouldn't
 be that bad, just wouldn't match what we have done before.

 Please note that previous value was based on earlier (pre P4)
 recommendation and it was appropriate for older chips with 32byte
 cache line. The value should be updated long ago, when 64bit cache
 lines were introduced, but was probably missed due to usage of magic
 value without comment.

 Ah, I see. My patch deals only with structures, larger than cache
 line. As recommended in As long as 16-byte boundaries (and cache
 lines) are never crossed, natural alignment is not strictly necessary
 (though it is an easy way to enforce this). part of the manual, we
 should align smaller structures to 16 or 32 bytes.

 Yes, I agree. Can you please merge your patch together with the proposed 
 patch?

On a second thought, the crossing of 16-byte boundaries is mentioned
for the data *access* (the instruction itself) if it is not naturally
aligned (please see example 3-40 and fig 3-2), which is *NOT* in our
case.

So, we don't have to align 32 byte structures in any way for newer
processors, since this optimization applies to 64+ byte (larger or
equal to cache line size) structures only. Older processors are
handled correctly, modulo nocona, where its cache line size value has
to be corrected.

Following that, my original patch implements this optimization in the
correct way.

Uros.


[C PATCH] Don't pedwarn for C99/C11 enum bit-fields (PR c/57773)

2014-01-03 Thread Marek Polacek
As Paul Eggert says in the PR, we shouldn't warn for enum bit-fields
in C99/C11 mode.  C11 6.7.2.1 (5) says A bit-field shall have a type
that is a qualified or unqualified version of _Bool, signed int,
unsigned int, or some other implementation-defined type., so ISTM
that enum bit-fields should be fine.  OTOH, I would warn in ISO C
mode.  It's true that no constraint is violated, but in C89 in 3.5.2.1
Semantics there's: A bit-field may have type int, unsigned int, or
signed int. so it seems desirable to warn in this case.

Regtested/bootstrapped on x86_64-linux, ok for trunk?

2014-01-03  Marek Polacek  pola...@redhat.com

PR c/57773
c/
* c-decl.c (check_bitfield_type_and_width): Warn for enum bit-fields
only in ISO C.
testsuite/
* gcc.dg/pr57773.c: New test.

--- gcc/c/c-decl.c.mp   2014-01-03 13:50:37.041997222 +0100
+++ gcc/c/c-decl.c  2014-01-03 14:46:29.115816235 +0100
@@ -4840,7 +4840,8 @@ check_bitfield_type_and_width (tree *typ
   if (!in_system_header_at (input_location)
       && type_mv != integer_type_node
       && type_mv != unsigned_type_node
-      && type_mv != boolean_type_node)
+      && type_mv != boolean_type_node
+      && !flag_isoc99)
 pedwarn (input_location, OPT_Wpedantic,
 type of bit-field %qs is a GCC extension, name);
 
--- gcc/testsuite/gcc.dg/pr57773.c.mp   2014-01-03 14:50:51.097902818 +0100
+++ gcc/testsuite/gcc.dg/pr57773.c  2014-01-03 14:49:05.727462306 +0100
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c99 -Wpedantic" } */
+
+enum e { zero };
+struct { enum e field: 2; } s;

Marek


Re: [C PATCH] Don't pedwarn for C99/C11 enum bit-fields (PR c/57773)

2014-01-03 Thread Joseph S. Myers
On Fri, 3 Jan 2014, Marek Polacek wrote:

 As Paul Eggert says in the PR, we shouldn't warn for enum bit-fields
 in C99/C11 mode.  C11 6.7.2.1 (5) says A bit-field shall have a type
 that is a qualified or unqualified version of _Bool, signed int,
 unsigned int, or some other implementation-defined type., so ISTM
 that enum bit-fields should be fine.  OTOH, I would warn in ISO C
 mode.  It's true that no constraint is violated, but in C89 in 3.5.2.1
 Semantics there's: A bit-field may have type int, unsigned int, or
 signed int. so it seems desirable to warn in this case.
 
 Regtested/bootstrapped on x86_64-linux, ok for trunk?

Implementation-defined behavior is documented in implement-c.texi, so this 
patch is incomplete as it doesn't update that file where it says:

No other types are permitted in strictly conforming mode.
@c Would it be better to restrict the pedwarn for other types to C90
@c mode and document the other types for C99/C11 mode?

(And this isn't just about enums, but other integer types as well, so the 
test should cover those.)

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Paolo Carlini

Hi,

On 01/02/2014 10:46 PM, Jason Merrill wrote:

On 12/27/2013 07:02 AM, Paolo Carlini wrote:

the same arguments. Conservatively but still more neatly than my first
try, we could maybe use same_type_ignoring_top_level_qualifiers_p in the
definition of the DERIVED_FROM_P macro?


Sure, let's do that.  And add something about incomplete types to the 
pre-function comment for lookup_base.

Great. Thus I successfully tested on x86_64-linux the below.

Thanks,
Paolo.


/cp
2014-01-03  Paolo Carlini  paolo.carl...@oracle.com

* cp-tree.h (DERIVED_FROM_P): True when PARENT and TYPE are the same
class-type (even if incomplete).
* search.c (lookup_base): Extend comment.

/testsuite
2014-01-03  Paolo Carlini  paolo.carl...@oracle.com

* g++.dg/ext/is_base_of_incomplete-2.C: New.
Index: cp/cp-tree.h
===
--- cp/cp-tree.h(revision 206313)
+++ cp/cp-tree.h(working copy)
@@ -1324,7 +1324,9 @@ enum languages { lang_c, lang_cplusplus, lang_java
 /* Nonzero iff TYPE is derived from PARENT. Ignores accessibility and
ambiguity issues.  */
 #define DERIVED_FROM_P(PARENT, TYPE) \
-  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE)
+  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE \
+   || ((TYPE) && NON_UNION_CLASS_TYPE_P (TYPE) \
+       && same_type_ignoring_top_level_qualifiers_p ((PARENT), (TYPE))))
 
 /* Gives the visibility specification for a class type.  */
 #define CLASSTYPE_VISIBILITY(TYPE) \
Index: cp/search.c
===
--- cp/search.c (revision 206313)
+++ cp/search.c (working copy)
@@ -177,8 +177,8 @@ accessible_base_p (tree t, tree base, bool conside
discovered.
 
If the base is inaccessible, or ambiguous, then error_mark_node is
-   returned.  If the tf_error bit of COMPLAIN is not set, no error
-   is issued.  */
+   returned.  If the tf_error bit of COMPLAIN is not set, no error is
+   issued.  If the base is incomplete, then NULL_TREE is returned.  */
 
 tree
 lookup_base (tree t, tree base, base_access access,
Index: testsuite/g++.dg/ext/is_base_of_incomplete-2.C
===
--- testsuite/g++.dg/ext/is_base_of_incomplete-2.C  (revision 0)
+++ testsuite/g++.dg/ext/is_base_of_incomplete-2.C  (working copy)
@@ -0,0 +1,5 @@
+struct T;
+
+int check1[__is_base_of(T, T) ? 1 : -1];
+int check2[__is_base_of(T, const T) ? 1 : -1];
+int check3[__is_base_of(volatile T, T) ? 1 : -1];


Re: Rb tree node recycling patch

2014-01-03 Thread François Dumont

Thanks for your feedback.

I considered my patch enough tested because it impacts basic 
operations on the Rb tree containers. There are also a number of tests 
that are normally dedicated to exception safety but that has the 
advantage of challenging numerous containers operations in a generic way 
and with some randomness. I have already enhanced those tests by adding 
checks on memory leaks. It is the biggest issue that could occur with 
this patch. I will add some tests to this framework to cover the newly 
introduced C++11 allocator aware methods.


To really know how bad or good is the testsuite we should also work 
on test coverage computation but that's a big subject, perhaps for a GSOC...


François


On 01/03/2014 12:57 PM, Christopher Jefferson wrote:

Personally, I consider the testing insufficient, although the testing
was already insufficient! Note that this is my opinion, don't assume I
talk for all of libstdc++!

After I broke std::nth_element in 4.8.2 (woops), I am of the view that
the libstdc++ testsuite is deficient -- many code paths are never
tested. The nth_element breakage should really have been picked up,
about 1/3 of all random invocations of nth_element failed!

You could be inspired by the code I wrote for algorithms, it is in
testsuite/util/testsuite_containergen.h. The basic idea is I make sure
to generate special cases (size 0 containers, containers containing
only a single value repeated), and then a range of containers of
various different sizes.

I realise suggesting this is probably as much work as your patch, and
you shouldn't assume this is required, but I think it would improve
the testsuite. If you do go down this route, then be sure to reduce
your test's size for simulators. You can look at for example at my new
tests which feature:

// { dg-options -std=gnu++11 }
// { dg-options -std=gnu++11 -DSIMULATOR_TEST { target simulator } }

Which enables the macro SIMULATOR_TEST on simulators, where I do much
less testing (else the tester takes too long).


On 27 December 2013 18:30, François Dumont frs.dum...@gmail.com wrote:

Hi

 Here is a patch to add recycling of Rb tree nodes when possible.

 I replaced the _Rb_tree::_M_move_assign with a move assignment operator
to benefit from:
- move of elements when the whole data structure cannot be moved
- faster data structure cloning rather than full regeneration of the tree
when _M_move_assign was failing

 Note that this patch contains also a cleanup of a useless template
parameter _Is_pod_comparator on _Rb_tree_impl. If you want to apply it
quickly for 4.9 do not hesitate.

 I haven't done any specific test for this feature, existing ones look
good enough to me. If you want me to add some I will when back from
vacation. I am mostly submitting this patch to show you that I worked on it
and you do not need to do it yourself.

2013-12-27  François Dumont fdum...@gcc.gnu.org

 * include/bits/stl_tree.h (_Rb_tree_reuse_or_alloc_node): New.
 (_Rb_tree_alloc_node): Likewise.
 (_Rb_tree::_M_clone_node): Made template to take a node
 generator.
 (_Rb_tree_impl): Remove unused _Is_pod_comparator template
 value.
 (_Rb_tree::_M_move_assign): Replace by...
 (_Rb_tree::operator(_Rb_tree)): ...this.
 (_Rb_tree_impl::_M_reset): New.
 (_Rb_tree::_M_insert_): Add node generator parameter.
 (_Rb_tree::_M_copy): Add overload taking a node generator.
 (_Rb_tree::_M_insert_unique_): Add node generator parameter.
 (_Rb_tree::_M_insert_equal_): Add node generator parameter.
 (_Rb_tree::_M_assign_unique): New.
 (_Rb_tree::_M_assign_equal): New.
 (_Rb_tree): Adapt to use _Rb_tree_impl::_M_reset and reuse
 nodes as much as possible.
 * include/bits/stl_set.h (set::operator=(set): Adapt to use
 _Rb_tree move assignment operator.
 (set::operator=(initializer_list)): Adapt to use
 _Rb_tree::_M_assign_unique.
 * include/bits/stl_multiset.h
 (multiset::operator=(multiset)): Adapt to use
 _Rb_tree move assignment operator.
 (multiset::operator=(initializer_list)): Adapt to use
 _Rb_tree::_M_assign_equal.
 * include/bits/stl_map.h (map::operator=(map): Adapt to use
 _Rb_tree move assignment operator.
 (map::operator=(initializer_list)): Adapt to use
 _Rb_tree::_M_assign_unique.
 * include/bits/stl_multimap.h
 (multimap::operator=(multimap)): Adapt to use
 _Rb_tree move assignment operator.
 (multimap::operator=(initializer_list)): Adapt to use
 _Rb_tree::_M_assign_equal.

Tested under Linux x86_64.

Happy new year.

François





Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Jason Merrill

On 01/03/2014 12:29 PM, Paolo Carlini wrote:

-  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE)
+  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE \
+   || ((TYPE)  NON_UNION_CLASS_TYPE_P (TYPE) \
+same_type_ignoring_top_level_qualifiers_p ((PARENT), (TYPE


Let's only check one or the other, depending on whether TYPE is incomplete.


+   issued.  If the base in incomplete, then NULL_TREE is returned.  */


If T is incomplete

Jason



Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Jason Merrill

On 01/03/2014 01:47 PM, Jason Merrill wrote:

+   issued.  If the base in incomplete, then NULL_TREE is returned.  */


If T is incomplete


...even if BASE is the same type.

Jason




Re: [C++] PR58950: warn for unused __builtin_shuffle result

2014-01-03 Thread Jason Merrill

OK.

Jason


Re: Extend -fstack-protector-strong to cover calls with return slot

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 02:15:51PM +0100, Florian Weimer wrote:
 This patch fixes a loophole in the -fstack-protector-strong
 protection.  If a function call uses the return slot optimization,
 the caller needs stack protector instrumentation because the return
 slot is addressable.
 
 Bootstrapped and regression-tested on x86_64-redhat-linux-gnu, with
 C/C++/Java enabled.  Okay for trunk?
 
 -- 
 Florian Weimer / Red Hat Product Security Team

 2014-01-03  Florian Weimer  fwei...@redhat.com
 
   * cfgexpand.c (call_with_return_slot_opt_p): New function.
   (expand_used_vars): Emit stack protector instrumentation in strong
   mode if call_with_return_slot_opt_p.
 
 gcc/testsuite/
 
 2014-01-03  Florian Weimer  fwei...@redhat.com
 
   * gcc.dg/fstack-protector-strong.c: Add coverage for named return
   values.
   * g++.dg/fstack-protector-strong.C: Likewise.
 
 Index: gcc/cfgexpand.c
 ===
 --- gcc/cfgexpand.c   (revision 206311)
 +++ gcc/cfgexpand.c   (working copy)
 @@ -1599,6 +1599,22 @@
return 0;
  }
  
 +/* Check if the basic block has a call which uses a return slot.  */
 +
 +static bool
 +call_with_return_slot_opt_p (basic_block bb)
 +{
 +  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
 +   !gsi_end_p (gsi); gsi_next (gsi))
 +{
 +  gimple stmt = gsi_stmt (gsi);
 +  if (gimple_code (stmt) == GIMPLE_CALL

That would be is_gimple_call (stmt) instead.

Also, I'd think the function is misnamed, given that it checks if there
are any calls with return_slot_opt_p in a bb.  I think it would be
better to move FOR_ALL_BB_FN (bb, cfun) loop also into the
function and call it any_call_...

Lastly, I wonder if gimple_call_return_slot_opt_p is really what you are
after, why does NRV matter here?  Isn't what you are looking for instead
whether the called function returns value through invisible reference,
because then you'll always have some (aggregate) addressable object
in the caller's frame and supposedly you are after making sure that the
callee doesn't overflow the return object.

So, looking at tree-nrv.c, that check would be roughly:
  if (is_gimple_call (stmt)
   gimple_call_lhs (stmt)
   aggregate_value_p (TREE_TYPE (gimple_call_lhs (stmt)),
gimple_call_fndecl (stmt)))

Jakub


[patch] libbacktrace: add support for --disable-werror

2014-01-03 Thread Mike Frysinger
In the same vein as the other dirs, add a --disable-werror option to the
libbacktrace dir to disable the explicit -Werror usage.

2014-01-03  Mike Frysinger  dirtye...@gentoo.org

* configure.ac: Add --enable-werror.
(WARN_FLAGS): Use it.
* configure: Regenerate.

--- a/libbacktrace/configure.ac
+++ a/libbacktrace/configure.ac
@@ -132,8 +132,13 @@ ACX_PROG_CC_WARNING_OPTS([-W -Wall -Wwri
  -Wmissing-format-attribute -Wcast-qual],
  [WARN_FLAGS])
 
+AC_ARG_ENABLE(werror, [AS_HELP_STRING([--enable-werror],
+  [turns on -Werror @:@default=yes@:@])])
+
 if test -n ${with_target_subdir}; then
-  WARN_FLAGS=$WARN_FLAGS -Werror
+  if test x$enable_werror != xno; then
+WARN_FLAGS=$WARN_FLAGS -Werror
+  fi
 fi
 
 AC_SUBST(WARN_FLAGS)
--- a/libbacktrace/configure
+++ a/libbacktrace/configure
@@ -730,6 +730,7 @@ enable_fast_install
 with_gnu_ld
 enable_libtool_lock
 enable_multilib
+enable_werror
 with_system_libunwind
 '
   ac_precious_vars='build_alias
@@ -1370,6 +1371,7 @@ Optional Features:
   optimize for fast installation [default=yes]
   --disable-libtool-lock  avoid locking (might break parallel builds)
   --enable-multilib   build many library versions (default)
+  --enable-werror turns on -Werror [default=yes]
 
 Optional Packages:
   --with-PACKAGE[=ARG]use PACKAGE [ARG=yes]
@@ -11614,8 +11616,16 @@ fi
 CFLAGS=$save_CFLAGS
 
 
+# Check whether --enable-werror was given.
+if test ${enable_werror+set} = set; then :
+  enableval=$enable_werror;
+fi
+
+
 if test -n ${with_target_subdir}; then
-  WARN_FLAGS=$WARN_FLAGS -Werror
+  if test x$enable_werror != xno; then
+WARN_FLAGS=$WARN_FLAGS -Werror
+  fi
 fi
 
 


signature.asc
Description: This is a digitally signed message part.


Re: wide-int, rtl

2014-01-03 Thread Mike Stump
On Jan 2, 2014, at 2:26 PM, Eric Botcazou ebotca...@adacore.com wrote:
 So, I'd like to ping the original patch and Kenny's patch to resolve the
 issues you found.  If you have any other concerns or thoughts, let us
 know.
 
 Almost OK, but remove the strange quotes in the comment for the INTEGER_CST 
 case of expand_expr_real_1

Ok, thanks.  I've resolved it this way:

Index: expr.c
===
--- expr.c  (revision 206323)
+++ expr.c  (working copy)
@@ -9477,11 +9477,10 @@ expand_expr_real_1 (tree exp, rtx target
   return decl_rtl;
 
 case INTEGER_CST:
-  /* Given that TYPE_PRECISION (type) is not always equal to
+  /* Given that TYPE_PRECISION (type) is not always equal to
  GET_MODE_PRECISION (TYPE_MODE (type)), we need to extend from
  the former to the latter according to the signedness of the
- type. */
-
+ type. */
   temp = immed_wide_int_const (wide_int::from
   (exp,
GET_MODE_PRECISION (TYPE_MODE (type)),



Re: How to generate AVX512 instructions now (just to look at them).

2014-01-03 Thread Jakub Jelinek
Hi!

On Fri, Jan 03, 2014 at 08:58:30PM +0100, Toon Moene wrote:
 I don't doubt that would work, what I'm interested in, is (cat verintlin.f):

Well, you need gather loads for that and there you hit PR target/59617.

Completely untested patch that lets your testcase be vectorized
using 64-byte vectors, for vectorizable_mask_load_store it still punts,
but I guess the steps there are first to teach it about non-gather MASK_LOAD
and MASK_STORE, which aren't handled for the AVX512F modes either
(I think V8DI/V8DF/V16SI/V16SF modes should be possible to handle right now)
and then move on to handle the gathers similarly.

2014-01-03  Jakub Jelinek  ja...@redhat.com

PR target/59617
* config/i386/i386.c (ix86_vectorize_builtin_gather): Uncomment
AVX512F gather builtins.
* tree-vect-stmts.c (vectorizable_mask_load_store): For now punt
on gather decls with INTEGER_TYPE masktype.
(vectorizable_load): For INTEGER_TYPE masktype, put the INTEGER_CST
directly into the builtin rather than hoisting it before loop.

--- gcc/config/i386/i386.c.jj   2014-01-03 13:19:14.0 +0100
+++ gcc/config/i386/i386.c  2014-01-03 21:12:23.630145609 +0100
@@ -36527,9 +36527,6 @@ ix86_vectorize_builtin_gather (const_tre
 case V8SImode:
   code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
   break;
-#if 0
-/*  FIXME: Commented until vectorizer can work with (mask_type != src_type)
-   PR59617.   */
 case V8DFmode:
   if (TARGET_AVX512F)
code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
@@ -36554,7 +36551,6 @@ ix86_vectorize_builtin_gather (const_tre
   else
return NULL_TREE;
   break;
-#endif
 default:
   return NULL_TREE;
 }
--- gcc/tree-vect-stmts.c.jj2014-01-03 11:41:01.0 +0100
+++ gcc/tree-vect-stmts.c   2014-01-03 21:29:47.595911084 +0100
@@ -1813,6 +1813,17 @@ vectorizable_mask_load_store (gimple stm
 gather index use not simple.);
  return false;
}
+
+  tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
+  tree masktype
+   = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist;
+  if (TREE_CODE (masktype) == INTEGER_TYPE)
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+masked gather with integer mask not supported.);
+ return false;
+   }
 }
   else if (tree_int_cst_compare (nested_in_vect_loop
 ? STMT_VINFO_DR_STEP (stmt_info)
@@ -5761,6 +5772,7 @@ vectorizable_load (gimple stmt, gimple_s
{
  mask = build_int_cst (TREE_TYPE (masktype), -1);
  mask = build_vector_from_val (masktype, mask);
+ mask = vect_init_vector (stmt, mask, masktype, NULL);
}
   else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
{
@@ -5771,10 +5783,10 @@ vectorizable_load (gimple stmt, gimple_s
  real_from_target (r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
  mask = build_real (TREE_TYPE (masktype), r);
  mask = build_vector_from_val (masktype, mask);
+ mask = vect_init_vector (stmt, mask, masktype, NULL);
}
   else
gcc_unreachable ();
-  mask = vect_init_vector (stmt, mask, masktype, NULL);
 
   scale = build_int_cst (scaletype, gather_scale);
 


Jakub


Re: Extend -fstack-protector-strong to cover calls with return slot

2014-01-03 Thread Florian Weimer

On 01/03/2014 07:57 PM, Jakub Jelinek wrote:


+/* Check if the basic block has a call which uses a return slot.  */
+
+static bool
+call_with_return_slot_opt_p (basic_block bb)
+{
+  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+   !gsi_end_p (gsi); gsi_next (gsi))
+{
+  gimple stmt = gsi_stmt (gsi);
+  if (gimple_code (stmt) == GIMPLE_CALL


That would be is_gimple_call (stmt) instead.


Ah, it's not used consistently everywhere, and I got it from of the 
leftover places.



Also, I'd think the function is misnamed, given that it checks if there
are any calls with return_slot_opt_p in a bb.  I think it would be
better to move FOR_ALL_BB_FN (bb, cfun) loop also into the
function and call it any_call_...


I should probably move both loops (the one for declarations and the one 
for basic blocks) into its own function.



Lastly, I wonder if gimple_call_return_slot_opt_p is really what you are
after, why does NRV matter here?


The C code we generate does not construct the returned value in place 
(presumably because the partial write would be visible with threads, 
longjmp etc.), unlike the C++ code.


That's why I'm interested in instrumenting NRV-able calls only.  But 
gimple_call_return_slot_opt_p doesn't actually give me that.  The GIMPLE 
from the C test case looks like this (before and after applying your 
proposal):


foo11 ()
{
  int D.2265;
  struct B D.2266;

  D.2266 = global3 (); [return slot optimization]
  D.2265 = D.2266.a1;
  return D.2265;
}

In both cases, SSP instrumentation is applied:

.type   foo11, @function
foo11:
.LFB24:
.cfi_startproc
subq$56, %rsp
.cfi_def_cfa_offset 64
movq%rsp, %rdi
movq%fs:40, %rax
movq%rax, 40(%rsp)
xorl%eax, %eax
callglobal3
movq40(%rsp), %rdx
xorq%fs:40, %rdx
movl(%rsp), %eax
jne .L50
addq$56, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.L50:
.cfi_restore_state
call__stack_chk_fail
.cfi_endproc


Isn't what you are looking for instead
whether the called function returns value through invisible reference,
because then you'll always have some (aggregate) addressable object
in the caller's frame and supposedly you are after making sure that the
callee doesn't overflow the return object.

So, looking at tree-nrv.c, that check would be roughly:
   if (is_gimple_call (stmt)
gimple_call_lhs (stmt)
aggregate_value_p (TREE_TYPE (gimple_call_lhs (stmt)),
 gimple_call_fndecl (stmt)))


When I do that, I get SSP instrumentation even when the struct is small 
enough to be returned in registers.  gimple_call_return_slot_opt_p 
returns false in this case.  So gimple_call_return_slot_opt_p appears to 
be misnamed (it's an ABI thing, not really an optimization), but it's 
closer to what I want.


--
Florian Weimer / Red Hat Product Security Team


Re: [PATCH] i?86 unaligned/aligned load improvement for AVX512F

2014-01-03 Thread Kirill Yukhin
Hello,
On 03 Jan 09:59, Jakub Jelinek wrote:
 Does it matter which of vmovdqu32 vs. vmovdqu64 is used if no
 masking/zeroing is performed (i.e. vmovdqu32 (%rax), %zmm0 vs.
 vmovdqu64 (%rax), %zmm0) for performance reasons (i.e. isn't there some
 reinterpretation penalty)?
No, there should be no penalty (at least from today's point of view).
So, I like your patch!

--
Thanks, K


Re: [Patch, Fortran, OOP] PR 59547: Problem with using tbp specification function in multiple class procedures

2014-01-03 Thread Mikael Morin
Le 22/12/2013 11:28, Janus Weil a écrit :
 Hi all,
 
 here is a patch for a rejects-valid problem with type-bound
 procedures, which is due to the fact that the PURE attribute is being
 propagated too late. (I'm not sure if this problem could show up also
 with other attributes, so for now I'm only treating PURE.)
 
 Regtested on x86_64-unknown-linux-gnu. Ok for trunk?
 
OK.

Mikael


Re: [Patch] libgcov.c re-factoring

2014-01-03 Thread Teresa Johnson
On Sun, Dec 22, 2013 at 10:27 AM, Jan Hubicka hubi...@ucw.cz wrote:
 On Tue, Dec 17, 2013 at 7:48 AM, Teresa Johnson tejohn...@google.com wrote:
  On Mon, Dec 16, 2013 at 2:48 PM, Xinliang David Li davi...@google.com 
  wrote:
  Ok -- gcov_write_counter and gcov_write_tag_length are qualified as
  low level primitives for basic gcov format and probably should be kept
  in gcov-io.c.
 
  gcov_rewrite is petty much libgcov runtime implementation details so I
  think it should be moved out. gcov_write_summary is not related to
  gcov low level format either, neither is gcov_seek.  Ok for them to be
  moved?
 
  After looking at these some more, with the idea that gcov-io.c should
  encapsulate the lower level IO routines, then I think all of these
  (including gcov_rewrite) should remain in gcov-io.c. I think
  gcov_write_summary belongs there since all of the other gcov_write_*

 Yep, I think gcov_write_summary and read summary should not be split in 
 between
 two directories.  Similary for gcov_seek/rewrite I can see either the whole
 low-level I/O being abstracted away to -driver.c but currently -driver.c seems
 to contain higher level stuff that you apparently want to fork for kernel
 implementation instead and the actual i/o seems to remain in gcov-io.c

 +GCOV_LINKAGE struct gcov_var gcov_var;

 If gcov_var is not used by gcov-io.c only, why its declaration remains in 
 gcov-io.h?

Good point - moved to gcov-io.c.

 Index: libgcc/libgcov.h
 ===
 --- libgcc/libgcov.h(revision 0)
 +++ libgcc/libgcov.h(revision 0)
 @@ -0,0 +1,225 @@
 +/* Header file for libgcov-*.c.
 +   Contributed by Rong Xu x...@google.com.
 +   Copyright (C) 2013 Free Software Foundation, Inc.
 I believe when the code was created by moving it from elsewhere, the copyright 
 should say
 original date of gcov-io.h.

Fixed the copyright header.

 +
 +#include tconfig.h
 +#include tsystem.h
 +#include coretypes.h
 +#include tm.h
 +#include libgcc_tm.h
 
 I would really like someone working on header restructuring to comment on
 those.
 I am not 100% sure what our best practices currently are in respect of
 including headers within headers and specially after good amount of
 defines like gcov-io.h gets included.

Ok, I have left it the same as the prior patch for now. Note that
gcov-io.h was already including tconfig.h, which has now been moved to
libgcov.h. The other includes were pulled in from the various
libgcov-*.c files that were including them.

New patch (bootstrapped and regression tested) is included below.


 +
 +#include gcov-io.h

 Otherwise the patch  is OK (if header restructuring is fine and after
 moving gcov_var structure definition into gcov-io.c if possible).

Ok, hopefully someone can comment on the header file issue.

Thanks,
Teresa


 Honza

2014-01-03  Rong Xu  x...@google.com

* gcc/gcov-io.c (gcov_var): Move from gcov-io.h.
(gcov_position): Ditto.
(gcov_is_error): Ditto.
(gcov_rewrite): Ditto.
* gcc/gcov-io.h: Refactor. Move gcov_var to gcov-io.h, and libgcov
only part to libgcc/libgcov.h.
* libgcc/libgcov-driver.c: Use libgcov.h.
(buffer_fn_data): Use xmalloc instead of malloc.
(gcov_exit_merge_gcda): Ditto.
* libgcc/libgcov-driver-system.c (allocate_filename_struct): Ditto.
* libgcc/libgcov.h: New common header files for libgcov-*.h.
* libgcc/libgcov-interface.c: Use libgcov.h
* libgcc/libgcov-merge.c: Ditto.
* libgcc/libgcov-profiler.c: Ditto.
* libgcc/Makefile.in: Add dependence to libgcov.h

Index: gcc/gcov-io.c
===
--- gcc/gcov-io.c   (revision 206100)
+++ gcc/gcov-io.c   (working copy)
@@ -36,6 +36,61 @@ static const gcov_unsigned_t *gcov_read_words (uns
 static void gcov_allocate (unsigned);
 #endif

+/* Optimum number of gcov_unsigned_t's read from or written to disk.  */
+#define GCOV_BLOCK_SIZE (1  10)
+
+GCOV_LINKAGE struct gcov_var
+{
+  FILE *file;
+  gcov_position_t start;   /* Position of first byte of block */
+  unsigned offset; /* Read/write position within the block.  */
+  unsigned length; /* Read limit in the block.  */
+  unsigned overread;   /* Number of words overread.  */
+  int error;   /*  0 overflow,  0 disk error.  */
+  int mode;/*  0 writing,  0 reading */
+#if IN_LIBGCOV
+  /* Holds one block plus 4 bytes, thus all coverage reads  writes
+ fit within this buffer and we always can transfer GCOV_BLOCK_SIZE
+ to and from the disk. libgcov never backtracks and only writes 4
+ or 8 byte objects.  */
+  gcov_unsigned_t buffer[GCOV_BLOCK_SIZE + 1];
+#else
+  int endian;  /* Swap endianness.  */
+  /* Holds a variable length block, as the compiler can write
+ strings and needs to backtrack.  */
+  size_t alloc;
+  

Re: [Patch] libgcov.c re-factoring

2014-01-03 Thread Joseph S. Myers
On Fri, 3 Jan 2014, Teresa Johnson wrote:

 Index: libgcc/libgcov.h
 ===
 --- libgcc/libgcov.h(revision 0)
 +++ libgcc/libgcov.h(revision 0)
 @@ -0,0 +1,224 @@
 +/* Header file for libgcov-*.c.
 +   Copyright (C) 1996-2013 Free Software Foundation, Inc.
 +
 +   This file is part of GCC.
 +
 +   GCC is free software; you can redistribute it and/or modify it under
 +   the terms of the GNU General Public License as published by the Free
 +   Software Foundation; either version 3, or (at your option) any later
 +   version.
 +
 +   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 +   WARRANTY; without even the implied warranty of MERCHANTABILITY or
 +   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 +   for more details.
 +
 +   You should have received a copy of the GNU General Public License
 +   along with GCC; see the file COPYING3.  If not see
 +   http://www.gnu.org/licenses/.  */

Any libgcc source file (.c/.h etc., not makefiles) should have the runtime 
license exception.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Paolo Carlini

Hi,

On 01/03/2014 07:47 PM, Jason Merrill wrote:

On 01/03/2014 12:29 PM, Paolo Carlini wrote:

-  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE)
+  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE \
+   || ((TYPE)  NON_UNION_CLASS_TYPE_P (TYPE) \
+same_type_ignoring_top_level_qualifiers_p ((PARENT), 
(TYPE
Let's only check one or the other, depending on whether TYPE is 
incomplete.

The dispatching seems tricky, though, because lookup_base has:

   if (!TYPE_P (t))
{
  t_binfo = t;
  t = BINFO_TYPE (t);
}
  else
{
  t = complete_type (TYPE_MAIN_VARIANT (t));
  t_binfo = TYPE_BINFO (t);
}

thus TYPE_P (t) isn't necessarily true and, more importantly, otherwise 
the function actively tries to complete the type. Do you think it can work?


Thanks,
Paolo.

PS: sorry about the sloppy comment in my last try ;)


Re: [Patch] libgcov.c re-factoring

2014-01-03 Thread Teresa Johnson
On Fri, Jan 3, 2014 at 2:49 PM, Joseph S. Myers jos...@codesourcery.com wrote:
 On Fri, 3 Jan 2014, Teresa Johnson wrote:

 Index: libgcc/libgcov.h
 ===
 --- libgcc/libgcov.h(revision 0)
 +++ libgcc/libgcov.h(revision 0)
 @@ -0,0 +1,224 @@
 +/* Header file for libgcov-*.c.
 +   Copyright (C) 1996-2013 Free Software Foundation, Inc.
 +
 +   This file is part of GCC.
 +
 +   GCC is free software; you can redistribute it and/or modify it under
 +   the terms of the GNU General Public License as published by the Free
 +   Software Foundation; either version 3, or (at your option) any later
 +   version.
 +
 +   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 +   WARRANTY; without even the implied warranty of MERCHANTABILITY or
 +   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 +   for more details.
 +
 +   You should have received a copy of the GNU General Public License
 +   along with GCC; see the file COPYING3.  If not see
 +   http://www.gnu.org/licenses/.  */

 Any libgcc source file (.c/.h etc., not makefiles) should have the runtime
 license exception.

Ok, thanks. I have changed this last paragraph to the following:

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   http://www.gnu.org/licenses/.  */

Teresa


 --
 Joseph S. Myers
 jos...@codesourcery.com



-- 
Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: [C++ Patch] Fix __is_base_of vs incomplete types

2014-01-03 Thread Paolo Carlini
... something like the attached appears to work. Not sure at the moment 
if it could be simplified.


Thanks,
Paolo.


Index: cp/cp-tree.h
===
--- cp/cp-tree.h(revision 206318)
+++ cp/cp-tree.h(working copy)
@@ -1324,7 +1324,12 @@ enum languages { lang_c, lang_cplusplus, lang_java
 /* Nonzero iff TYPE is derived from PARENT. Ignores accessibility and
ambiguity issues.  */
 #define DERIVED_FROM_P(PARENT, TYPE) \
-  (lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE)
+  (((TYPE)  (!TYPE_P (TYPE)  \
+  || TYPE_BINFO (complete_type \
+ (TYPE_MAIN_VARIANT (TYPE) \
+   ? lookup_base ((TYPE), (PARENT), ba_any, NULL, tf_none) != NULL_TREE \
+   : ((TYPE)  NON_UNION_CLASS_TYPE_P (TYPE)  \
+   same_type_ignoring_top_level_qualifiers_p ((PARENT), (TYPE
 
 /* Gives the visibility specification for a class type.  */
 #define CLASSTYPE_VISIBILITY(TYPE) \
Index: cp/search.c
===
--- cp/search.c (revision 206318)
+++ cp/search.c (working copy)
@@ -177,8 +177,9 @@ accessible_base_p (tree t, tree base, bool conside
discovered.
 
If the base is inaccessible, or ambiguous, then error_mark_node is
-   returned.  If the tf_error bit of COMPLAIN is not set, no error
-   is issued.  */
+   returned.  If the tf_error bit of COMPLAIN is not set, no error is
+   issued.  If T cannot be completed, then NULL_TREE is returned even
+   if BASE is the same type.  */
 
 tree
 lookup_base (tree t, tree base, base_access access,
Index: testsuite/g++.dg/ext/is_base_of_incomplete-2.C
===
--- testsuite/g++.dg/ext/is_base_of_incomplete-2.C  (revision 0)
+++ testsuite/g++.dg/ext/is_base_of_incomplete-2.C  (working copy)
@@ -0,0 +1,5 @@
+struct T;
+
+int check1[__is_base_of(T, T) ? 1 : -1];
+int check2[__is_base_of(T, const T) ? 1 : -1];
+int check3[__is_base_of(volatile T, T) ? 1 : -1];


Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Jakub Jelinek
On Fri, Jan 03, 2014 at 03:39:11PM +, Joseph S. Myers wrote:
 On Fri, 3 Jan 2014, Jakub Jelinek wrote:
 
  I've noticed that especially with the AVX512F introduction we use
  GET_MODE_SIZE (MODEmode) quite heavily in the i386 *.md files, and
  the problem with that is GET_MODE_SIZE isn't a compile time constant,
  needs to read mode_size array (non-const) at runtime.
 
 It would seem better to me for genmodes to generate appropriate macro / 
 inline function definitions that can be used by GET_MODE_SIZE (along the 
 lines of (__builtin_constant_p (MODE)  !mode_size_variable (MODE) ? 
 get_mode_size_constant (MODE) : (unsigned short) mode_size[MODE]), where 
 mode_size_variable and get_mode_size_constant are always_inline functions 
 generated by genmodes) - that way all targets are covered automatically, 
 without needing such duplication of mode sizes.  (Of course such 
 optimizations wouldn't apply for a first-stage compiler bootstrapped by a 
 compiler not supporting __builtin_constant_p, but lack of optimization in 
 the first stage of a bootstrap is not a particular concern.)

That is certainly doable (as attached), but strangely if the patch (that I've
already committed) is reverted and this one applied, the .text savings are
much smaller.

Here are .text and .rodata readelf -WS lines from x86_64 (first 4 pairs) and
i686 (last 4 pairs) builds, always vanilla trunk before r206312, that plus
r206312 patch, without r206312 but with attached patch, with both r206312
and attached patch.  So, for .text size the best is both patches, but
for .rodata patches just r206312.  I'll try to look at details why this is so
next week.

  [12] .text PROGBITS004f4b00 0f4b00 1131704 00  AX 
 0   0 16
  [14] .rodata   PROGBITS01626240 1226240 4093b4 00   A 
 0   0 64
  [12] .text PROGBITS004f20a0 0f20a0 11156e4 00  AX 
 0   0 16
  [14] .rodata   PROGBITS016077c0 12077c0 3fcbb4 00   A 
 0   0 64
  [12] .text PROGBITS004f4c60 0f4c60 112b8b4 00  AX 
 0   0 16
  [14] .rodata   PROGBITS01620540 1220540 40b134 00   A 
 0   0 64
  [12] .text PROGBITS004f2200 0f2200 1113eb4 00  AX 
 0   0 16
  [14] .rodata   PROGBITS016060c0 12060c0 3fea74 00   A 
 0   0 64
  [12] .text PROGBITS0811d750 0d5750 12b4464 00  AX  0   0 
16
  [14] .rodata   PROGBITS093d1c00 1389c00 2d8094 00   A  0   0 
64
  [12] .text PROGBITS0811b150 0d3150 12996a4 00  AX  0   0 
16
  [14] .rodata   PROGBITS093b4840 136c840 2d1354 00   A  0   0 
64
  [12] .text PROGBITS0811d840 0d5840 12aa1e4 00  AX  0   0 
16
  [14] .rodata   PROGBITS093c7a40 137fa40 2d8b94 00   A  0   0 
64
  [12] .text PROGBITS0811b240 0d3240 1292914 00  AX  0   0 
16
  [14] .rodata   PROGBITS093adb80 1365b80 2d1f94 00   A  0   0 
64

2014-01-04  Jakub Jelinek  ja...@redhat.com

* genmodes.c (struct mode_data): Add need_bytesize_adj field.
(blank_mode): Initialize it.
(emit_mode_size_inline, emit_mode_nunits_inline,
emit_mode_inner_inline): New functions.
(emit_insn_modes_h): Call them and surround their output with
#if GCC_VERSION = 4001 ... #endif.
* machmode.h (GET_MODE_SIZE, GET_MODE_NUNITS, GET_MODE_INNER):
For GCC_VERSION = 4001 use mode_*_inline routines instead of
mode_* arrays if the argument is __builtin_constant_p.
* lower-subreg.c (dump_choices): Make sure GET_MODE_SIZE argument
is enum machine_mode.
fortran/
* trans-types.c (gfc_init_kinds): Make sure GET_MODE_BITSIZE
argument is enum machine_mode.

--- gcc/lower-subreg.c.jj   2013-12-10 18:18:39.077943292 +0100
+++ gcc/lower-subreg.c  2014-01-03 18:35:00.510418999 +0100
@@ -1371,7 +1371,7 @@ dump_choices (bool speed_p, const char *
   fprintf (dump_file, Choices when optimizing for %s:\n, description);
 
   for (i = 0; i < MAX_MACHINE_MODE; i++)
-    if (GET_MODE_SIZE (i) > UNITS_PER_WORD)
+    if (GET_MODE_SIZE ((enum machine_mode) i) > UNITS_PER_WORD)
   fprintf (dump_file,   %s mode %s for copy lowering.\n,
   choices[speed_p].move_modes_to_split[i]
   ? Splitting
--- gcc/fortran/trans-types.c.jj2013-11-21 22:24:18.790939654 +0100
+++ gcc/fortran/trans-types.c   2014-01-03 18:35:00.534418997 +0100
@@ -373,7 +373,7 @@ gfc_init_kinds (void)
   /* The middle end doesn't support constants larger than 2*HWI.
 Perhaps the target hook shouldn't have accepted these either,
 but just to be safe...  */
-  bitsize = GET_MODE_BITSIZE (mode);
+  bitsize = GET_MODE_BITSIZE ((enum machine_mode) mode);
   if (bitsize > 2*HOST_BITS_PER_WIDE_INT)
continue;
 
--- gcc/machmode.h.jj   2013-11-29 18:22:15.671594951 +0100

Re: wide-int, tree

2014-01-03 Thread Mike Stump
I'm wondering if all the outstanding issues you raised with tree have been 
addressed.  If there are any that remain, let us know.

If they have been, is the original patch (as modified of course by approved 
later work) Ok?

Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Andrew Pinski
On Fri, Jan 3, 2014 at 3:37 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Fri, Jan 03, 2014 at 03:39:11PM +, Joseph S. Myers wrote:
 On Fri, 3 Jan 2014, Jakub Jelinek wrote:

  I've noticed that especially with the AVX512F introduction we use
  GET_MODE_SIZE (MODEmode) quite heavily in the i386 *.md files, and
  the problem with that is GET_MODE_SIZE isn't a compile time constant,
  needs to read mode_size array (non-const) at runtime.

 It would seem better to me for genmodes to generate appropriate macro /
 inline function definitions that can be used by GET_MODE_SIZE (along the
 lines of (__builtin_constant_p (MODE) && !mode_size_variable (MODE) ?
 get_mode_size_constant (MODE) : (unsigned short) mode_size[MODE]), where
 mode_size_variable and get_mode_size_constant are always_inline functions
 generated by genmodes) - that way all targets are covered automatically,
 without needing such duplication of mode sizes.  (Of course such
 optimizations wouldn't apply for a first-stage compiler bootstrapped by a
 compiler not supporting __builtin_constant_p, but lack of optimization in
 the first stage of a bootstrap is not a particular concern.)

 That is certainly doable (as attached), but strangely if the patch (that I've
 already committed) is reverted and this one applied, the .text savings are
 much smaller.

 Here are .text and .rodata readelf -WS lines from x86_64 (first 4 pairs) and
 i686 (last 4 pairs) builds, always vanilla trunk before r206312, that plus
 r206312 patch, without r206312 but with attached patch, with both r206312
 and attached patch.  So, for .text size the best is both patches, but
 for .rodata patches just r206312.  I'll try to look at details why this is so
 next week.

   [12] .text PROGBITS004f4b00 0f4b00 1131704 00  
 AX  0   0 16
   [14] .rodata   PROGBITS01626240 1226240 4093b4 00   
 A  0   0 64
   [12] .text PROGBITS004f20a0 0f20a0 11156e4 00  
 AX  0   0 16
   [14] .rodata   PROGBITS016077c0 12077c0 3fcbb4 00   
 A  0   0 64
   [12] .text PROGBITS004f4c60 0f4c60 112b8b4 00  
 AX  0   0 16
   [14] .rodata   PROGBITS01620540 1220540 40b134 00   
 A  0   0 64
   [12] .text PROGBITS004f2200 0f2200 1113eb4 00  
 AX  0   0 16
   [14] .rodata   PROGBITS016060c0 12060c0 3fea74 00   
 A  0   0 64
   [12] .text PROGBITS0811d750 0d5750 12b4464 00  AX  0   
 0 16
   [14] .rodata   PROGBITS093d1c00 1389c00 2d8094 00   A  0   
 0 64
   [12] .text PROGBITS0811b150 0d3150 12996a4 00  AX  0   
 0 16
   [14] .rodata   PROGBITS093b4840 136c840 2d1354 00   A  0   
 0 64
   [12] .text PROGBITS0811d840 0d5840 12aa1e4 00  AX  0   
 0 16
   [14] .rodata   PROGBITS093c7a40 137fa40 2d8b94 00   A  0   
 0 64
   [12] .text PROGBITS0811b240 0d3240 1292914 00  AX  0   
 0 16
   [14] .rodata   PROGBITS093adb80 1365b80 2d1f94 00   A  0   
 0 64

 2014-01-04  Jakub Jelinek  ja...@redhat.com

 * genmodes.c (struct mode_data): Add need_bytesize_adj field.
 (blank_mode): Initialize it.
 (emit_mode_size_inline, emit_mode_nunits_inline,
 emit_mode_inner_inline): New functions.
 (emit_insn_modes_h): Call them and surround their output with
 #if GCC_VERSION >= 4001 ... #endif.
 * machmode.h (GET_MODE_SIZE, GET_MODE_NUNITS, GET_MODE_INNER):
 For GCC_VERSION >= 4001 use mode_*_inline routines instead of
 mode_* arrays if the argument is __builtin_constant_p.
 * lower-subreg.c (dump_choices): Make sure GET_MODE_SIZE argument
 is enum machine_mode.
 fortran/
 * trans-types.c (gfc_init_kinds): Make sure GET_MODE_BITSIZE
 argument is enum machine_mode.

It turns out Jorn filed a bug about this exact issue (back in 2008).
See bug 36109.


Thanks,
Andrew Pinski



 --- gcc/lower-subreg.c.jj   2013-12-10 18:18:39.077943292 +0100
 +++ gcc/lower-subreg.c  2014-01-03 18:35:00.510418999 +0100
 @@ -1371,7 +1371,7 @@ dump_choices (bool speed_p, const char *
fprintf (dump_file, Choices when optimizing for %s:\n, description);

    for (i = 0; i < MAX_MACHINE_MODE; i++)
 -    if (GET_MODE_SIZE (i) > UNITS_PER_WORD)
 +    if (GET_MODE_SIZE ((enum machine_mode) i) > UNITS_PER_WORD)
fprintf (dump_file,   %s mode %s for copy lowering.\n,
choices[speed_p].move_modes_to_split[i]
? Splitting
 --- gcc/fortran/trans-types.c.jj2013-11-21 22:24:18.790939654 +0100
 +++ gcc/fortran/trans-types.c   2014-01-03 18:35:00.534418997 +0100
 @@ -373,7 +373,7 @@ gfc_init_kinds (void)
/* The middle end doesn't support constants larger than 2*HWI.
  Perhaps the target hook shouldn't have accepted these either,
  but just to be 

Re: [PATCH] Introduce MODE_SIZE mode attribute

2014-01-03 Thread Andrew Pinski
On Fri, Jan 3, 2014 at 6:27 PM, Andrew Pinski pins...@gmail.com wrote:
 On Fri, Jan 3, 2014 at 3:37 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Fri, Jan 03, 2014 at 03:39:11PM +, Joseph S. Myers wrote:
 On Fri, 3 Jan 2014, Jakub Jelinek wrote:

  I've noticed that especially with the AVX512F introduction we use
  GET_MODE_SIZE (MODEmode) quite heavily in the i386 *.md files, and
  the problem with that is GET_MODE_SIZE isn't a compile time constant,
  needs to read mode_size array (non-const) at runtime.

 It would seem better to me for genmodes to generate appropriate macro /
 inline function definitions that can be used by GET_MODE_SIZE (along the
 lines of (__builtin_constant_p (MODE) && !mode_size_variable (MODE) ?
 get_mode_size_constant (MODE) : (unsigned short) mode_size[MODE]), where
 mode_size_variable and get_mode_size_constant are always_inline functions
 generated by genmodes) - that way all targets are covered automatically,
 without needing such duplication of mode sizes.  (Of course such
 optimizations wouldn't apply for a first-stage compiler bootstrapped by a
 compiler not supporting __builtin_constant_p, but lack of optimization in
 the first stage of a bootstrap is not a particular concern.)

 That is certainly doable (as attached), but strangely if the patch (that I've
 already committed) is reverted and this one applied, the .text savings are
 much smaller.

 Here are .text and .rodata readelf -WS lines from x86_64 (first 4 pairs) and
 i686 (last 4 pairs) builds, always vanilla trunk before r206312, that plus
 r206312 patch, without r206312 but with attached patch, with both r206312
 and attached patch.  So, for .text size the best is both patches, but
 for .rodata patches just r206312.  I'll try to look at details why this is so
 next week.

   [12] .text PROGBITS004f4b00 0f4b00 1131704 00  
 AX  0   0 16
   [14] .rodata   PROGBITS01626240 1226240 4093b4 00  
  A  0   0 64
   [12] .text PROGBITS004f20a0 0f20a0 11156e4 00  
 AX  0   0 16
   [14] .rodata   PROGBITS016077c0 12077c0 3fcbb4 00  
  A  0   0 64
   [12] .text PROGBITS004f4c60 0f4c60 112b8b4 00  
 AX  0   0 16
   [14] .rodata   PROGBITS01620540 1220540 40b134 00  
  A  0   0 64
   [12] .text PROGBITS004f2200 0f2200 1113eb4 00  
 AX  0   0 16
   [14] .rodata   PROGBITS016060c0 12060c0 3fea74 00  
  A  0   0 64
   [12] .text PROGBITS0811d750 0d5750 12b4464 00  AX  0   
 0 16
   [14] .rodata   PROGBITS093d1c00 1389c00 2d8094 00   A  0   
 0 64
   [12] .text PROGBITS0811b150 0d3150 12996a4 00  AX  0   
 0 16
   [14] .rodata   PROGBITS093b4840 136c840 2d1354 00   A  0   
 0 64
   [12] .text PROGBITS0811d840 0d5840 12aa1e4 00  AX  0   
 0 16
   [14] .rodata   PROGBITS093c7a40 137fa40 2d8b94 00   A  0   
 0 64
   [12] .text PROGBITS0811b240 0d3240 1292914 00  AX  0   
 0 16
   [14] .rodata   PROGBITS093adb80 1365b80 2d1f94 00   A  0   
 0 64

 2014-01-04  Jakub Jelinek  ja...@redhat.com

 * genmodes.c (struct mode_data): Add need_bytesize_adj field.
 (blank_mode): Initialize it.
 (emit_mode_size_inline, emit_mode_nunits_inline,
 emit_mode_inner_inline): New functions.
 (emit_insn_modes_h): Call them and surround their output with
 #if GCC_VERSION >= 4001 ... #endif.
 * machmode.h (GET_MODE_SIZE, GET_MODE_NUNITS, GET_MODE_INNER):
 For GCC_VERSION >= 4001 use mode_*_inline routines instead of
 mode_* arrays if the argument is __builtin_constant_p.
 * lower-subreg.c (dump_choices): Make sure GET_MODE_SIZE argument
 is enum machine_mode.
 fortran/
 * trans-types.c (gfc_init_kinds): Make sure GET_MODE_BITSIZE
 argument is enum machine_mode.

 It turns out Jorn filed a bug about this exact issue (back in 2008).
 See bug 36109.

Also Kazu filed a similar thing about TREE_CODE_LENGTH and
TREE_CODE_CLASS, see bug 14840; I attached a patch but I never got
around to seeing if it improves compile time speed either.  It
definitely showed up in fold-const.c code at one point.

Thanks,
Andrew Pinski



 Thanks,
 Andrew Pinski



 --- gcc/lower-subreg.c.jj   2013-12-10 18:18:39.077943292 +0100
 +++ gcc/lower-subreg.c  2014-01-03 18:35:00.510418999 +0100
 @@ -1371,7 +1371,7 @@ dump_choices (bool speed_p, const char *
fprintf (dump_file, Choices when optimizing for %s:\n, description);

    for (i = 0; i < MAX_MACHINE_MODE; i++)
 -    if (GET_MODE_SIZE (i) > UNITS_PER_WORD)
 +    if (GET_MODE_SIZE ((enum machine_mode) i) > UNITS_PER_WORD)
fprintf (dump_file,   %s mode %s for copy lowering.\n,
choices[speed_p].move_modes_to_split[i]
? Splitting
 --- 

[Patch] Regex bracket matcher cache optimization

2014-01-03 Thread Tim Shen
The data structure _BracketMatcher (storing regex like [123a-z]) could
be quite slow mainly because of regex_traits. So a result of cache for
small range of inputs (char) sounds reasonable. It iterates all 256
inputs and calculates them at regex compile time.

Booted and tested under -m64 and -m32.

Thanks!


-- 
Regards,
Tim Shen
commit de1e43989be9c9dffebf488e118549cfe2987b9e
Author: tim timshe...@gmail.com
Date:   Fri Jan 3 11:43:12 2014 -0500

2014-01-03  Tim Shen  timshe...@gmail.com

	* include/bits/regex_compiler.h (_AnyMatcher::_AnyMatcher(),
	_AnyMatcher::operator(), _AnyMatcher::_M_apply(),
	_CharMatcher::_CharMatcher(), _CharMatcher::_M_translate(),
	_BracketMatcher::_BracketMatcher(), _BracketMatcher::operator(),
	_BracketMatcher::_M_add_char(),
	_BracketMatcher::_M_add_collating_element(),
	_BracketMatcher::_M_add_equivalence_class(),
	_BracketMatcher::_M_add_character_class(),
	_BracketMatcher::_M_make_range(), _BracketMatcher::_M_ready(),
	_BracketMatcher::_M_apply(), _BracketMatcher::_M_make_cache()):
	Fix _AnyMatcher behavior of POSIX style and move _M_flags
	to template parameter; Add cache for _BracketMatcher. Adjust
	declarations from here...
	* include/bits/regex.h (basic_regex::imbue()): ...to here. Also,
	imbuing a regex will trigger a recompilation to rebuild the cache.
	* include/bits/regex_compiler.tcc (_Compiler::_M_atom(),
	_Compiler::_M_bracket_expression()): Adjust matchers' caller for
	different template bool parameters.
	* include/bits/regex_executor.h: Remove unnecessary declarations.
	* include/std/regex: Adjust including orders.
	* testsuite/28_regex/traits/char/user_defined.cc: New.
	* testsuite/28_regex/traits/wchar_t/user_defined.cc: New.

diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h
index 4e091e0..ae8e1f5 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -30,6 +30,15 @@
 
 namespace std _GLIBCXX_VISIBILITY(default)
 {
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+  template<typename, typename>
+    class basic_regex;
+
+  template<typename, typename>
+    class match_results;
+
+_GLIBCXX_END_NAMESPACE_VERSION
+
 namespace __detail
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
@@ -48,6 +57,56 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 		  const basic_regex<_CharT, _TraitsT>& __re,
 		  regex_constants::match_flag_type __flags);
 
+  template<typename, typename, typename, bool>
+    class _Executor;
+
+  template<typename _Tp>
+    struct __has_contiguous_iter : std::false_type { };
+
+  template<typename _Ch, typename _Tr, typename _Alloc>
+    struct __has_contiguous_iter<std::basic_string<_Ch, _Tr, _Alloc>>
+    : std::true_type  // string<Ch> storage is contiguous
+    { };
+
+  template<typename _Tp, typename _Alloc>
+    struct __has_contiguous_iter<std::vector<_Tp, _Alloc>>
+    : std::true_type  // vector<Tp> storage is contiguous
+    { };
+
+  template<typename _Alloc>
+    struct __has_contiguous_iter<std::vector<bool, _Alloc>>
+    : std::false_type // vector<bool> storage is not contiguous
+    { };
+
+  template<typename _Tp>
+    struct __is_contiguous_normal_iter : std::false_type { };
+
+  template<typename _Tp, typename _Cont>
+    struct
+    __is_contiguous_normal_iter<__gnu_cxx::__normal_iterator<_Tp, _Cont>>
+    : __has_contiguous_iter<_Cont>::type
+    { };
+
+  template<typename _Iter, typename _TraitsT>
+    using __enable_if_contiguous_normal_iter
+      = typename enable_if< __is_contiguous_normal_iter<_Iter>::value,
+			    std::shared_ptr<_NFA<_TraitsT>> >::type;
+
+  template<typename _Iter, typename _TraitsT>
+    using __disable_if_contiguous_normal_iter
+      = typename enable_if< !__is_contiguous_normal_iter<_Iter>::value,
+			    std::shared_ptr<_NFA<_TraitsT>> >::type;
+
+  template<typename _FwdIter, typename _TraitsT>
+    __disable_if_contiguous_normal_iter<_FwdIter, _TraitsT>
+    __compile_nfa(_FwdIter __first, _FwdIter __last, const _TraitsT& __traits,
+		  regex_constants::syntax_option_type __flags);
+
+  template<typename _Iter, typename _TraitsT>
+    __enable_if_contiguous_normal_iter<_Iter, _TraitsT>
+    __compile_nfa(_Iter __first, _Iter __last, const _TraitsT& __traits,
+		  regex_constants::syntax_option_type __flags);
+
 _GLIBCXX_END_NAMESPACE_VERSION
 }
 
@@ -501,6 +560,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	basic_regex(_FwdIter __first, _FwdIter __last,
 		flag_type __f = ECMAScript)
 	: _M_flags(__f),
+	  _M_original_str(__first, __last),
 	  _M_automaton(__detail::__compile_nfa(__first, __last, _M_traits,
 	   _M_flags))
 	{ }
@@ -637,6 +697,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	   flag_type __flags = ECMAScript)
 	{
 	  _M_flags = __flags;
+	  _M_original_str.assign(__s.begin(), __s.end());
 	  _M_automaton = __detail::__compile_nfa(__s.begin(), __s.end(),
 		 _M_traits, _M_flags);
 	  return *this;
@@ -701,7 +762,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*/
   locale_type
   

Re: Fix IBM long double division inaccuracy (glibc bug 15396)

2014-01-03 Thread Alan Modra
On Thu, Jan 02, 2014 at 09:46:56PM +, Joseph S. Myers wrote:
 (Note that there remain other bugs in the IBM long double code, some
 causing glibc test failures, at least (a) invalid results in rounding
 modes other than FE_TONEAREST, (b) spurious overflow and underflow
 exceptions, mainly but not entirely where discontiguous mantissa bits
 are involved.)

Thing is, the algorithms in rs6000/ibm-ldouble.c require round to
nearest to generate correct results.  Quoting from the PowerPC64 ABI:

This Extended precision differs from the IEEE 754 Standard in the 
following ways:
 
 * The software support is restricted to round-to-nearest mode.  
   Programs that use extended precision must ensure that this rounding 
   mode is in effect when extended-precision calculations are performed.
 * Does not fully support the IEEE special numbers NaN and INF.  These
   values are encoded in the high-order double value only.  The 
   low-order value is not significant.  
 * Does not support the IEEE status flags for overflow, underflow, and 
   other conditions.  These flag have no meaning in this format.


-- 
Alan Modra
Australia Development Lab, IBM


Re: [patch] libbacktrace: add support for --disable-werror

2014-01-03 Thread Ian Lance Taylor
On Fri, Jan 3, 2014 at 11:19 AM, Mike Frysinger vap...@gentoo.org wrote:
 In the same vein as the other dirs, add a --disable-werror option to the
 libbacktrace dir to disable the explicit -Werror usage.

As far as I can see most of the target directories do not do this.
And libbacktrace is only turning on -Werror when it is a target
directory.  I'm not necessarily opposed to this, but I question the
premise: this is not like the other dirs, this is something new.

Ian



 2014-01-03  Mike Frysinger  dirtye...@gentoo.org

 * configure.ac: Add --enable-werror.
 (WARN_FLAGS): Use it.
 * configure: Regenerate.

 --- a/libbacktrace/configure.ac
 +++ a/libbacktrace/configure.ac
 @@ -132,8 +132,13 @@ ACX_PROG_CC_WARNING_OPTS([-W -Wall -Wwri
   -Wmissing-format-attribute -Wcast-qual],
   [WARN_FLAGS])

 +AC_ARG_ENABLE(werror, [AS_HELP_STRING([--enable-werror],
 +  [turns on -Werror 
 @:@default=yes@:@])])
 +
  if test -n "${with_target_subdir}"; then
 -  WARN_FLAGS="$WARN_FLAGS -Werror"
 +  if test "x$enable_werror" != xno; then
 +    WARN_FLAGS="$WARN_FLAGS -Werror"
 +  fi
  fi

  AC_SUBST(WARN_FLAGS)
 --- a/libbacktrace/configure
 +++ a/libbacktrace/configure
 @@ -730,6 +730,7 @@ enable_fast_install
  with_gnu_ld
  enable_libtool_lock
  enable_multilib
 +enable_werror
  with_system_libunwind
  '
ac_precious_vars='build_alias
 @@ -1370,6 +1371,7 @@ Optional Features:
optimize for fast installation [default=yes]
--disable-libtool-lock  avoid locking (might break parallel builds)
--enable-multilib   build many library versions (default)
 +  --enable-werror turns on -Werror [default=yes]

  Optional Packages:
--with-PACKAGE[=ARG]use PACKAGE [ARG=yes]
 @@ -11614,8 +11616,16 @@ fi
  CFLAGS=$save_CFLAGS


 +# Check whether --enable-werror was given.
 +if test ${enable_werror+set} = set; then :
 +  enableval=$enable_werror;
 +fi
 +
 +
  if test -n "${with_target_subdir}"; then
 -  WARN_FLAGS="$WARN_FLAGS -Werror"
 +  if test "x$enable_werror" != xno; then
 +    WARN_FLAGS="$WARN_FLAGS -Werror"
 +  fi
  fi