Re: [RFC, RFH PATCH, i386] Fix gcc.target/i386/pr61403.c FAIL with -mavx2

2014-10-03 Thread Jakub Jelinek
On Thu, Oct 02, 2014 at 08:34:40PM +0200, Uros Bizjak wrote:
 Index: i386.c
 ===
 --- i386.c  (revision 215802)
 +++ i386.c  (working copy)
 @@ -43407,8 +43407,10 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d
   AVX and AVX2 as they require more than 2 instructions.  */
if (d-one_operand_p)
  return false;
 -  if (TARGET_SSE4_1  GET_MODE_SIZE (vmode) == 16)
 +  if (TARGET_AVX2  GET_MODE_SIZE (vmode) == 32)
  ;
 +  else if (TARGET_SSE4_1  GET_MODE_SIZE (vmode) == 16)
 +;
else
  return false;
 
 --cut here--
 
 The comment above expand_vec_perm_pblendv claims that:
 
   /* Use the same checks as in expand_vec_perm_blend, but skipping
  AVX and AVX2 as they require more than 2 instructions.  */

The comment is mostly right though, I'd say as they sometimes require
more than 2 instructions.

 BTW: I have no access to avx2 target, so I can't test the patch with a
 runtime tests. OTOH, it doesn't ICE for GCC_TEST_RUN_EXPENSIVE=1 make
 check-gcc RUNTESTFLAGS='--target_board=unix/-mavx2
 dg-torture.exp=vshuf*.c'.

Even the expensive testsuite has very limited coverage.
As I wanted to prove your patch will ICE, I wrote following generator:

#ifndef ITYPE
#define ITYPE TYPE
#endif
#define S2(X) #X
#define S(X) S2(X)

int
main ()
{
  int i, j, nelt = 32 / sizeof (TYPE);
  printf (
typedef  S(TYPE)  V __attribute__ ((vector_size (32)));\n
typedef  S(ITYPE)  VI __attribute__ ((vector_size (32)));\n
V a, b, c;\n
\n
#define T(n, m...) void foo##n (void) { c = __builtin_shuffle (a, b, (VI) m); 
}\n
#define S(n, m...) T(n, m)\n);
  for (i = 0; i  10; i++)
{
  printf (S (__LINE__, { );
  for (j = 0; j  nelt; j++)
{
  int k = random ()  3;
  int v = j;
  if (k  1)
v = ((k  2) ? nelt : 0) + (random ()  (nelt - 1));
  printf (%d%s, v, j  (nelt - 1) ? ,  :  })\n);
}
}
}

which can be compiled e.g. with
-DTYPE=char
-DTYPE=short
-DTYPE=int
-DTYPE=long
-DTYPE=float -DITYPE=int
-DTYPE=double -DITYPE=long
and then in each case generate 10 tests (sort -u on it plus manual fixup
can decrease that, for the V4DI/V4DF cases substantially).  The first one
triggered almost immediately an ICE, added to vshuf-32.inc (non-expensive).

With the following updated patch all those generated testcases don't ICE
(-mavx2 for the first four, -mavx for the last two).

Also tested with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc 
RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'

The pr61403.c testcase can be simplified into:
typedef float V __attribute__ ((vector_size (32)));
typedef int VI __attribute__ ((vector_size (32)));
V a, b, c;

#define T(n, m...) void foo##n (void) { c = __builtin_shuffle (a, b, (VI) m); }
T (0, { 0, 1, 2, 3, 4, 5, 10, 13 })
T (1, { 0, 1, 2, 3, 4, 8, 11, 14 })
T (2, { 0, 1, 2, 3, 4, 9, 12, 15 })
T (3, { 0, 13, 2, 3, 14, 5, 6, 15 })
T (4, { 0, 1, 8, 3, 4, 9, 6, 7 })
T (5, { 0, 3, 11, 0, 4, 12, 0, 5 })
T (6, { 0, 3, 6, 9, 12, 15, 0, 0 })
T (7, { 0, 8, 0, 1, 9, 0, 2, 10 })
T (8, { 10, 1, 2, 11, 4, 5, 12, 7 })
T (9, { 13, 0, 6, 14, 0, 7, 15, 0 })
T (10, { 1, 4, 7, 10, 13, 0, 0, 0 })
T (11, { 2, 5, 8, 11, 14, 0, 0, 0 })
permutations, where both your and my patch optimize
foo{0,1,2,3,4,8}.

2014-10-03  Jakub Jelinek  ja...@redhat.com
Uros Bizjak  ubiz...@gmail.com

PR tree-optimization/61403
* config/i386/i386.c (expand_vec_perm_palignr): Fix a spelling
error in comment.  Also optimize 256-bit vectors for AVX2
or AVX (floating vectors only), provided the first permutation
can be performed in one insn.

* gcc.dg/torture/vshuf-32.inc: Add a new test 29.

--- gcc/config/i386/i386.c.jj   2014-10-03 09:26:14.0 +0200
+++ gcc/config/i386/i386.c  2014-10-03 12:39:24.040185310 +0200
@@ -43422,7 +43422,7 @@ expand_vec_perm_palignr (struct expand_v
 
 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
the permutation using the SSE4_1 pblendv instruction.  Potentially
-   reduces permutaion from 2 pshufb and or to 1 pshufb and pblendv.  */
+   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */
 
 static bool
 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
@@ -43432,11 +43432,14 @@ expand_vec_perm_pblendv (struct expand_v
   enum machine_mode vmode = d-vmode;
   bool ok;
 
-  /* Use the same checks as in expand_vec_perm_blend, but skipping
- AVX and AVX2 as they require more than 2 instructions.  */
+  /* Use the same checks as in expand_vec_perm_blend.  */
   if (d-one_operand_p)
 return false;
-  if (TARGET_SSE4_1  GET_MODE_SIZE (vmode) == 16)
+  if (TARGET_AVX2  GET_MODE_SIZE (vmode) == 32)
+;
+  else if (TARGET_AVX  (vmode == V4DFmode || vmode == V8SFmode))
+;
+  else if (TARGET_SSE4_1  GET_MODE_SIZE (vmode) == 16)
 ;
   else
 return false;
@@ -43458,7 +43461,7 @@ expand_vec_perm_pblendv (struct 

Re: [RFC, RFH PATCH, i386] Fix gcc.target/i386/pr61403.c FAIL with -mavx2

2014-10-03 Thread Uros Bizjak
On Fri, Oct 3, 2014 at 1:11 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Thu, Oct 02, 2014 at 08:34:40PM +0200, Uros Bizjak wrote:
 Index: i386.c
 ===
 --- i386.c  (revision 215802)
 +++ i386.c  (working copy)
 @@ -43407,8 +43407,10 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d
   AVX and AVX2 as they require more than 2 instructions.  */
if (d-one_operand_p)
  return false;
 -  if (TARGET_SSE4_1  GET_MODE_SIZE (vmode) == 16)
 +  if (TARGET_AVX2  GET_MODE_SIZE (vmode) == 32)
  ;
 +  else if (TARGET_SSE4_1  GET_MODE_SIZE (vmode) == 16)
 +;
else
  return false;

 --cut here--

 The comment above expand_vec_perm_pblendv claims that:

   /* Use the same checks as in expand_vec_perm_blend, but skipping
  AVX and AVX2 as they require more than 2 instructions.  */

 The comment is mostly right though, I'd say as they sometimes require
 more than 2 instructions.

 BTW: I have no access to avx2 target, so I can't test the patch with a
 runtime tests. OTOH, it doesn't ICE for GCC_TEST_RUN_EXPENSIVE=1 make
 check-gcc RUNTESTFLAGS='--target_board=unix/-mavx2
 dg-torture.exp=vshuf*.c'.

 Even the expensive testsuite has very limited coverage.
 As I wanted to prove your patch will ICE, I wrote following generator:

 #ifndef ITYPE
 #define ITYPE TYPE
 #endif
 #define S2(X) #X
 #define S(X) S2(X)

 int
 main ()
 {
   int i, j, nelt = 32 / sizeof (TYPE);
   printf (
 typedef  S(TYPE)  V __attribute__ ((vector_size (32)));\n
 typedef  S(ITYPE)  VI __attribute__ ((vector_size (32)));\n
 V a, b, c;\n
 \n
 #define T(n, m...) void foo##n (void) { c = __builtin_shuffle (a, b, (VI) 
 m); }\n
 #define S(n, m...) T(n, m)\n);
   for (i = 0; i  10; i++)
 {
   printf (S (__LINE__, { );
   for (j = 0; j  nelt; j++)
 {
   int k = random ()  3;
   int v = j;
   if (k  1)
 v = ((k  2) ? nelt : 0) + (random ()  (nelt - 1));
   printf (%d%s, v, j  (nelt - 1) ? ,  :  })\n);
 }
 }
 }

 which can be compiled e.g. with
 -DTYPE=char
 -DTYPE=short
 -DTYPE=int
 -DTYPE=long
 -DTYPE=float -DITYPE=int
 -DTYPE=double -DITYPE=long
 and then in each case generate 10 tests (sort -u on it plus manual fixup
 can decrease that, for the V4DI/V4DF cases substantially).  The first one
 triggered almost immediately an ICE, added to vshuf-32.inc (non-expensive).

 With the following updated patch all those generated testcases don't ICE
 (-mavx2 for the first four, -mavx for the last two).

 Also tested with
 GCC_TEST_RUN_EXPENSIVE=1 make check-gcc 
 RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'

I have had some problems testing with TARGET_AVX part of the change
and /-mavx tests. I assume that your patch survives these tests.

 The pr61403.c testcase can be simplified into:
 typedef float V __attribute__ ((vector_size (32)));
 typedef int VI __attribute__ ((vector_size (32)));
 V a, b, c;

 #define T(n, m...) void foo##n (void) { c = __builtin_shuffle (a, b, (VI) m); 
 }
 T (0, { 0, 1, 2, 3, 4, 5, 10, 13 })
 T (1, { 0, 1, 2, 3, 4, 8, 11, 14 })
 T (2, { 0, 1, 2, 3, 4, 9, 12, 15 })
 T (3, { 0, 13, 2, 3, 14, 5, 6, 15 })
 T (4, { 0, 1, 8, 3, 4, 9, 6, 7 })
 T (5, { 0, 3, 11, 0, 4, 12, 0, 5 })
 T (6, { 0, 3, 6, 9, 12, 15, 0, 0 })
 T (7, { 0, 8, 0, 1, 9, 0, 2, 10 })
 T (8, { 10, 1, 2, 11, 4, 5, 12, 7 })
 T (9, { 13, 0, 6, 14, 0, 7, 15, 0 })
 T (10, { 1, 4, 7, 10, 13, 0, 0, 0 })
 T (11, { 2, 5, 8, 11, 14, 0, 0, 0 })
 permutations, where both your and my patch optimize
 foo{0,1,2,3,4,8}.

 2014-10-03  Jakub Jelinek  ja...@redhat.com
 Uros Bizjak  ubiz...@gmail.com

 PR tree-optimization/61403
 * config/i386/i386.c (expand_vec_perm_palignr): Fix a spelling
 error in comment.  Also optimize 256-bit vectors for AVX2
 or AVX (floating vectors only), provided the first permutation
 can be performed in one insn.

 * gcc.dg/torture/vshuf-32.inc: Add a new test 29.

OK if the patch bootstraps and regtests without problems.

Thanks,
Uros.


[RFC, RFH PATCH, i386] Fix gcc.target/i386/pr61403.c FAIL with -mavx2

2014-10-02 Thread Uros Bizjak
On Wed, Oct 1, 2014 at 9:03 PM, Uros Bizjak ubiz...@gmail.com wrote:

 And now the expand_vec_perm_palignr improvement, tested
 with GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
 RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
 E.g.
 typedef unsigned long long V __attribute__ ((vector_size (32)));
 extern void abort (void);
 V a, b, c, d;
 void test_14 (void)
 {
   V mask = { 6, 1, 3, 4 };
   int i;
   c = __builtin_shuffle (a, mask);
   d = __builtin_shuffle (a, b, mask);
 }
 (distilled from test 15 in vshuf-v4di.c) results in:
 -   vmovdqa a(%rip), %ymm0
 -   vpermq  $54, %ymm0, %ymm1
 -   vpshufb .LC1(%rip), %ymm0, %ymm0
 -   vmovdqa %ymm1, c(%rip)
 -   vmovdqa b(%rip), %ymm1
 -   vpshufb .LC0(%rip), %ymm1, %ymm1
 -   vpermq  $78, %ymm1, %ymm1
 -   vpor%ymm1, %ymm0, %ymm0
 +   vmovdqa a(%rip), %ymm1
 +   vpermq  $54, %ymm1, %ymm0
 +   vmovdqa %ymm0, c(%rip)
 +   vmovdqa b(%rip), %ymm0
 +   vpalignr$8, %ymm1, %ymm0, %ymm0
 +   vpermq  $99, %ymm0, %ymm0
 vmovdqa %ymm0, d(%rip)
 vzeroupper
 ret
 change (and two fewer .rodata constants).

 On a related note, I would like to point out that
 gcc.target/i386/pr61403.c also fails to generate blend insn with
 -mavx2. The new insn sequence includes lots of new vpshufb insns with
 memory access.

Following patch fixes the failure:

--cut here--
Index: i386.c
===
--- i386.c  (revision 215802)
+++ i386.c  (working copy)
@@ -43407,8 +43407,10 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d
  AVX and AVX2 as they require more than 2 instructions.  */
   if (d-one_operand_p)
 return false;
-  if (TARGET_SSE4_1  GET_MODE_SIZE (vmode) == 16)
+  if (TARGET_AVX2  GET_MODE_SIZE (vmode) == 32)
 ;
+  else if (TARGET_SSE4_1  GET_MODE_SIZE (vmode) == 16)
+;
   else
 return false;

--cut here--

The comment above expand_vec_perm_pblendv claims that:

  /* Use the same checks as in expand_vec_perm_blend, but skipping
 AVX and AVX2 as they require more than 2 instructions.  */

However, I see a significant reduction in vpshufb and vpor
instructions (33-16 and 22-11), and 6 new vblendps insns.

BTW: I have no access to avx2 target, so I can't test the patch with a
runtime tests. OTOH, it doesn't ICE for GCC_TEST_RUN_EXPENSIVE=1 make
check-gcc RUNTESTFLAGS='--target_board=unix/-mavx2
dg-torture.exp=vshuf*.c'.

Jakub, what do you think?

Uros.