Hi!

I've noticed vector extraction generates terrible code
with -mavx{,2} when extracting something from 32-byte vectors.
Everything is forced into memory, then loaded back.

For extraction of first lane I believe we can just use
standard 128-bit extraction from the %xmmN register corresponding
to %ymmN register containing the 256-bit vector, if the extraction
is %v prefixed it shouldn't result in any penalty, right?
For the second lane vextract{f,i}128 is used to swap the lanes
first (well, before reload even the first lane is represented as
extraction of the first lane, but post reload this is splitted
into a subreg).

>From what I understood, for vectors containing integers instead of
floats AVX2 prefers vextracti128 instead of vextractf128, this patch
teaches those patterns to do that.  Should we do the same for
vinsertf128 patterns with V*[QHSD]Imode 32-byte modes?

The avx2_extracti128 pattern looked like wrong RTL, as to extract
a 2 element vector from 4 element vector it used just one constant
in the parallel instead of two.  I've changed it into a define_expand.

Still, for SSE4.1+ we seem to generate terrible code for
float extraction of elements 1, 2 and 3 (and for 32-byte vectors
also 5, 6 and 7 after the lanes are swapped).  For -mno-sse4
we would be in those cases shuffling the vectors and then doing
vec_extractv4sf_0 which is a nop.  But for SSE4.1 we have:
(define_insn "*sse4_1_extractps"
  [(set (match_operand:SF 0 "nonimmediate_operand" "=rm")
        (vec_select:SF
          (match_operand:V4SF 1 "register_operand" "x")
          (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))]
  "TARGET_SSE4_1"
  "%vextractps\t{%2, %1, %0|%0, %1, %2}"
  [(set_attr "type" "sselog")
   (set_attr "prefix_data16" "1")
   (set_attr "prefix_extra" "1")
   (set_attr "length_immediate" "1")
   (set_attr "prefix" "maybe_vex")
   (set_attr "mode" "V4SF")])
which is fine if we want to extract into general register or memory,
but if we want to extract the float into "x" constraint register, this
results in spilling it and loading back immediately.  I wonder if
the above insn shouldn't have "=rm,x" alternative which would be splitted
after reload into doing what ix86_expand_vector_extract does in that
case for pre-SSE4.1 - i.e. some vector shuffling and the noop
vec_extractv4sf_0-ish SUBREGing.

Thoughts?

2011-09-16  Jakub Jelinek  <ja...@redhat.com>

        * config/i386/sse.md (vec_extract_hi_<mode>,
        vec_extract_hi_v16hi, vec_extract_hi_v32qi): Use
        vextracti128 instead of vextractf128 for -mavx2 and
        integer vectors.  For V4DFmode fix up mode attribute.
        (VEC_EXTRACT_MODE): For TARGET_AVX add 32-byte vectors.
        (vec_set_lo_<mode>, vec_set_hi_<mode>): For VI8F_256 modes use V4DF
        instead of V8SF mode attribute.
        (avx2_extracti128): Change into define_expand.
        * config/i386/i386.c (ix86_expand_vector_extract): Handle
        32-byte vector modes if TARGET_AVX.

        * gcc.target/i386/sse2-extract-1.c: New test.
        * gcc.target/i386/avx-extract-1.c: New test.

--- gcc/config/i386/sse.md.jj   2011-09-15 17:36:20.000000000 +0200
+++ gcc/config/i386/sse.md      2011-09-16 10:51:51.000000000 +0200
@@ -3863,13 +3863,23 @@ (define_insn "vec_extract_hi_<mode>"
          (match_operand:VI8F_256 1 "register_operand" "x,x")
          (parallel [(const_int 2) (const_int 3)])))]
   "TARGET_AVX"
-  "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}"
+{
+  if (get_attr_mode (insn) == MODE_OI)
+    return "vextracti128\t{$0x1, %1, %0|%0, %1, 0x1}";
+  else
+    return "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "memory" "none,store")
    (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
+   (set (attr "mode")
+     (if_then_else
+       (and (match_test "TARGET_AVX2")
+           (eq (const_string "<MODE>mode") (const_string "V4DImode")))
+     (const_string "OI")
+     (const_string "V4DF")))])
 
 (define_insn_and_split "vec_extract_lo_<mode>"
   [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=x,m")
@@ -3898,13 +3908,23 @@ (define_insn "vec_extract_hi_<mode>"
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))]
   "TARGET_AVX"
-  "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}"
+{
+  if (get_attr_mode (insn) == MODE_OI)
+    return "vextracti128\t{$0x1, %1, %0|%0, %1, 0x1}";
+  else
+    return "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "memory" "none,store")
    (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
+   (set (attr "mode")
+     (if_then_else
+       (and (match_test "TARGET_AVX2")
+           (eq (const_string "<MODE>mode") (const_string "V8SImode")))
+     (const_string "OI")
+     (const_string "V8SF")))])
 
 (define_insn_and_split "vec_extract_lo_v16hi"
   [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m")
@@ -3937,13 +3957,21 @@ (define_insn "vec_extract_hi_v16hi"
                     (const_int 12) (const_int 13)
                     (const_int 14) (const_int 15)])))]
   "TARGET_AVX"
-  "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}"
+{
+  if (get_attr_mode (insn) == MODE_OI)
+    return "vextracti128\t{$0x1, %1, %0|%0, %1, 0x1}";
+  else
+    return "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "memory" "none,store")
    (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
+   (set (attr "mode")
+     (if_then_else (match_test "TARGET_AVX2")
+   (const_string "OI")
+   (const_string "V8SF")))])
 
 (define_insn_and_split "vec_extract_lo_v32qi"
   [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m")
@@ -3984,13 +4012,21 @@ (define_insn "vec_extract_hi_v32qi"
                     (const_int 28) (const_int 29)
                     (const_int 30) (const_int 31)])))]
   "TARGET_AVX"
-  "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}"
+{
+  if (get_attr_mode (insn) == MODE_OI)
+    return "vextracti128\t{$0x1, %1, %0|%0, %1, 0x1}";
+  else
+    return "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "memory" "none,store")
    (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
+   (set (attr "mode")
+     (if_then_else (match_test "TARGET_AVX2")
+   (const_string "OI")
+   (const_string "V8SF")))])
 
 (define_insn "*sse4_1_extractps"
   [(set (match_operand:SF 0 "nonimmediate_operand" "=rm")
@@ -4024,7 +4060,10 @@ (define_insn_and_split "*vec_extract_v4s
 
 ;; Modes handled by vec_extract patterns.
 (define_mode_iterator VEC_EXTRACT_MODE
-  [V16QI V8HI V4SI V2DI
+  [(V32QI "TARGET_AVX") V16QI
+   (V16HI "TARGET_AVX") V8HI
+   (V8SI "TARGET_AVX") V4SI
+   (V4DI "TARGET_AVX") V2DI
    (V8SF "TARGET_AVX") V4SF
    (V4DF "TARGET_AVX") V2DF])
 
@@ -11952,7 +11991,7 @@ (define_insn "vec_set_lo_<mode>"
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
+   (set_attr "mode" "V4DF")])
 
 (define_insn "vec_set_hi_<mode>"
   [(set (match_operand:VI8F_256 0 "register_operand" "=x")
@@ -11967,7 +12006,7 @@ (define_insn "vec_set_hi_<mode>"
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
+   (set_attr "mode" "V4DF")])
 
 (define_insn "vec_set_lo_<mode>"
   [(set (match_operand:VI4F_256 0 "register_operand" "=x")
@@ -12158,17 +12197,29 @@ (define_expand "vec_init<mode>"
   DONE;
 })
 
-(define_insn "avx2_extracti128"
-  [(set (match_operand:V2DI 0 "register_operand" "=x")
-       (vec_select:V2DI
-         (match_operand:V4DI 1 "nonimmediate_operand" "xm")
-         (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")])))]
+(define_expand "avx2_extracti128"
+  [(match_operand:V2DI 0 "register_operand" "")
+   (match_operand:V4DI 1 "nonimmediate_operand" "")
+   (match_operand:SI 2 "const_0_to_1_operand" "")]
   "TARGET_AVX2"
-  "vextracti128\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemov")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "OI")])
+{
+  rtx (*insn)(rtx, rtx);
+
+  switch (INTVAL (operands[3]))
+    {
+    case 0:
+      insn = gen_vec_extract_lo_v4di;
+      break;
+    case 1:
+      insn = gen_vec_extract_hi_v4di;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (insn (operands[0], operands[1]));
+  DONE;
+})
 
 (define_expand "avx2_inserti128"
   [(match_operand:V4DI 0 "register_operand" "")
--- gcc/config/i386/i386.c.jj   2011-09-15 19:27:03.000000000 +0200
+++ gcc/config/i386/i386.c      2011-09-16 09:37:43.000000000 +0200
@@ -32592,6 +32592,84 @@ ix86_expand_vector_extract (bool mmx_ok,
       use_vec_extr = TARGET_SSE4_1;
       break;
 
+    case V8SFmode:
+      if (TARGET_AVX)
+       {
+         tmp = gen_reg_rtx (V4SFmode);
+         if (elt < 4)
+           emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
+         else
+           emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
+         ix86_expand_vector_extract (false, target, tmp, elt & 3);
+         return;
+       }
+      break;
+
+    case V4DFmode:
+      if (TARGET_AVX)
+       {
+         tmp = gen_reg_rtx (V2DFmode);
+         if (elt < 2)
+           emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
+         else
+           emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
+         ix86_expand_vector_extract (false, target, tmp, elt & 1);
+         return;
+       }
+      break;
+
+    case V32QImode:
+      if (TARGET_AVX)
+       {
+         tmp = gen_reg_rtx (V16QImode);
+         if (elt < 16)
+           emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
+         else
+           emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
+         ix86_expand_vector_extract (false, target, tmp, elt & 15);
+         return;
+       }
+      break;
+
+    case V16HImode:
+      if (TARGET_AVX)
+       {
+         tmp = gen_reg_rtx (V8HImode);
+         if (elt < 8)
+           emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
+         else
+           emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
+         ix86_expand_vector_extract (false, target, tmp, elt & 7);
+         return;
+       }
+      break;
+
+    case V8SImode:
+      if (TARGET_AVX)
+       {
+         tmp = gen_reg_rtx (V4SImode);
+         if (elt < 4)
+           emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
+         else
+           emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
+         ix86_expand_vector_extract (false, target, tmp, elt & 3);
+         return;
+       }
+      break;
+
+    case V4DImode:
+      if (TARGET_AVX)
+       {
+         tmp = gen_reg_rtx (V2DImode);
+         if (elt < 2)
+           emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
+         else
+           emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
+         ix86_expand_vector_extract (false, target, tmp, elt & 1);
+         return;
+       }
+      break;
+
     case V8QImode:
       /* ??? Could extract the appropriate HImode element and shift.  */
     default:
--- gcc/testsuite/gcc.target/i386/sse2-extract-1.c.jj   2011-09-16 
10:41:45.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-extract-1.c      2011-09-16 
10:41:55.000000000 +0200
@@ -0,0 +1,102 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2_runtime } */
+
+extern void abort (void);
+typedef unsigned long long uint64_t;
+
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define FN(elcount, type, idx) \
+__attribute__((noinline, noclone)) \
+type f##type##elcount##_##idx (vector (elcount, type) x) { return x[idx] + 1; }
+#define T2(elcount, type) \
+  H (elcount, type) \
+  F (elcount, type, 0) \
+  F (elcount, type, 1)
+#define T4(elcount, type) \
+  T2 (elcount, type) \
+  F (elcount, type, 2) \
+  F (elcount, type, 3)
+#define T8(elcount, type) \
+  T4 (elcount, type) \
+  F (elcount, type, 4) \
+  F (elcount, type, 5) \
+  F (elcount, type, 6) \
+  F (elcount, type, 7)
+#define T16(elcount, type) \
+  T8 (elcount, type) \
+  F (elcount, type, 8) \
+  F (elcount, type, 9) \
+  F (elcount, type, 10) \
+  F (elcount, type, 11) \
+  F (elcount, type, 12) \
+  F (elcount, type, 13) \
+  F (elcount, type, 14) \
+  F (elcount, type, 15)
+#define T32(elcount, type) \
+  T16 (elcount, type) \
+  F (elcount, type, 16) \
+  F (elcount, type, 17) \
+  F (elcount, type, 18) \
+  F (elcount, type, 19) \
+  F (elcount, type, 20) \
+  F (elcount, type, 21) \
+  F (elcount, type, 22) \
+  F (elcount, type, 23) \
+  F (elcount, type, 24) \
+  F (elcount, type, 25) \
+  F (elcount, type, 26) \
+  F (elcount, type, 27) \
+  F (elcount, type, 28) \
+  F (elcount, type, 29) \
+  F (elcount, type, 30) \
+  F (elcount, type, 31)
+#define TESTS_SSE2 \
+T2 (2, double) E \
+T2 (2, uint64_t) E \
+T4 (4, float) E \
+T4 (4, int) E \
+T8 (8, short) E \
+T16 (16, char) E
+#define TESTS_AVX \
+T4 (4, double) E \
+T4 (4, uint64_t) E \
+T8 (8, float) E \
+T8 (8, int) E \
+T16 (16, short) E \
+T32 (32, char) E
+#ifdef __AVX__
+#define TESTS TESTS_SSE2 TESTS_AVX
+#else
+#define TESTS TESTS_SSE2
+#endif
+
+#define F FN
+#define H(elcount, type)
+#define E
+TESTS
+
+int
+main ()
+{
+#undef F
+#undef H
+#undef E
+#define H(elcount, type) \
+  vector (elcount, type) v##type##elcount = {
+#define E };
+#define F(elcount, type, idx) idx + 1,
+  TESTS
+#undef F
+#undef H
+#undef E
+#define H(elcount, type)
+#define E
+#define F(elcount, type, idx) \
+  if (f##type##elcount##_##idx (v##type##elcount) != idx + 2) \
+    abort ();
+  TESTS
+  return 0;
+}
--- gcc/testsuite/gcc.target/i386/avx-extract-1.c.jj    2011-09-16 
10:44:19.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-extract-1.c       2011-09-16 
10:44:58.000000000 +0200
@@ -0,0 +1,5 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx_runtime } */
+
+#include "sse2-extract-1.c"

        Jakub

Reply via email to