Hi! I've noticed vector extraction generates terrible code with -mavx{,2} when extracting something from 32-byte vectors. Everything is forced into memory, then loaded back.
For extraction of first lane I believe we can just use standard 128-bit extraction from the %xmmN register corresponding to %ymmN register containing the 256-bit vector, if the extraction is %v prefixed it shouldn't result in any penalty, right? For the second lane vextract{f,i}128 is used to swap the lanes first (well, before reload even the first lane is represented as extraction of the first lane, but post reload this is splitted into a subreg). >From what I understood, for vectors containing integers instead of floats AVX2 prefers vextracti128 instead of vextractf128, this patch teaches those patterns to do that. Should we do the same for vinsertf128 patterns with V*[QHSD]Imode 32-byte modes? The avx2_extracti128 pattern looked like wrong RTL, as to extract a 2 element vector from 4 element vector it used just one constant in the parallel instead of two. I've changed it into a define_expand. Still, for SSE4.1+ we seem to generate terrible code for float extraction of elements 1, 2 and 3 (and for 32-byte vectors also 5, 6 and 7 after the lanes are swapped). For -mno-sse4 we would be in those cases shuffling the vectors and then doing vec_extractv4sf_0 which is a nop. But for SSE4.1 we have: (define_insn "*sse4_1_extractps" [(set (match_operand:SF 0 "nonimmediate_operand" "=rm") (vec_select:SF (match_operand:V4SF 1 "register_operand" "x") (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))] "TARGET_SSE4_1" "%vextractps\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) which is fine if we want to extract into general register or memory, but if we want to extract the float into "x" constraint register, this results in spilling it and loading back immediately. I wonder if the above insn shouldn't have "=rm,x" alternative which would be splitted after reload into doing what ix86_expand_vector_extract does in that case for pre-SSE4.1 - i.e. some vector shuffling and the noop vec_extractv4sf_0-ish SUBREGing. Thoughts? 2011-09-16 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (vec_extract_hi_<mode>, vec_extract_hi_v16hi, vec_extract_hi_v32qi): Use vextracti128 instead of vextractf128 for -mavx2 and integer vectors. For V4DFmode fix up mode attribute. (VEC_EXTRACT_MODE): For TARGET_AVX add 32-byte vectors. (vec_set_lo_<mode>, vec_set_hi_<mode>): For VI8F_256 modes use V4DF instead of V8SF mode attribute. (avx2_extracti128): Change into define_expand. * config/i386/i386.c (ix86_expand_vector_extract): Handle 32-byte vector modes if TARGET_AVX. * gcc.target/i386/sse2-extract-1.c: New test. * gcc.target/i386/avx-extract-1.c: New test. --- gcc/config/i386/sse.md.jj 2011-09-15 17:36:20.000000000 +0200 +++ gcc/config/i386/sse.md 2011-09-16 10:51:51.000000000 +0200 @@ -3863,13 +3863,23 @@ (define_insn "vec_extract_hi_<mode>" (match_operand:VI8F_256 1 "register_operand" "x,x") (parallel [(const_int 2) (const_int 3)])))] "TARGET_AVX" - "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" +{ + if (get_attr_mode (insn) == MODE_OI) + return "vextracti128\t{$0x1, %1, %0|%0, %1, 0x1}"; + else + return "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}"; +} [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "memory" "none,store") (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) + (set (attr "mode") + (if_then_else + (and (match_test "TARGET_AVX2") + (eq (const_string "<MODE>mode") (const_string "V4DImode"))) + (const_string "OI") + (const_string "V4DF")))]) (define_insn_and_split "vec_extract_lo_<mode>" [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=x,m") @@ -3898,13 +3908,23 @@ (define_insn "vec_extract_hi_<mode>" (parallel [(const_int 4) (const_int 5) (const_int 6) (const_int 7)])))] "TARGET_AVX" - "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" +{ + if (get_attr_mode (insn) == MODE_OI) + return "vextracti128\t{$0x1, %1, %0|%0, %1, 0x1}"; + else + return "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}"; +} [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "memory" "none,store") (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) + (set (attr "mode") + (if_then_else + (and (match_test "TARGET_AVX2") + (eq (const_string "<MODE>mode") (const_string "V8SImode"))) + (const_string "OI") + (const_string "V8SF")))]) (define_insn_and_split "vec_extract_lo_v16hi" [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m") @@ -3937,13 +3957,21 @@ (define_insn "vec_extract_hi_v16hi" (const_int 12) (const_int 13) (const_int 14) (const_int 15)])))] "TARGET_AVX" - "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" +{ + if (get_attr_mode (insn) == MODE_OI) + return "vextracti128\t{$0x1, %1, %0|%0, %1, 0x1}"; + else + return "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}"; +} [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "memory" "none,store") (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX2") + (const_string "OI") + (const_string "V8SF")))]) (define_insn_and_split "vec_extract_lo_v32qi" [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") @@ -3984,13 +4012,21 @@ (define_insn "vec_extract_hi_v32qi" (const_int 28) (const_int 29) (const_int 30) (const_int 31)])))] "TARGET_AVX" - "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" +{ + if (get_attr_mode (insn) == MODE_OI) + return "vextracti128\t{$0x1, %1, %0|%0, %1, 0x1}"; + else + return "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}"; +} [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "memory" "none,store") (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX2") + (const_string "OI") + (const_string "V8SF")))]) (define_insn "*sse4_1_extractps" [(set (match_operand:SF 0 "nonimmediate_operand" "=rm") @@ -4024,7 +4060,10 @@ (define_insn_and_split "*vec_extract_v4s ;; Modes handled by vec_extract patterns. (define_mode_iterator VEC_EXTRACT_MODE - [V16QI V8HI V4SI V2DI + [(V32QI "TARGET_AVX") V16QI + (V16HI "TARGET_AVX") V8HI + (V8SI "TARGET_AVX") V4SI + (V4DI "TARGET_AVX") V2DI (V8SF "TARGET_AVX") V4SF (V4DF "TARGET_AVX") V2DF]) @@ -11952,7 +11991,7 @@ (define_insn "vec_set_lo_<mode>" (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) + (set_attr "mode" "V4DF")]) (define_insn "vec_set_hi_<mode>" [(set (match_operand:VI8F_256 0 "register_operand" "=x") @@ -11967,7 +12006,7 @@ (define_insn "vec_set_hi_<mode>" (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) + (set_attr "mode" "V4DF")]) (define_insn "vec_set_lo_<mode>" [(set (match_operand:VI4F_256 0 "register_operand" "=x") @@ -12158,17 +12197,29 @@ (define_expand "vec_init<mode>" DONE; }) -(define_insn "avx2_extracti128" - [(set (match_operand:V2DI 0 "register_operand" "=x") - (vec_select:V2DI - (match_operand:V4DI 1 "nonimmediate_operand" "xm") - (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")])))] +(define_expand "avx2_extracti128" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4DI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_0_to_1_operand" "")] "TARGET_AVX2" - "vextracti128\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "ssemov") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "vex") - (set_attr "mode" "OI")]) +{ + rtx (*insn)(rtx, rtx); + + switch (INTVAL (operands[3])) + { + case 0: + insn = gen_vec_extract_lo_v4di; + break; + case 1: + insn = gen_vec_extract_hi_v4di; + break; + default: + gcc_unreachable (); + } + + emit_insn (insn (operands[0], operands[1])); + DONE; +}) (define_expand "avx2_inserti128" [(match_operand:V4DI 0 "register_operand" "") --- gcc/config/i386/i386.c.jj 2011-09-15 19:27:03.000000000 +0200 +++ gcc/config/i386/i386.c 2011-09-16 09:37:43.000000000 +0200 @@ -32592,6 +32592,84 @@ ix86_expand_vector_extract (bool mmx_ok, use_vec_extr = TARGET_SSE4_1; break; + case V8SFmode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V4SFmode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8sf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8sf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + } + break; + + case V4DFmode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V2DFmode); + if (elt < 2) + emit_insn (gen_vec_extract_lo_v4df (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v4df (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 1); + return; + } + break; + + case V32QImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V16QImode); + if (elt < 16) + emit_insn (gen_vec_extract_lo_v32qi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v32qi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 15); + return; + } + break; + + case V16HImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V8HImode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16hi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16hi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + } + break; + + case V8SImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V4SImode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8si (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8si (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + } + break; + + case V4DImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V2DImode); + if (elt < 2) + emit_insn (gen_vec_extract_lo_v4di (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v4di (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 1); + return; + } + break; + case V8QImode: /* ??? Could extract the appropriate HImode element and shift. */ default: --- gcc/testsuite/gcc.target/i386/sse2-extract-1.c.jj 2011-09-16 10:41:45.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/sse2-extract-1.c 2011-09-16 10:41:55.000000000 +0200 @@ -0,0 +1,102 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2_runtime } */ + +extern void abort (void); +typedef unsigned long long uint64_t; + +#define vector(elcount, type) \ +__attribute__((vector_size((elcount)*sizeof(type)))) type + +#define FN(elcount, type, idx) \ +__attribute__((noinline, noclone)) \ +type f##type##elcount##_##idx (vector (elcount, type) x) { return x[idx] + 1; } +#define T2(elcount, type) \ + H (elcount, type) \ + F (elcount, type, 0) \ + F (elcount, type, 1) +#define T4(elcount, type) \ + T2 (elcount, type) \ + F (elcount, type, 2) \ + F (elcount, type, 3) +#define T8(elcount, type) \ + T4 (elcount, type) \ + F (elcount, type, 4) \ + F (elcount, type, 5) \ + F (elcount, type, 6) \ + F (elcount, type, 7) +#define T16(elcount, type) \ + T8 (elcount, type) \ + F (elcount, type, 8) \ + F (elcount, type, 9) \ + F (elcount, type, 10) \ + F (elcount, type, 11) \ + F (elcount, type, 12) \ + F (elcount, type, 13) \ + F (elcount, type, 14) \ + F (elcount, type, 15) +#define T32(elcount, type) \ + T16 (elcount, type) \ + F (elcount, type, 16) \ + F (elcount, type, 17) \ + F (elcount, type, 18) \ + F (elcount, type, 19) \ + F (elcount, type, 20) \ + F (elcount, type, 21) \ + F (elcount, type, 22) \ + F (elcount, type, 23) \ + F (elcount, type, 24) \ + F (elcount, type, 25) \ + F (elcount, type, 26) \ + F (elcount, type, 27) \ + F (elcount, type, 28) \ + F (elcount, type, 29) \ + F (elcount, type, 30) \ + F (elcount, type, 31) +#define TESTS_SSE2 \ +T2 (2, double) E \ +T2 (2, uint64_t) E \ +T4 (4, float) E \ +T4 (4, int) E \ +T8 (8, short) E \ +T16 (16, char) E +#define TESTS_AVX \ +T4 (4, double) E \ +T4 (4, uint64_t) E \ +T8 (8, float) E \ +T8 (8, int) E \ +T16 (16, short) E \ +T32 (32, char) E +#ifdef __AVX__ +#define TESTS TESTS_SSE2 TESTS_AVX +#else +#define TESTS TESTS_SSE2 +#endif + +#define F FN +#define H(elcount, type) +#define E +TESTS + +int +main () +{ +#undef F +#undef H +#undef E +#define H(elcount, type) \ + vector (elcount, type) v##type##elcount = { +#define E }; +#define F(elcount, type, idx) idx + 1, + TESTS +#undef F +#undef H +#undef E +#define H(elcount, type) +#define E +#define F(elcount, type, idx) \ + if (f##type##elcount##_##idx (v##type##elcount) != idx + 2) \ + abort (); + TESTS + return 0; +} --- gcc/testsuite/gcc.target/i386/avx-extract-1.c.jj 2011-09-16 10:44:19.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx-extract-1.c 2011-09-16 10:44:58.000000000 +0200 @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-require-effective-target avx_runtime } */ + +#include "sse2-extract-1.c" Jakub