Hi! Richard has asked me recently to look at V[24]TI vector extraction and initialization, which he wants to use from the vectorizer.
The following is an attempt to implement that. On the testcases included in the patch we usually get better or significantly better code generated, the exception is f1, where the change is: - movq %rdi, -32(%rsp) - movq %rsi, -24(%rsp) - movq %rdx, -16(%rsp) - movq %rcx, -8(%rsp) - vmovdqa -32(%rsp), %ymm0 + movq %rdi, -16(%rsp) + movq %rsi, -8(%rsp) + movq %rdx, -32(%rsp) + movq %rcx, -24(%rsp) + vmovdqa -32(%rsp), %xmm0 + vmovdqa -16(%rsp), %xmm1 + vinserti128 $0x1, %xmm0, %ymm1, %ymm0 which is something that is hard to handle before RA. If the RA would spill it the other way around, perhaps it would be solvable by transforming vmovdqa -32(%rsp), %xmm1 vmovdqa -16(%rsp), %xmm0 vinserti128 $0x01, %xmm0, %ymm1, %ymm0 into vmovdqa -32(%rsp), %ymm0 using peephole2, but no idea how to force it that way. And f11 also has a similar problem, that time with 3 extra insns. But if the TImode variable is allocated in a %?mm* register, we get better code even in those cases. For V4TImode perhaps we could improve some special cases of vec_initv4ti, like broadcast or only one variable otherwise everything constant, but at least for the broadcast I'm not really sure what is the optimal sequence. vbroadcasti32x4 is only able to broadcast from memory, which is good if the TImode input lives in memory, but if it doesn't? __builtin_shuffle right now generates vpermq with the indices loaded from memory, but that needs to wait for the memory load... Another thing is that we actually don't permit a normal move instruction for V4TImode unless AVX512BW, so we used to generate terrible code (spill it into memory using GPRs and then load back). Any reason for that? I've found: https://gcc.gnu.org/ml/gcc-patches/2014-08/msg01465.html > > > - (V2TI "TARGET_AVX") V1TI > > > + (V4TI "TARGET_AVX") (V2TI "TARGET_AVX") V1TI > > > > Are you sure TARGET_AVX is the correct condition for V4TI? > Right! This should be TARGET_AVX512BW (because corresponding shifts > belong to AVX-512BW). 
but it isn't at all clear what shifts this is talking about. This is VMOVE, which is used just in mov<mode>, mov<mode>_internal and movmisalign<mode> patterns, I fail to see what kind of shifts those would produce. Those should only produce vmovdqa64, vmovdqu64, vpxord or vpternlogd insns with %zmm* operands, those are all AVX512F already. Anyway, bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? Maybe it would be nice to also improve bitwise logical operations on V2TI/V4TImode - probably just expanders like {and,ior,xor}v[24]ti and maybe __builtin_shuffle. Richard also talked about V2OImode support, but I'm afraid that is going to be way too hard, we don't really have OImode support in most places. 2017-07-20 Jakub Jelinek <ja...@redhat.com> PR target/80846 * config/i386/i386.c (ix86_expand_vector_init_general): Handle V2TImode and V4TImode. (ix86_expand_vector_extract): Likewise. * config/i386/sse.md (VMOVE): Enable V4TImode even for just TARGET_AVX512F, instead of only for TARGET_AVX512BW. (ssescalarmode): Handle V4TImode and V2TImode. (VEC_EXTRACT_MODE): Add V4TImode and V2TImode. (*vec_extractv2ti, *vec_extractv4ti): New insns. (VEXTRACTI128_MODE): New mode iterator. (splitter for *vec_extractv?ti first element): New. (VEC_INIT_MODE): New mode iterator. (vec_init<mode>): Consolidate 3 expanders into one using VEC_INIT_MODE mode iterator. * gcc.target/i386/avx-pr80846.c: New test. * gcc.target/i386/avx2-pr80846.c: New test. * gcc.target/i386/avx512f-pr80846.c: New test. 
--- gcc/config/i386/i386.c.jj 2017-07-17 10:08:39.000000000 +0200 +++ gcc/config/i386/i386.c 2017-07-19 12:59:47.242174431 +0200 @@ -44118,6 +44118,26 @@ ix86_expand_vector_init_general (bool mm ix86_expand_vector_init_concat (mode, target, ops, n); return; + case V2TImode: + for (i = 0; i < 2; i++) + ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); + op0 = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); + emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); + return; + + case V4TImode: + for (i = 0; i < 4; i++) + ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); + ops[4] = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); + ops[5] = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); + op0 = gen_reg_rtx (V8DImode); + ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); + emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); + return; + case V32QImode: half_mode = V16QImode; goto half; @@ -44659,6 +44679,8 @@ ix86_expand_vector_extract (bool mmx_ok, case V2DFmode: case V2DImode: + case V2TImode: + case V4TImode: use_vec_extr = true; break; --- gcc/config/i386/sse.md.jj 2017-07-06 20:31:45.000000000 +0200 +++ gcc/config/i386/sse.md 2017-07-19 19:02:08.884640151 +0200 @@ -175,7 +175,7 @@ (define_mode_iterator VMOVE (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI - (V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX") V1TI + (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX") V1TI (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) @@ -687,7 +687,8 @@ (define_mode_attr ssescalarmode (V16SI "SI") (V8SI "SI") (V4SI "SI") (V8DI "DI") (V4DI "DI") (V2DI "DI") (V16SF "SF") (V8SF "SF") (V4SF "SF") - (V8DF "DF") (V4DF "DF") (V2DF "DF")]) + (V8DF "DF") (V4DF "DF") (V2DF "DF") + (V4TI "TI") (V2TI "TI")]) 
;; Mapping of vector modes to the 128bit modes (define_mode_attr ssexmmmode @@ -6920,15 +6921,6 @@ (define_insn "*vec_concatv4sf" (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) -(define_expand "vec_init<mode>" - [(match_operand:V_128 0 "register_operand") - (match_operand 1)] - "TARGET_SSE" -{ - ix86_expand_vector_init (false, operands[0], operands[1]); - DONE; -}) - ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.c (define_insn "vec_set<mode>_0" @@ -7886,7 +7878,8 @@ (define_mode_iterator VEC_EXTRACT_MODE (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF - (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF + (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) (define_expand "vec_extract<mode>" [(match_operand:<ssescalarmode> 0 "register_operand") @@ -13734,6 +13727,50 @@ (define_split operands[1] = adjust_address (operands[1], <ssescalarmode>mode, offs); }) +(define_insn "*vec_extractv2ti" + [(set (match_operand:TI 0 "nonimmediate_operand" "=xm,vm") + (vec_select:TI + (match_operand:V2TI 1 "register_operand" "x,v") + (parallel + [(match_operand:SI 2 "const_0_to_1_operand")])))] + "TARGET_AVX" + "@ + vextract%~128\t{%2, %1, %0|%0, %1, %2} + vextracti32x4\t{%2, %g1, %0|%0, %g1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex,evex") + (set_attr "mode" "OI")]) + +(define_insn "*vec_extractv4ti" + [(set (match_operand:TI 0 "nonimmediate_operand" "=vm") + (vec_select:TI + (match_operand:V4TI 1 "register_operand" "v") + (parallel + [(match_operand:SI 2 "const_0_to_3_operand")])))] + "TARGET_AVX512F" + "vextracti32x4\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + 
(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_mode_iterator VEXTRACTI128_MODE + [(V4TI "TARGET_AVX512F") V2TI]) + +(define_split + [(set (match_operand:TI 0 "nonimmediate_operand") + (vec_select:TI + (match_operand:VEXTRACTI128_MODE 1 "register_operand") + (parallel [(const_int 0)])))] + "TARGET_AVX + && reload_completed + && (TARGET_AVX512VL || !EXT_REX_SSE_REG_P (operands[1]))" + [(set (match_dup 0) (match_dup 1))] + "operands[1] = gen_lowpart (TImode, operands[1]);") + ;; Turn SImode or DImode extraction from arbitrary SSE/AVX/AVX512F ;; vector modes into vec_extract*. (define_split @@ -18738,19 +18775,20 @@ (define_insn_and_split "avx_<castmode><a <ssehalfvecmode>mode); }) -(define_expand "vec_init<mode>" - [(match_operand:V_256 0 "register_operand") - (match_operand 1)] - "TARGET_AVX" -{ - ix86_expand_vector_init (false, operands[0], operands[1]); - DONE; -}) +;; Modes handled by vec_init patterns. +(define_mode_iterator VEC_INIT_MODE + [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI + (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI + (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI + (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2") + (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) (define_expand "vec_init<mode>" - [(match_operand:VF48_I1248 0 "register_operand") + [(match_operand:VEC_INIT_MODE 0 "register_operand") (match_operand 1)] - "TARGET_AVX512F" + "TARGET_SSE" { ix86_expand_vector_init (false, operands[0], operands[1]); DONE; --- gcc/testsuite/gcc.target/i386/avx-pr80846.c.jj 2017-07-19 13:50:48.412824445 +0200 +++ gcc/testsuite/gcc.target/i386/avx-pr80846.c 2017-07-19 13:50:16.000000000 +0200 @@ -0,0 +1,39 @@ +/* PR target/80846 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx -mno-avx2" } */ + +typedef __int128 V __attribute__((vector_size (32))); 
+typedef long long W __attribute__((vector_size (32))); +typedef int X __attribute__((vector_size (16))); +typedef __int128 Y __attribute__((vector_size (64))); +typedef long long Z __attribute__((vector_size (64))); + +W f1 (__int128 x, __int128 y) { return (W) ((V) { x, y }); } +__int128 f2 (W x) { return ((V)x)[0]; } +__int128 f3 (W x) { return ((V)x)[1]; } +W f4 (X x, X y) { union { X x; __int128 i; } u = { .x = x }, v = { .x = y }; return (W) ((V) { u.i, v.i }); } +X f5 (W x) { return (X)(((V)x)[0]); } +X f6 (W x) { return (X)(((V)x)[1]); } +W f7 (void) { return (W) ((V) { 2, 3 }); } +W f8 (X x) { union { X x; __int128 i; } u = { .x = x }; return (W) ((V) { u.i, 3 }); } +W f9 (X x) { union { X x; __int128 i; } u = { .x = x }; return (W) ((V) { 2, u.i }); } +W f10 (X x) { union { X x; __int128 i; } u = { .x = x }; return (W) ((V) { u.i, u.i }); } +#ifdef __AVX512F__ +Z f11 (__int128 x, __int128 y, __int128 z, __int128 a) { return (Z) ((Y) { x, y, z, a }); } +__int128 f12 (Z x) { return ((Y)x)[0]; } +__int128 f13 (Z x) { return ((Y)x)[1]; } +__int128 f14 (Z x) { return ((Y)x)[2]; } +__int128 f15 (Z x) { return ((Y)x)[3]; } +Z f16 (X x, X y, X z, X a) { union { X x; __int128 i; } u = { .x = x }, v = { .x = y }, w = { .x = z }, t = { .x = a }; + return (Z) ((Y) { u.i, v.i, w.i, t.i }); } +X f17 (Z x) { return (X)(((Y)x)[0]); } +X f18 (Z x) { return (X)(((Y)x)[1]); } +X f19 (Z x) { return (X)(((Y)x)[2]); } +X f20 (Z x) { return (X)(((Y)x)[3]); } +Z f21 (void) { return (Z) ((Y) { 2, 3, 4, 5 }); } +Z f22 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { u.i, 3, 4, 5 }); } +Z f23 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { 2, u.i, 4, 5 }); } +Z f24 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { 2, 3, u.i, 5 }); } +Z f25 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { 2, 3, 4, u.i }); } +Z f26 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { u.i, u.i, u.i, u.i }); 
} +#endif --- gcc/testsuite/gcc.target/i386/avx2-pr80846.c.jj 2017-07-19 13:51:23.289528396 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-pr80846.c 2017-07-19 13:51:14.976598960 +0200 @@ -0,0 +1,5 @@ +/* PR target/80846 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ + +#include "avx-pr80846.c" --- gcc/testsuite/gcc.target/i386/avx512f-pr80846.c.jj 2017-07-19 13:51:36.628415170 +0200 +++ gcc/testsuite/gcc.target/i386/avx512f-pr80846.c 2017-07-19 13:51:48.686312817 +0200 @@ -0,0 +1,5 @@ +/* PR target/80846 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx512f" } */ + +#include "avx-pr80846.c" Jakub