On Fri, Nov 30, 2018 at 10:14 PM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > As mentioned in the PR, while in vec_concatv2df or vec_concatv2di we have > alternatives where the lower half is low 64-bit part of a xmm reg or 64-bit > memory and upper half is zero using movq/vmovq (or movq2dq), for > vec_concatv4sf and vec_concatv4si we don't have anything similar, although > the operations do pretty much the same thing. I'm not advocating to put > in alternatives with GPR operands as having V2SF or V2SI in a GPR is too > weird, but for the cases where a simple movq or vmovq instruction can move > 64-bits and clear upper 64-bits these patterns look useful to me. > > I had to tweak the pr53759.c testcase because it started FAILing, but only > because it changed: > - vxorps %xmm0, %xmm0, %xmm0 > - vmovlps (%rsi), %xmm0, %xmm0 > + vmovq (%rsi), %xmm0 > vaddps %xmm0, %xmm0, %xmm0 > vmovaps %xmm0, (%rdi) > ret > I've added a variant of that testcase that tests its original purpose by > using a register there not known to be all zeros. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2018-11-30 Jakub Jelinek <ja...@redhat.com> > > PR target/88278 > * config/i386/sse.md (*vec_concatv4sf_0, *vec_concatv4si_0): New > insns. > > * gcc.target/i386/pr88278.c: New test. > * gcc.target/i386/pr53759.c: Don't expect vmovlps insn, expect vmovq > instead. > * gcc.target/i386/pr53759-2.c: New test.
OK with a constraint adjustment below. Thanks, Uros. > > --- gcc/config/i386/sse.md.jj 2018-11-30 21:36:22.277762263 +0100 > +++ gcc/config/i386/sse.md 2018-11-30 21:38:02.261120768 +0100 > @@ -7248,6 +7248,17 @@ (define_insn "*vec_concatv4sf" > (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") > (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) > > +(define_insn "*vec_concatv4sf_0" > + [(set (match_operand:V4SF 0 "register_operand" "=v") > + (vec_concat:V4SF > + (match_operand:V2SF 1 "nonimmediate_operand" "xm") The constraint here can be "vm". There is no limitation on register set for vmovq insn. > + (match_operand:V2SF 2 "const0_operand" " C")))] > + "TARGET_SSE2" > + "%vmovq\t{%1, %0|%0, %1}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "maybe_vex") > + (set_attr "mode" "DF")]) > + > ;; Avoid combining registers from different units in a single alternative, > ;; see comment above inline_secondary_memory_needed function in i386.c > (define_insn "vec_set<mode>_0" > @@ -14409,6 +14420,23 @@ (define_insn "*vec_concatv4si" > (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") > (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) > > +(define_insn "*vec_concatv4si_0" > + [(set (match_operand:V4SI 0 "register_operand" "=v,x") > + (vec_concat:V4SI > + (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") > + (match_operand:V2SI 2 "const0_operand" " C,C")))] > + "TARGET_SSE2" > + "@ > + %vmovq\t{%1, %0|%0, %1} > + movq2dq\t{%1, %0|%0, %1}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "maybe_vex,orig") > + (set_attr "mode" "TI") > + (set (attr "preferred_for_speed") > + (if_then_else (eq_attr "alternative" "1") > + (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC") > + (symbol_ref "true")))]) > + > ;; movd instead of movq is required to handle broken assemblers. 
> (define_insn "vec_concatv2di" > [(set (match_operand:V2DI 0 "register_operand" > --- gcc/testsuite/gcc.target/i386/pr88278.c.jj 2018-11-30 21:38:02.261120768 > +0100 > +++ gcc/testsuite/gcc.target/i386/pr88278.c 2018-11-30 21:38:02.261120768 > +0100 > @@ -0,0 +1,34 @@ > +/* PR target/88278 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -mno-sse3 -fgimple -masm=att" } */ > +/* { dg-final { scan-assembler-times "movq\[ \t]+\\(" 2 } } */ > +/* { dg-final { scan-assembler-not "punpcklqdq\[ \t]+" } } */ > +/* { dg-final { scan-assembler-not "pxor\[ \t]+" } } */ > + > +typedef unsigned char v16qi __attribute__((vector_size(16))); > +typedef unsigned char v8qi __attribute__((vector_size(8))); > +typedef unsigned int v4si __attribute__((vector_size(16))); > +typedef unsigned int v2si __attribute__((vector_size(8))); > + > +v16qi __GIMPLE foo (unsigned char *p) > +{ > + v8qi _2; > + v16qi _3; > + > +bb_2: > + _2 = __MEM <v8qi, 8> (p_1(D)); > + _3 = _Literal (v16qi) { _2, _Literal (v8qi) { _Literal (unsigned char) 0, > _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned > char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal > (unsigned char) 0 } }; > + return _3; > +} > + > + > +v4si __GIMPLE bar (unsigned int *p) > +{ > + v2si _2; > + v4si _3; > + > +bb_2: > + _2 = __MEM <v2si, 32> (p_1(D)); > + _3 = _Literal (v4si) { _2, _Literal (v2si) { 0u, 0u } }; > + return _3; > +} > --- gcc/testsuite/gcc.target/i386/pr53759.c.jj 2016-05-22 12:20:04.184034591 > +0200 > +++ gcc/testsuite/gcc.target/i386/pr53759.c 2018-11-30 22:04:56.432806470 > +0100 > @@ -12,5 +12,6 @@ foo (__m128 *x, __m64 *y) > *x = _mm_add_ps (b, b); > } > > -/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */ > +/* { dg-final { scan-assembler "vmovq\[ \\t\]" } } */ > +/* { dg-final { scan-assembler-not "vmovlps\[ \\t\]" } } */ > /* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */ > --- gcc/testsuite/gcc.target/i386/pr53759-2.c.jj 2018-11-30 > 
22:05:06.932657374 +0100 > +++ gcc/testsuite/gcc.target/i386/pr53759-2.c 2018-11-30 22:05:42.568151361 > +0100 > @@ -0,0 +1,16 @@ > +/* PR target/53759 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > + > +#include <xmmintrin.h> > + > +void > +foo (__m128 *x, __m64 *y) > +{ > + __m128 a = _mm_add_ps (x[1], x[2]); > + __m128 b = _mm_loadl_pi (a, y); > + *x = _mm_add_ps (b, b); > +} > + > +/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */ > +/* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */ > > Jakub