On Fri, Nov 30, 2018 at 10:14 PM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> As mentioned in the PR, while in vec_concatv2df or vec_concatv2di we have
> alternatives where the lower half is low 64-bit part of a xmm reg or 64-bit
> memory and upper half is zero using movq/vmovq (or movq2dq), for
> vec_concatv4sf and vec_concatv4si we don't have anything similar, although
> the operations do pretty much the same thing.  I'm not advocating to put
> in alternatives with GPR operands as having V2SF or V2SI in a GPR is too
> weird, but for the cases where a simple movq or vmovq instruction can move
> 64 bits and clear the upper 64 bits, these patterns look useful to me.
>
> I had to tweak the pr53759.c testcase because it started FAILing, but only
> because it changed:
> -       vxorps  %xmm0, %xmm0, %xmm0
> -       vmovlps (%rsi), %xmm0, %xmm0
> +       vmovq   (%rsi), %xmm0
>         vaddps  %xmm0, %xmm0, %xmm0
>         vmovaps %xmm0, (%rdi)
>         ret
> I've added a variant of that testcase that tests its original purpose by
> using a register there not known to be all zeros.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2018-11-30  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/88278
>         * config/i386/sse.md (*vec_concatv4sf_0, *vec_concatv4si_0): New 
> insns.
>
>         * gcc.target/i386/pr88278.c: New test.
>         * gcc.target/i386/pr53759.c: Don't expect vmovlps insn, expect vmovq
>         instead.
>         * gcc.target/i386/pr53759-2.c: New test.

OK with a constraint adjustment below.

Thanks,
Uros.

>
> --- gcc/config/i386/sse.md.jj   2018-11-30 21:36:22.277762263 +0100
> +++ gcc/config/i386/sse.md      2018-11-30 21:38:02.261120768 +0100
> @@ -7248,6 +7248,17 @@ (define_insn "*vec_concatv4sf"
>     (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
>     (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
>
> +(define_insn "*vec_concatv4sf_0"
> +  [(set (match_operand:V4SF 0 "register_operand"       "=v")
> +       (vec_concat:V4SF
> +         (match_operand:V2SF 1 "nonimmediate_operand" "xm")

The constraint here can be "vm". There is no limitation on the register
set for the vmovq insn.

> +         (match_operand:V2SF 2 "const0_operand"       " C")))]
> +  "TARGET_SSE2"
> +  "%vmovq\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "DF")])
> +
>  ;; Avoid combining registers from different units in a single alternative,
>  ;; see comment above inline_secondary_memory_needed function in i386.c
>  (define_insn "vec_set<mode>_0"
> @@ -14409,6 +14420,23 @@ (define_insn "*vec_concatv4si"
>     (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
>     (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
>
> +(define_insn "*vec_concatv4si_0"
> +  [(set (match_operand:V4SI 0 "register_operand"       "=v,x")
> +       (vec_concat:V4SI
> +         (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> +         (match_operand:V2SI 2 "const0_operand"       " C,C")))]
> +  "TARGET_SSE2"
> +  "@
> +   %vmovq\t{%1, %0|%0, %1}
> +   movq2dq\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_vex,orig")
> +   (set_attr "mode" "TI")
> +   (set (attr "preferred_for_speed")
> +     (if_then_else (eq_attr "alternative" "1")
> +       (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
> +       (symbol_ref "true")))])
> +
>  ;; movd instead of movq is required to handle broken assemblers.
>  (define_insn "vec_concatv2di"
>    [(set (match_operand:V2DI 0 "register_operand"
> --- gcc/testsuite/gcc.target/i386/pr88278.c.jj  2018-11-30 21:38:02.261120768 
> +0100
> +++ gcc/testsuite/gcc.target/i386/pr88278.c     2018-11-30 21:38:02.261120768 
> +0100
> @@ -0,0 +1,34 @@
> +/* PR target/88278 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-sse3 -fgimple -masm=att" } */
> +/* { dg-final { scan-assembler-times "movq\[ \t]+\\(" 2 } } */
> +/* { dg-final { scan-assembler-not "punpcklqdq\[ \t]+" } } */
> +/* { dg-final { scan-assembler-not "pxor\[ \t]+" } } */
> +
> +typedef unsigned char v16qi __attribute__((vector_size(16)));
> +typedef unsigned char v8qi __attribute__((vector_size(8)));
> +typedef unsigned int v4si __attribute__((vector_size(16)));
> +typedef unsigned int v2si __attribute__((vector_size(8)));
> +
> +v16qi __GIMPLE foo (unsigned char *p)
> +{
> +  v8qi _2;
> +  v16qi _3;
> +
> +bb_2:
> +  _2 = __MEM <v8qi, 8> (p_1(D));
> +  _3 = _Literal (v16qi) { _2, _Literal (v8qi) { _Literal (unsigned char) 0, 
> _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned 
> char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal 
> (unsigned char) 0 } };
> +  return _3;
> +}
> +
> +
> +v4si __GIMPLE bar (unsigned int *p)
> +{
> +  v2si _2;
> +  v4si _3;
> +
> +bb_2:
> +  _2 = __MEM <v2si, 32> (p_1(D));
> +  _3 = _Literal (v4si) { _2, _Literal (v2si) { 0u, 0u } };
> +  return _3;
> +}
> --- gcc/testsuite/gcc.target/i386/pr53759.c.jj  2016-05-22 12:20:04.184034591 
> +0200
> +++ gcc/testsuite/gcc.target/i386/pr53759.c     2018-11-30 22:04:56.432806470 
> +0100
> @@ -12,5 +12,6 @@ foo (__m128 *x, __m64 *y)
>    *x = _mm_add_ps (b, b);
>  }
>
> -/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */
> +/* { dg-final { scan-assembler "vmovq\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vmovlps\[ \\t\]" } } */
>  /* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */
> --- gcc/testsuite/gcc.target/i386/pr53759-2.c.jj        2018-11-30 
> 22:05:06.932657374 +0100
> +++ gcc/testsuite/gcc.target/i386/pr53759-2.c   2018-11-30 22:05:42.568151361 
> +0100
> @@ -0,0 +1,16 @@
> +/* PR target/53759 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +
> +#include <xmmintrin.h>
> +
> +void
> +foo (__m128 *x, __m64 *y)
> +{
> +  __m128 a = _mm_add_ps (x[1], x[2]);
> +  __m128 b = _mm_loadl_pi (a, y);
> +  *x = _mm_add_ps (b, b);
> +}
> +
> +/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */
>
>         Jakub

Reply via email to