On Samstag, 11. August 2018 11:18:39 CEST Jakub Jelinek wrote:
> On Sat, Aug 11, 2018 at 10:59:26AM +0200, Allan Sandfeld Jensen wrote:
> > +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
> > + using movss or movsd. */
> > +static bool
> > +expand_vec_perm_movs (struct expand_vec_perm_d *d)
> > +{
> > + machine_mode vmode = d->vmode;
> > + unsigned i, nelt = d->nelt;
> > + rtx x;
> > +
> > + if (d->one_operand_p)
> > + return false;
> > +
> > + if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
> > + ;
> > + else
> > + return false;
> > +
> > + /* Only the first element is changed. */
>
> Two spaces after .
>
> > + if (d->perm[0] != nelt && d->perm[0] != 0)
> > + return false;
> > + for (i = 1; i < nelt; ++i) {
> > + {
> > + if (d->perm[i] != i + nelt - d->perm[0])
> > + return false;
> > + }
> > + }
>
> Extraneous {}s (both pairs, the outer ones even badly indented).
>
> Otherwise LGTM.
>
Updated:
Note that, as an infrequent contributor, I don't have commit access, so I need
whoever reviews this to also commit it.
'Allan
From e33241e5ddc7fa57c4ba7893669af7f7e636125e Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen <[email protected]>
Date: Sat, 11 Aug 2018 11:52:21 +0200
Subject: [PATCH] Match movss and movsd "blend" instructions
Adds the ability to match movss and movsd as blend patterns,
implemented in a new method to be able to match these before shuffles,
while keeping other blends after.
2018-08-11 Allan Sandfeld Jensen <[email protected]>
gcc/config/i386
* i386.c (expand_vec_perm_movs): New method matching movs
patterns.
(expand_vec_perm_1): Try the new method.
* emmintrin.h (_mm_move_sd): Implement with __builtin_shuffle.
* xmmintrin.h (_mm_move_ss): Likewise.
gcc/testsuite
* gcc.target/i386/sse2-movs.c: New test.
---
gcc/config/i386/emmintrin.h | 2 +-
gcc/config/i386/i386.c | 41 +++++++++++++++++++++++++++++++++++++
gcc/config/i386/xmmintrin.h | 5 ++++-
3 files changed, 46 insertions(+), 2 deletions(-)
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
- return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+ return __extension__ (__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
}
/* Load two DPFP values from P. The address must be 16-byte aligned. */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7554fd1f659..15a3caa94c3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46145,6 +46145,43 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
return ok;
}
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ using movss or movsd. */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+ machine_mode vmode = d->vmode;
+ unsigned i, nelt = d->nelt;
+ rtx x;
+
+ if (d->one_operand_p)
+ return false;
+
+ if (TARGET_SSE2 && (vmode == V2DFmode || vmode == V4SFmode))
+ ;
+ else
+ return false;
+
+ /* Only the first element is changed. */
+ if (d->perm[0] != nelt && d->perm[0] != 0)
+ return false;
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != i + nelt - d->perm[0])
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ if (d->perm[0] == nelt)
+ x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+ else
+ x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+ emit_insn (gen_rtx_SET (d->target, x));
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
@@ -46887,6 +46924,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
}
}
+ /* Try movss/movsd instructions. */
+ if (expand_vec_perm_movs (d))
+ return true;
+
/* Finally, try the fully general two operand permute. */
if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..f770570295c 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,10 @@ _mm_storer_ps (float *__P, __m128 __A)
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
+ __extension__
+ (__attribute__((__vector_size__ (16))) int)
+ {4,1,2,3});
}
/* Extracts one of the four words of A. The selector N must be immediate. */
--
2.17.1