On Donnerstag, 2. August 2018 11:18:41 CEST Richard Biener wrote:
> On Thu, Aug 2, 2018 at 11:12 AM Allan Sandfeld Jensen
> <li...@carewolf.com> wrote:
> > On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote:
> > > On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote:
> > > >  extern __inline __m128d __attribute__((__gnu_inline__,
> > > >  __always_inline__,
> > > > 
> > > > __artificial__))
> > > > 
> > > >  _mm_move_sd (__m128d __A, __m128d __B)
> > > >  {
> > > > 
> > > > -  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
> > > > +  return __extension__ (__m128d)(__v2df){__B[0],__A[1]};
> > > > 
> > > >  }
> > > 
> > > If the goal is to have it represented as a VEC_PERM_EXPR internally, I
> > > wonder if we should be explicit and use __builtin_shuffle instead of
> > > relying on some forwprop pass to transform it. Maybe not, just asking.
> > > And the answer need not even be the same for _mm_move_sd and
> > > _mm_move_ss.
> > 
> > I wrote it this way because this pattern could later also be used for the
> > other _ss intrinsics, such as _mm_add_ss, where __builtin_shuffle could
> > not. To match the other intrinsics, the logic that tries to match vector
> > construction just needs to be extended to try merge patterns even if one
> > of the subexpressions is not simple.
> 
> The question is what users expect and get when they use -O0 with intrinsics?
> 
> Richard.
> 
Here is the version with __builtin_shuffle. It might behave more predictably
at -O0, but it is also uglier.

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ee409cfe7e4..2337ef5ea08 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46143,6 +46143,46 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of expand_vec_perm_1.  Try to implement the two-operand
+   permutation D with movss or movsd: element 0 is taken from one
+   operand and every other element from the other.  Return true if D
+   has that shape, emitting the insn unless D->testing_p; otherwise
+   return false.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  /* movss only needs SSE; movsd needs SSE2.  */
+  if (!(TARGET_SSE && vmode == V4SFmode)
+      && !(TARGET_SSE2 && vmode == V2DFmode))
+    return false;
+
+  /* Element 0 comes from one operand, all the others from the other.  */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != i + nelt - d->perm[0])
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Bit 0 of the VEC_MERGE mask selects element 0 from the first operand.  */
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46885,6 +46925,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..45b99ff87d5 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,8 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return __extension__ (__m128) __builtin_shuffle((__v4sf)__A, (__v4sf)__B,
+                                                  (__attribute__((__vector_size__ (16))) int){4, 1, 2, 3});
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */

Reply via email to