On Donnerstag, 2. August 2018 11:18:41 CEST Richard Biener wrote: > On Thu, Aug 2, 2018 at 11:12 AM Allan Sandfeld Jensen > > <li...@carewolf.com> wrote: > > On Mittwoch, 1. August 2018 18:51:41 CEST Marc Glisse wrote: > > > On Wed, 1 Aug 2018, Allan Sandfeld Jensen wrote: > > > > extern __inline __m128d __attribute__((__gnu_inline__, > > > > __always_inline__, > > > > > > > > __artificial__)) > > > > > > > > _mm_move_sd (__m128d __A, __m128d __B) > > > > { > > > > > > > > - return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); > > > > + return __extension__ (__m128d)(__v2df){__B[0],__A[1]}; > > > > > > > > } > > > > > > If the goal is to have it represented as a VEC_PERM_EXPR internally, I > > > wonder if we should be explicit and use __builtin_shuffle instead of > > > relying on some forwprop pass to transform it. Maybe not, just asking. > > > And > > > the answer need not even be the same for _mm_move_sd and _mm_move_ss. > > > > I wrote it this way because this pattern could later also be used for the > > other _ss intrinsics, such as _mm_add_ss, where a _builtin_shuffle could > > not. To match the other intrinsics the logic that tries to match vector > > construction just needs to be extended to try merge patterns even if one > > of the subexpressions is not simple. > > The question is what users expect and get when they use -O0 with intrinsics? > > Richard. > Here is the version with __builtin_shuffle. It might behave more as expected at -O0, but it is also uglier.
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index b940a39d27b..6501638f619 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -113,7 +113,7 @@ _mm_setzero_pd (void)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
-  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return __extension__ (__m128d) __builtin_shuffle ((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ee409cfe7e4..2337ef5ea08 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46143,6 +46143,42 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   return ok;
 }
 
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   using movss or movsd.  */
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  unsigned i, nelt = d->nelt;
+  rtx x;
+
+  if (d->one_operand_p)
+    return false;
+
+  /* MOVSS is an SSE instruction, only MOVSD requires SSE2.  */
+  if (!(TARGET_SSE && vmode == V4SFmode)
+      && !(TARGET_SSE2 && vmode == V2DFmode))
+    return false;
+
+  /* Only the first element is changed.  */
+  if (d->perm[0] != nelt && d->perm[0] != 0)
+    return false;
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != i + nelt - d->perm[0])
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Element 0 comes from whichever operand d->perm[0] selects; the
+     remaining elements come from the other operand.  */
+  if (d->perm[0] == nelt)
+    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+  else
+    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+  emit_insn (gen_rtx_SET (d->target, x));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
 
@@ -46885,6 +46921,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
 	}
     }
 
+  /* Try movss/movsd instructions.  */
+  if (expand_vec_perm_movs (d))
+    return true;
+
   /* Finally, try the fully general two operand permute.  */
   if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
 			      d->testing_p))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f64f3f74a0b..45b99ff87d5 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -1011,7 +1011,8 @@ _mm_storer_ps (float *__P, __m128 __A)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_ss (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+  return __extension__ (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
+		(__attribute__((__vector_size__ (16))) int){4, 1, 2, 3});
 }
 
 /* Extracts one of the four words of A.  The selector N must be immediate.  */