On Thu, May 2, 2019 at 10:53 AM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> On Thu, May 2, 2019 at 7:55 AM Richard Biener
> <richard.guent...@gmail.com> wrote:
> >
> > On Thu, May 2, 2019 at 4:54 PM Richard Biener
> > <richard.guent...@gmail.com> wrote:
> > >
> > > On Mon, Mar 11, 2019 at 8:03 AM H.J. Lu <hjl.to...@gmail.com> wrote:
> > > >
> > > > On Fri, Mar 8, 2019 at 7:03 PM Richard Biener
> > > > <richard.guent...@gmail.com> wrote:
> > > > >
> > > > > On Fri, Mar 8, 2019 at 9:49 AM H.J. Lu <hjl.to...@gmail.com> wrote:
> > > > > >
> > > > > > On Thu, Mar 7, 2019 at 9:51 AM H.J. Lu <hjl.to...@gmail.com> wrote:
> > > > > > >
> > > > > > > On Wed, Mar 6, 2019 at 8:33 PM Richard Biener
> > > > > > > <richard.guent...@gmail.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.to...@gmail.com> 
> > > > > > > > wrote:
> > > > > > > > >
> > > > > > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.to...@gmail.com> 
> > > > > > > > > wrote:
> > > > > > > > > >
> > > > > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener 
> > > > > > > > > > wrote:
> > > > > > > > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu 
> > > > > > > > > > > <hjl.to...@gmail.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski 
> > > > > > > > > > > > wrote:
> > > > > > > > > > > > > )
> > > > > > > > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu 
> > > > > > > > > > > > > <hjl.to...@gmail.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > For vector init constructor:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > typedef float __v4sf __attribute__ 
> > > > > > > > > > > > > > ((__vector_size__ (16)));
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > __v4sf
> > > > > > > > > > > > > > foo (__v4sf x, float f)
> > > > > > > > > > > > > > {
> > > > > > > > > > > > > >   __v4sf y = { f, x[1], x[2], x[3] };
> > > > > > > > > > > > > >   return y;
> > > > > > > > > > > > > > }
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > we can optimize vector init constructor with vector 
> > > > > > > > > > > > > > copy or permute
> > > > > > > > > > > > > > followed by a single scalar insert:
> > > > > > > > > >
> > > > > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here. 
> > > > > > > > > > >  The easiest way
> > > > > > > > > > > is to emit a new stmt for _2 = copy ...; and do the 
> > > > > > > > > > > set_rhs with the
> > > > > > > > > > > BIT_INSERT_EXPR.
> > > > > > > > > >
> > > > > > > > > > Thanks for BIT_INSERT_EXPR suggestion.  I am testing this 
> > > > > > > > > > patch.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > H.J.
> > > > > > > > > > ---
> > > > > > > > > > We can optimize vector constructor with vector copy or 
> > > > > > > > > > permute followed
> > > > > > > > > > by a single scalar insert:
> > > > > > > > > >
> > > > > > > > > >   __v4sf y;
> > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > >   float _1;
> > > > > > > > > >   float _2;
> > > > > > > > > >   float _3;
> > > > > > > > > >
> > > > > > > > > >   <bb 2> :
> > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > >   y_6 = {f_5(D), _3, _2, _1};
> > > > > > > > > >   return y_6;
> > > > > > > > > >
> > > > > > > > > > with
> > > > > > > > > >
> > > > > > > > > >  __v4sf y;
> > > > > > > > > >   __v4sf D.1930;
> > > > > > > > > >   float _1;
> > > > > > > > > >   float _2;
> > > > > > > > > >   float _3;
> > > > > > > > > >   vector(4) float _8;
> > > > > > > > > >
> > > > > > > > > >   <bb 2> :
> > > > > > > > > >   _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
> > > > > > > > > >   _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
> > > > > > > > > >   _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
> > > > > > > > > >   _8 = x_9(D);
> > > > > > > > > >   y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
> > > > > > > > > >   return y_6;
> > > > > > > > > >
> > > > > > > > > > gcc/
> > > > > > > > > >
> > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > >         * tree-ssa-forwprop.c 
> > > > > > > > > > (simplify_vector_constructor): Optimize
> > > > > > > > > >         vector init constructor with vector copy or permute 
> > > > > > > > > > followed
> > > > > > > > > >         by a single scalar insert.
> > > > > > > > > >
> > > > > > > > > > gcc/testsuite/
> > > > > > > > > >
> > > > > > > > > >         PR tree-optimization/88828
> > > > > > > > > >         * gcc.target/i386/pr88828-1a.c: New test.
> > > > > > > > > >         * gcc.target/i386/pr88828-2b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-2.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3c.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-3d.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-4a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-4b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-5a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-5b.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-6a.c: Likewise.
> > > > > > > > > >         * gcc.target/i386/pr88828-6b.c: Likewise.
> > > > > > > > >
> > > > > > > > > Here is the updated patch with run-time tests.
> > > > > > > >
> > > > > > > > -      if (TREE_CODE (elt->value) != SSA_NAME)
> > > > > > > > +      if (TREE_CODE (ce->value) != SSA_NAME)
> > > > > > > >         return false;
> > > > > > > >
> > > > > > > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }?  I think the 
> > > > > > > > single
> > > > > > > > scalar value can be a constant as well.
> > > > > > >
> > > > > > > Fixed.
> > > > > > >
> > > > > > > >        if (!def_stmt)
> > > > > > > > -       return false;
> > > > > > > > +       {
> > > > > > > > +         if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value)))
> > > > > > > >
> > > > > > > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value))
> > > > > > > >
> > > > > > > > +           {
> > > > > > > >
> > > > > > > > also you seem to disallow
> > > > > > > >
> > > > > > > >   { i + 1, v[1], v[2], v[3] }
> > > > > > >
> > > > > > > Fixed by
> > > > > > >
> > > > > > >      if (code != BIT_FIELD_REF)
> > > > > > >         {
> > > > > > >           /* Only allow one scalar insert.  */
> > > > > > >           if (nscalars != 0)
> > > > > > >             return false;
> > > > > > >
> > > > > > >           nscalars = 1;
> > > > > > >           insert = true;
> > > > > > >           scalar_idx = i;
> > > > > > >           sel.quick_push (i);
> > > > > > >           scalar_element = ce->value;
> > > > > > >           continue;
> > > > > > >         }
> > > > > > >
> > > > > > > > because get_prop_source_stmt will return the definition 
> > > > > > > > computing
> > > > > > > > i + 1 in this case and your code will be skipped?
> > > > > > > >
> > > > > > > > I think you can simplify the code by treating scalar_element != 
> > > > > > > > NULL
> > > > > > > > as nscalars == 1 and eliding nscalars.
> > > > > > >
> > > > > > > It works only if
> > > > > > >
> > > > > > > TYPE_VECTOR_SUBPARTS (type).to_constant ()  == (nscalars + 
> > > > > > > nvectors)
> > > > > > >
> > > > > > > We need to check both nscalars and nvectors.  Elide nscalar
> > > > > > > check doesn't help much here.
> > > > > > >
> > > > > > > > -      if (conv_code == ERROR_MARK)
> > > > > > > > +      if (conv_code == ERROR_MARK && !insert)
> > > > > > > >         gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, 
> > > > > > > > orig[0],
> > > > > > > >                                         orig[1], op2);
> > > > > > > >        else
> > > > > > > > @@ -2148,10 +2198,25 @@ simplify_vector_constructor 
> > > > > > > > (gimple_stmt_iterator *gsi)
> > > > > > > >                                    VEC_PERM_EXPR, orig[0], 
> > > > > > > > orig[1], op2);
> > > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > > > -         gimple_assign_set_rhs_with_ops (gsi, conv_code, 
> > > > > > > > orig[0],
> > > > > > > > +         gimple_assign_set_rhs_with_ops (gsi,
> > > > > > > > +                                         (conv_code != 
> > > > > > > > ERROR_MARK
> > > > > > > > +                                          ? conv_code
> > > > > > > > +                                          : NOP_EXPR),
> > > > > > > > +                                         orig[0],
> > > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > > >
> > > > > > > > I believe you should elide the last stmt for conv_code == 
> > > > > > > > ERROR_MARK,
> > > > > > > > that is, why did you need to add the && !insert check in the 
> > > > > > > > guarding condition
> > > > > > >
> > > > > > > When conv_code == ERROR_MARK, we still need
> > > > > > >
> > > > > > >        gimple *perm
> > > > > > >             = gimple_build_assign (make_ssa_name (TREE_TYPE 
> > > > > > > (orig[0])),
> > > > > > >                                    VEC_PERM_EXPR, orig[0], 
> > > > > > > orig[1], op2);
> > > > > > >           orig[0] = gimple_assign_lhs (perm);
> > > > > > >           gsi_insert_before (gsi, perm, GSI_SAME_STMT);
> > > > > > >           gimple_assign_set_rhs_with_ops (gsi,  NOP_EXPR,
> > > > > > >                                           orig[0],
> > > > > > >                                           NULL_TREE, NULL_TREE);
> > > > > > >
> > > > > > > Otherwise, scalar insert won't work.
> > > > > > >
> > > > > > > > (this path should already do the correct thing?).  Note that in 
> > > > > > > > all
> > > > > > > > cases it looks
> > > > > > > > that with conv_code != ERROR_MARK you may end up doing a 
> > > > > > > > float->int
> > > > > > > > or int->float conversion on a value it wasn't done on before 
> > > > > > > > which might
> > > > > > > > raise exceptions?  That is, do we need to make sure we permute a
> > > > > > > > value we already do convert into the place we're going to 
> > > > > > > > insert to?
> > > > > > >
> > > > > > > This couldn't happen:
> > > > > > >
> > > > > > >       if (type == TREE_TYPE (ref))
> > > > > > >          {
> > > > > > >            /* The RHS vector has the same type as LHS.  */
> > > > > > >            if (rhs_vector == NULL)
> > > > > > >              rhs_vector = ref;
> > > > > > >            /* Check if all RHS vector elements come fome the same
> > > > > > >               vector.  */
> > > > > > >            if (rhs_vector == ref)
> > > > > > >              nvectors++;
> > > > > > >          }
> > > > > > > ...
> > > > > > >   if (insert
> > > > > > >       && (nvectors == 0
> > > > > > >           || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
> > > > > > >               != (nscalars + nvectors))))
> > > > > > >     return false;
> > > > >
> > > > > I see - that looks like a missed case then?
> > > > >
> > > > >  { 1., (float)v[1], (float)v[2], (float)v[3] }
> > > > >
> > > > > with integer vector v?
> > > >
> > > > True.
> > > >
> > > > > I'll have a look at the full patch next week (it's GCC 10 material in 
> > > > > any case).
> > > > >
> > >
> > > Now looking again.  I still don't like the new "structure" of the loop
> > > very much.
> > > A refactoring like the attached should make it easier to clearly separate 
> > > the
> > > cases where we reach a vector def and where not.
> >
> > Now attached.
> >
> > > Do you want me to take over the patch?
> > >
>

Here is the updated patch on top of your patch plus my fix.

-- 
H.J.
From 1bc95acebc23e030024d50f0d4473f706e296ab6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 3 May 2019 09:33:30 -0700
Subject: [PATCH] Optimize vector constructor

We can optimize vector constructor with vector copy or permute followed
by a single scalar insert:

  __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  y_6 = {f_5(D), _3, _2, _1};
  return y_6;

with

 __v4sf y;
  __v4sf D.1930;
  float _1;
  float _2;
  float _3;
  vector(4) float _8;

  <bb 2> :
  _1 = BIT_FIELD_REF <x_9(D), 32, 96>;
  _2 = BIT_FIELD_REF <x_9(D), 32, 64>;
  _3 = BIT_FIELD_REF <x_9(D), 32, 32>;
  _8 = x_9(D);
  y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>;
  return y_6;

gcc/

	PR tree-optimization/88828
	* tree-ssa-forwprop.c (simplify_vector_constructor): Optimize
	vector init constructor with vector copy or permute followed
	by a single scalar insert.

gcc/testsuite/

	PR tree-optimization/88828
	* gcc.target/i386/pr88828-1.c: New test.
	* gcc.target/i386/pr88828-1a.c: Likewise.
	* gcc.target/i386/pr88828-1b.c: Likewise.
	* gcc.target/i386/pr88828-1c.c: Likewise.
	* gcc.target/i386/pr88828-2.c: Likewise.
	* gcc.target/i386/pr88828-2a.c: Likewise.
	* gcc.target/i386/pr88828-2b.c: Likewise.
	* gcc.target/i386/pr88828-2c.c: Likewise.
	* gcc.target/i386/pr88828-2d.c: Likewise.
	* gcc.target/i386/pr88828-3.c: Likewise.
	* gcc.target/i386/pr88828-3a.c: Likewise.
	* gcc.target/i386/pr88828-3b.c: Likewise.
	* gcc.target/i386/pr88828-3c.c: Likewise.
	* gcc.target/i386/pr88828-3d.c: Likewise.
	* gcc.target/i386/pr88828-4a.c: Likewise.
	* gcc.target/i386/pr88828-4b.c: Likewise.
	* gcc.target/i386/pr88828-5a.c: Likewise.
	* gcc.target/i386/pr88828-5b.c: Likewise.
	* gcc.target/i386/pr88828-6.c: Likewise.
	* gcc.target/i386/pr88828-6a.c: Likewise.
	* gcc.target/i386/pr88828-6b.c: Likewise.
	* gcc.target/i386/pr88828-7.c: Likewise.
	* gcc.target/i386/pr88828-7a.c: Likewise.
	* gcc.target/i386/pr88828-7b.c: Likewise.
	* gcc.target/i386/pr88828-8.c: Likewise.
	* gcc.target/i386/pr88828-8a.c: Likewise.
	* gcc.target/i386/pr88828-8b.c: Likewise.
	* gcc.target/i386/pr88828-9.c: Likewise.
	* gcc.target/i386/pr88828-9a.c: Likewise.
	* gcc.target/i386/pr88828-9b.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr88828-1.c  | 49 +++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-1a.c | 17 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-1b.c | 23 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-1c.c | 18 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2.c  | 51 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-2a.c | 17 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2b.c | 19 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-2c.c | 23 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-2d.c | 25 ++++++++
 gcc/testsuite/gcc.target/i386/pr88828-3.c  | 54 +++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-3a.c | 17 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-3b.c | 19 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-3c.c | 25 ++++++++
 gcc/testsuite/gcc.target/i386/pr88828-4a.c | 18 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-4b.c | 21 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-5a.c | 18 ++++++
 gcc/testsuite/gcc.target/i386/pr88828-5b.c | 20 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-6.c  | 47 +++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-6a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-6b.c | 22 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-7.c  | 53 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-7a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-7b.c | 22 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-8.c  | 46 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-8a.c | 15 +++++
 gcc/testsuite/gcc.target/i386/pr88828-8b.c | 21 +++++++
 gcc/testsuite/gcc.target/i386/pr88828-9.c  | 46 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr88828-9a.c | 16 +++++
 gcc/testsuite/gcc.target/i386/pr88828-9b.c | 23 +++++++
 gcc/tree-ssa-forwprop.c                    | 70 ++++++++++++++++++++--
 30 files changed, 842 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2d.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-9b.c

diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..a15d1fea3f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,49 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-1a.c"
+#include "pr88828-1b.c"
+#include "pr88828-1c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
new file mode 100644
index 00000000000..d37b24c6661
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[1], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
new file mode 100644
index 00000000000..af4aced65f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
new file mode 100644
index 00000000000..a117f3ec7b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  __v4sf y = x;
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..011fd486bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-2a.c"
+#include "pr88828-2c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 1)
+      {
+	if (y[i] != f[0])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2a.c b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
new file mode 100644
index 00000000000..85e49535ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2b.c b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
new file mode 100644
index 00000000000..adfd7002a4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { f, x[0], x[2], x[3] };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2c.c b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
new file mode 100644
index 00000000000..149967ea0b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2c.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2d.c b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
new file mode 100644
index 00000000000..21088496730
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2d.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  return vector_init (f, x[0], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3.c b/gcc/testsuite/gcc.target/i386/pr88828-3.c
new file mode 100644
index 00000000000..adbc46dbf3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3.c
@@ -0,0 +1,54 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-3a.c"
+#include "pr88828-3b.c"
+#include "pr88828-3c.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 3)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else if (i == 0)
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i + 1])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 134567;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x, z);
+  do_check (y, f, z);
+  y = foo2 (x, z);
+  do_check (y, f, z);
+  y = foo3 (x, z);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..e5cb95c1275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo1 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..0349f35b08a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo2 (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], f };
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
new file mode 100644
index 00000000000..fb668a55f1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__attribute__((noinline, noclone))
+__v4sf
+foo3 (__v4sf x, float f)
+{
+  return vector_init (x[0], x[2], x[3], f);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..64043b9855f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..ad8d2b985d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[1] };
+  y[0] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..5e908faef5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..988a48823e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__attribute__((noinline, noclone))
+__v4sf
+foo (__v4sf x, float f)
+{
+  __v4sf y = { x[0], x[2], x[3], x[0] };
+  y[3] = f;
+  return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6.c b/gcc/testsuite/gcc.target/i386/pr88828-6.c
new file mode 100644
index 00000000000..8d920396896
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6.c
@@ -0,0 +1,47 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-6a.c"
+#include "pr88828-6b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf x, float f1[4], float f2[4])
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (x[i] != (f1[i] + f2[i]))
+	  abort ();
+      }
+    else
+      {
+	if (x[i] != f1[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f1[4] = { -11, 2, 55553, -4 };
+  float f2[4] = { 111, 3.3, -55.553, 4.8 };
+  __v4sf x = { f1[0], f1[1], f1[2], f1[3] };
+  __v4sf y = { f2[0], f2[1], f2[2], f2[3] };
+  __v4sf z;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f1[i] || y[i] != f2[i] )
+      abort ();
+
+  z = foo1 (x, y);
+  do_check (z, f1, f2);
+  x = foo2 (x, y);
+  do_check (z, f1, f2);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..4094f25a1fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "addss" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo1 (__v4sf x, __v4sf y)
+{
+  __v4sf z = { x[0] + y[0], x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..a423a089963
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "addss" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x, __v4sf y)
+{
+  return vector_init (x[0] + y[0], x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c
new file mode 100644
index 00000000000..471028d417d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c
@@ -0,0 +1,53 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-7a.c"
+#include "pr88828-7b.c"
+
+extern void abort ();
+
+float
+bar (float x, float y)
+{
+  return x / y - y * x;
+}
+
+void
+do_check (__v4sf x, float f1[4], float f2[4])
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (x[i] != bar (f1[i], f2[i]))
+	  abort ();
+      }
+    else
+      {
+	if (x[i] != f1[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f1[4] = { -11, 2, 55553, -4 };
+  float f2[4] = { 111, 3.3, -55.553, 4.8 };
+  __v4sf x = { f1[0], f1[1], f1[2], f1[3] };
+  __v4sf y = { f2[0], f2[1], f2[2], f2[3] };
+  __v4sf z;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f1[i] || y[i] != f2[i] )
+      abort ();
+
+  z = foo1 (x, y);
+  do_check (z, f1, f2);
+  x = foo2 (x, y);
+  do_check (z, f1, f2);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7a.c b/gcc/testsuite/gcc.target/i386/pr88828-7a.c
new file mode 100644
index 00000000000..f1ae57422d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+extern float bar (float, float);
+
+__v4sf
+foo1 (__v4sf x, __v4sf y)
+{
+  __v4sf z = { bar (x[0], y[0]), x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7b.c b/gcc/testsuite/gcc.target/i386/pr88828-7b.c
new file mode 100644
index 00000000000..c027c56948d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-7b.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+extern float bar (float, float);
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x, __v4sf y)
+{
+  return vector_init (bar (x[0], y[0]), x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8.c b/gcc/testsuite/gcc.target/i386/pr88828-8.c
new file mode 100644
index 00000000000..3b8eabd225f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8.c
@@ -0,0 +1,46 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-8a.c"
+#include "pr88828-8b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 11.4;
+  __v4sf x = { f[0], f[1], f[2], f[3] };
+  __v4sf y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x);
+  do_check (y, f, z);
+  y = foo2 (x);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8a.c b/gcc/testsuite/gcc.target/i386/pr88828-8a.c
new file mode 100644
index 00000000000..5d383dfd081
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8a.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo1 (__v4sf x)
+{
+  __v4sf z = { 11.4, x[1], x[2], x[3] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8b.c b/gcc/testsuite/gcc.target/i386/pr88828-8b.c
new file mode 100644
index 00000000000..5ffbc9c3103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-8b.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+static __v4sf
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return y;
+}
+
+__v4sf
+foo2 (__v4sf x)
+{
+  return vector_init (11.4, x[1], x[2], x[3]) ;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9.c b/gcc/testsuite/gcc.target/i386/pr88828-9.c
new file mode 100644
index 00000000000..c33907b4a6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9.c
@@ -0,0 +1,46 @@
+/* { dg-do run { target sse2_runtime } } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "pr88828-9a.c"
+#include "pr88828-9b.c"
+
+extern void abort ();
+
+void
+do_check (__v4sf y, float f[4], float z)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (i == 0)
+      {
+	if (y[i] != z)
+	  abort ();
+      }
+    else
+      {
+	if (y[i] != f[i])
+	  abort ();
+      }
+}
+
+int
+main (void)
+{
+  float f[4] = { -11, 2, 55553, -4 };
+  float z = 11.4;
+  __m128 x = (__m128) (__v4sf) { f[0], f[1], f[2], f[3] };
+  __m128 y;
+  int i;
+
+  for (i = 0; i < 4; i++)
+    if (x[i] != f[i])
+      abort ();
+
+  y = foo1 (x);
+  do_check (y, f, z);
+  y = foo2 (x);
+  do_check (y, f, z);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9a.c b/gcc/testsuite/gcc.target/i386/pr88828-9a.c
new file mode 100644
index 00000000000..7f830657732
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+__m128
+foo1 (__m128 x)
+{
+  __v4sf z = { 11.4, ((__v4sf) x)[1], ((__v4sf) x)[2], ((__v4sf) x) [3] };
+  return (__m128) z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9b.c b/gcc/testsuite/gcc.target/i386/pr88828-9b.c
new file mode 100644
index 00000000000..6588ad15a9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-9b.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpckhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+static __m128
+vector_init (float f0,float f1, float f2,float f3)
+{
+  __v4sf y = { f0, f1, f2, f3 };
+   return (__m128) y;
+}
+
+__m128
+foo2 (__m128 x)
+{
+  return vector_init (11.4, ((__v4sf) x)[1], ((__v4sf) x)[2],
+		      ((__v4sf) x) [3]);
+}
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index ab27a5ddbb1..0ca00da6a8d 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2059,6 +2059,14 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   elem_type = TREE_TYPE (type);
   elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
 
+  tree rhs_vector = NULL;
+  /* The single scalar element.  */
+  tree scalar_element = NULL;
+  unsigned int scalar_idx = 0;
+  bool insert = false;
+  unsigned int nscalars = 0;
+  unsigned int nvectors = 0;
+
   vec_perm_builder sel (nelts, nelts, 1);
   orig[0] = NULL;
   orig[1] = NULL;
@@ -2110,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	    elem += nelts;
 	  if (elem != i)
 	    maybe_ident = false;
+
+	  if (useless_type_conversion_p (type, TREE_TYPE (ref)))
+	    {
+	      /* The RHS vector has the same type as LHS.  */
+	      if (rhs_vector == NULL)
+		rhs_vector = ref;
+	      /* Check if all RHS vector elements come fome the same
+		 vector.  */
+	      if (rhs_vector == ref)
+		nvectors++;
+	    }
+
 	  sel.quick_push (elem);
 	}
       else if (CONSTANT_CLASS_P (elt->value))
@@ -2125,11 +2145,28 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 	  maybe_ident = false;
 	}
       else
-	return false;
+	{
+	  /* Only allow one scalar insert.  */
+	  if (nscalars != 0)
+	    return false;
+
+	  nscalars = 1;
+	  insert = true;
+	  scalar_idx = i;
+	  sel.quick_push (i);
+	  scalar_element = elt->value;
+	  continue;
+	}
     }
   if (i < nelts)
     return false;
 
+  if (insert
+      && (nvectors == 0
+	  || (TYPE_VECTOR_SUBPARTS (type).to_constant ()
+	      != (nscalars + nvectors))))
+    return false;
+
   if (! VECTOR_TYPE_P (TREE_TYPE (orig[0]))
       || maybe_ne (TYPE_VECTOR_SUBPARTS (type),
 		   TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0]))))
@@ -2157,18 +2194,26 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 
       vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts);
       if (!can_vec_perm_const_p (TYPE_MODE (type), indices))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       mask_type
 	= build_vector_type (build_nonstandard_integer_type (elem_size, 1),
 			     nelts);
       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT
 	  || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)),
 		       GET_MODE_SIZE (TYPE_MODE (type))))
-	return false;
+	{
+	  if (insert)
+	    gcc_unreachable ();
+	  return false;
+	}
       op2 = vec_perm_indices_to_tree (mask_type, indices);
       if (!orig[1])
 	orig[1] = orig[0];
-      if (orig[1] == error_mark_node)
+      if (orig[1] == error_mark_node && !insert)
 	{
 	  tree_vector_builder vec (type, nelts, 1);
 	  for (unsigned i = 0; i < nelts; ++i)
@@ -2198,10 +2243,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 				   VEC_PERM_EXPR, orig[0], orig[1], op2);
 	  orig[0] = gimple_assign_lhs (perm);
 	  gsi_insert_before (gsi, perm, GSI_SAME_STMT);
-	  gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0],
+	  gimple_assign_set_rhs_with_ops (gsi,
+					  (conv_code != ERROR_MARK
+					   ? conv_code
+					   : NOP_EXPR),
+					  orig[0],
 					  NULL_TREE, NULL_TREE);
 	}
     }
+  if (insert)
+    {
+      /* Generate a single scalar insert.  */
+      tree var = make_ssa_name (type);
+      gimple *copy = gimple_copy (gsi_stmt (*gsi));
+      gimple_assign_set_lhs (copy, var);
+      gsi_insert_before (gsi, copy, GSI_SAME_STMT);
+      tree bitpos = bitsize_int (scalar_idx * elem_size);
+      gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var,
+				      scalar_element, bitpos);
+    }
   update_stmt (gsi_stmt (*gsi));
   return true;
 }
-- 
2.20.1

Reply via email to