On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote: > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote: > > > > ) > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > > > > > For vector init constructor: > > > > > > > > > > --- > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > > > > > > > > > __v4sf > > > > > foo (__v4sf x, float f) > > > > > { > > > > > __v4sf y = { f, x[1], x[2], x[3] }; > > > > > return y; > > > > > } > > > > > --- > > > > > > > > > > we can optimize vector init constructor with vector copy or permute > > > > > followed by a single scalar insert: > > > and you want to advance to the _1 = BIT_INSERT_EXPR here. The easiest way > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the > > BIT_INSERT_EXPR. > > Thanks for BIT_INSERT_EXPR suggestion. I am testing this patch. > > > H.J. > --- > We can optimize vector constructor with vector copy or permute followed > by a single scalar insert: > > __v4sf y; > __v4sf D.1930; > float _1; > float _2; > float _3; > > <bb 2> : > _1 = BIT_FIELD_REF <x_9(D), 32, 96>; > _2 = BIT_FIELD_REF <x_9(D), 32, 64>; > _3 = BIT_FIELD_REF <x_9(D), 32, 32>; > y_6 = {f_5(D), _3, _2, _1}; > return y_6; > > with > > __v4sf y; > __v4sf D.1930; > float _1; > float _2; > float _3; > vector(4) float _8; > > <bb 2> : > _1 = BIT_FIELD_REF <x_9(D), 32, 96>; > _2 = BIT_FIELD_REF <x_9(D), 32, 64>; > _3 = BIT_FIELD_REF <x_9(D), 32, 32>; > _8 = x_9(D); > y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>; > return y_6; > > gcc/ > > PR tree-optimization/88828 > * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize > vector init constructor with vector copy or permute followed > by a single scalar insert. 
> > gcc/testsuite/ > > PR tree-optimization/88828 > * gcc.target/i386/pr88828-1a.c: New test. > * gcc.target/i386/pr88828-2b.c: Likewise. > * gcc.target/i386/pr88828-2.c: Likewise. > * gcc.target/i386/pr88828-3a.c: Likewise. > * gcc.target/i386/pr88828-3b.c: Likewise. > * gcc.target/i386/pr88828-3c.c: Likewise. > * gcc.target/i386/pr88828-3d.c: Likewise. > * gcc.target/i386/pr88828-4a.c: Likewise. > * gcc.target/i386/pr88828-4b.c: Likewise. > * gcc.target/i386/pr88828-5a.c: Likewise. > * gcc.target/i386/pr88828-5b.c: Likewise. > * gcc.target/i386/pr88828-6a.c: Likewise. > * gcc.target/i386/pr88828-6b.c: Likewise.
Here is the updated patch with run-time tests. -- H.J.
From b2bc0bf3a8ee17d53bf39f0aeabe7025b33e9c96 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.to...@gmail.com> Date: Tue, 5 Feb 2019 15:39:27 -0800 Subject: [PATCH] Optimize vector constructor We can optimize vector constructor with vector copy or permute followed by a single scalar insert: __v4sf y; __v4sf D.1930; float _1; float _2; float _3; <bb 2> : _1 = BIT_FIELD_REF <x_9(D), 32, 96>; _2 = BIT_FIELD_REF <x_9(D), 32, 64>; _3 = BIT_FIELD_REF <x_9(D), 32, 32>; y_6 = {f_5(D), _3, _2, _1}; return y_6; with __v4sf y; __v4sf D.1930; float _1; float _2; float _3; vector(4) float _8; <bb 2> : _1 = BIT_FIELD_REF <x_9(D), 32, 96>; _2 = BIT_FIELD_REF <x_9(D), 32, 64>; _3 = BIT_FIELD_REF <x_9(D), 32, 32>; _8 = x_9(D); y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>; return y_6; gcc/ PR tree-optimization/88828 * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize vector init constructor with vector copy or permute followed by a single scalar insert. gcc/testsuite/ PR tree-optimization/88828 * gcc.target/i386/pr88828-1.c: New test. * gcc.target/i386/pr88828-1a.c: Likewise. * gcc.target/i386/pr88828-1b.c: Likewise. * gcc.target/i386/pr88828-1c.c: Likewise. * gcc.target/i386/pr88828-2.c: Likewise. * gcc.target/i386/pr88828-2a.c: Likewise. * gcc.target/i386/pr88828-2b.c: Likewise. * gcc.target/i386/pr88828-2c.c: Likewise. * gcc.target/i386/pr88828-2d.c: Likewise. * gcc.target/i386/pr88828-3.c: Likewise. * gcc.target/i386/pr88828-3a.c: Likewise. * gcc.target/i386/pr88828-3b.c: Likewise. * gcc.target/i386/pr88828-3c.c: Likewise. * gcc.target/i386/pr88828-3d.c: Likewise. * gcc.target/i386/pr88828-4a.c: Likewise. * gcc.target/i386/pr88828-4b.c: Likewise. * gcc.target/i386/pr88828-5a.c: Likewise. * gcc.target/i386/pr88828-5b.c: Likewise. 
--- gcc/testsuite/gcc.target/i386/pr88828-1.c | 49 +++++++++++++ gcc/testsuite/gcc.target/i386/pr88828-1a.c | 17 +++++ gcc/testsuite/gcc.target/i386/pr88828-1b.c | 23 ++++++ gcc/testsuite/gcc.target/i386/pr88828-1c.c | 18 +++++ gcc/testsuite/gcc.target/i386/pr88828-2.c | 51 +++++++++++++ gcc/testsuite/gcc.target/i386/pr88828-2a.c | 17 +++++ gcc/testsuite/gcc.target/i386/pr88828-2b.c | 19 +++++ gcc/testsuite/gcc.target/i386/pr88828-2c.c | 23 ++++++ gcc/testsuite/gcc.target/i386/pr88828-2d.c | 25 +++++++ gcc/testsuite/gcc.target/i386/pr88828-3.c | 54 ++++++++++++++ gcc/testsuite/gcc.target/i386/pr88828-3a.c | 17 +++++ gcc/testsuite/gcc.target/i386/pr88828-3b.c | 19 +++++ gcc/testsuite/gcc.target/i386/pr88828-3c.c | 25 +++++++ gcc/testsuite/gcc.target/i386/pr88828-4a.c | 18 +++++ gcc/testsuite/gcc.target/i386/pr88828-4b.c | 21 ++++++ gcc/testsuite/gcc.target/i386/pr88828-5a.c | 18 +++++ gcc/testsuite/gcc.target/i386/pr88828-5b.c | 20 +++++ gcc/tree-ssa-forwprop.c | 85 +++++++++++++++++++--- 18 files changed, 509 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1c.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2c.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2d.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c create mode 
100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c new file mode 100644 index 00000000000..a15d1fea3f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c @@ -0,0 +1,49 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-1a.c" +#include "pr88828-1b.c" +#include "pr88828-1c.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (y[i] != z) + abort (); + } + else + { + if (y[i] != f[i]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 134567; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x, z); + do_check (y, f, z); + y = foo2 (x, z); + do_check (y, f, z); + y = foo3 (x, z); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c new file mode 100644 index 00000000000..d37b24c6661 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { f, x[1], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c new file mode 100644 index 00000000000..af4aced65f8 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/pr88828-1b.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo2 (__v4sf x, float f) +{ + return vector_init (f, x[1], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c new file mode 100644 index 00000000000..a117f3ec7b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo3 (__v4sf x, float f) +{ + __v4sf y = x; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c new file mode 100644 index 00000000000..011fd486bb1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c @@ -0,0 +1,51 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-2a.c" +#include "pr88828-2c.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (y[i] != z) + abort (); + } + else if (i == 1) + { + if (y[i] 
!= f[0]) + abort (); + } + else + { + if (y[i] != f[i]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 134567; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x, z); + do_check (y, f, z); + y = foo2 (x, z); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2a.c b/gcc/testsuite/gcc.target/i386/pr88828-2a.c new file mode 100644 index 00000000000..85e49535ebd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { f, x[0], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2b.c b/gcc/testsuite/gcc.target/i386/pr88828-2b.c new file mode 100644 index 00000000000..adfd7002a4d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2b.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { f, x[0], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2c.c b/gcc/testsuite/gcc.target/i386/pr88828-2c.c new file mode 100644 index 00000000000..149967ea0b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2c.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo2 (__v4sf x, float f) +{ + return vector_init (f, x[0], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2d.c b/gcc/testsuite/gcc.target/i386/pr88828-2d.c new file mode 100644 index 00000000000..21088496730 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2d.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + return vector_init (f, x[0], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3.c b/gcc/testsuite/gcc.target/i386/pr88828-3.c new file mode 100644 index 00000000000..adbc46dbf3b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3.c @@ -0,0 +1,54 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-3a.c" +#include "pr88828-3b.c" +#include "pr88828-3c.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 3) + { + if (y[i] != z) + abort (); + } + else if (i == 0) + { + if (y[i] != f[i]) + abort (); + } + else + { + if (y[i] != f[i + 1]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 134567; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x, z); + do_check (y, f, z); + y = foo2 (x, z); + do_check (y, f, z); + y = foo3 (x, z); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c new file mode 100644 index 00000000000..e5cb95c1275 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final 
{ scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], f }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c new file mode 100644 index 00000000000..0349f35b08a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo2 (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], f }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c new file mode 100644 index 00000000000..fb668a55f1d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ 
(16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo3 (__v4sf x, float f) +{ + return vector_init (x[0], x[2], x[3], f); +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c new file mode 100644 index 00000000000..64043b9855f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c new file mode 100644 index 00000000000..ad8d2b985d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c new file mode 100644 index 00000000000..5e908faef5c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c new file mode 100644 index 00000000000..988a48823e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { 
scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index eeb6281c652..85d9f86288b 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) unsigned elem_size, i; unsigned HOST_WIDE_INT nelts; enum tree_code code, conv_code; - constructor_elt *elt; + constructor_elt *ce; bool maybe_ident; gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR); @@ -2027,18 +2027,42 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) orig[1] = NULL; conv_code = ERROR_MARK; maybe_ident = true; - FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) + + tree rhs_vector = NULL; + /* The single scalar element. */ + tree scalar_element = NULL; + unsigned int scalar_idx = 0; + bool insert = false; + unsigned int nscalars = 0; + unsigned int nvectors = 0; + FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce) { tree ref, op1; if (i >= nelts) return false; - if (TREE_CODE (elt->value) != SSA_NAME) + if (TREE_CODE (ce->value) != SSA_NAME) return false; - def_stmt = get_prop_source_stmt (elt->value, false, NULL); + def_stmt = get_prop_source_stmt (ce->value, false, NULL); if (!def_stmt) - return false; + { + if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value))) + { + /* Only allow one scalar insert. 
*/ + if (nscalars != 0) + return false; + + nscalars = 1; + insert = true; + scalar_idx = i; + sel.quick_push (i); + scalar_element = ce->value; + continue; + } + else + return false; + } code = gimple_assign_rhs_code (def_stmt); if (code == FLOAT_EXPR || code == FIX_TRUNC_EXPR) @@ -2046,7 +2070,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) op1 = gimple_assign_rhs1 (def_stmt); if (conv_code == ERROR_MARK) { - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))), + if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))), GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) return false; conv_code = code; @@ -2095,11 +2119,29 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) elt += nelts; if (elt != i) maybe_ident = false; + + if (type == TREE_TYPE (ref)) + { + /* The RHS vector has the same type as LHS. */ + if (rhs_vector == NULL) + rhs_vector = ref; + /* Check if all RHS vector elements come from the same + vector. */ + if (rhs_vector == ref) + nvectors++; + } + sel.quick_push (elt); } if (i < nelts) return false; + if (insert + && (nvectors == 0 + || (TYPE_VECTOR_SUBPARTS (type).to_constant () + != (nscalars + nvectors)))) + return false; + if (! VECTOR_TYPE_P (TREE_TYPE (orig[0])) || maybe_ne (TYPE_VECTOR_SUBPARTS (type), TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0])))) @@ -2127,18 +2169,26 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) vec_perm_indices indices (sel, orig[1] ? 
2 : 1, nelts); if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) - return false; + { + if (insert) + gcc_unreachable (); + return false; + } mask_type = build_vector_type (build_nonstandard_integer_type (elem_size, 1), nelts); if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), GET_MODE_SIZE (TYPE_MODE (type)))) - return false; + { + if (insert) + gcc_unreachable (); + return false; + } op2 = vec_perm_indices_to_tree (mask_type, indices); if (!orig[1]) orig[1] = orig[0]; - if (conv_code == ERROR_MARK) + if (conv_code == ERROR_MARK && !insert) gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0], orig[1], op2); else @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) VEC_PERM_EXPR, orig[0], orig[1], op2); orig[0] = gimple_assign_lhs (perm); gsi_insert_before (gsi, perm, GSI_SAME_STMT); - gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0], + gimple_assign_set_rhs_with_ops (gsi, + (conv_code != ERROR_MARK + ? conv_code + : NOP_EXPR), + orig[0], NULL_TREE, NULL_TREE); } } + if (insert) + { + /* Generate a single scalar insert. */ + tree var = make_ssa_name (type); + tree val = gimple_assign_rhs1 (stmt); + gimple *copy = gimple_build_assign (var, val); + gsi_insert_before (gsi, copy, GSI_SAME_STMT); + tree bitpos = bitsize_int (scalar_idx * elem_size); + gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var, + scalar_element, bitpos); + } update_stmt (gsi_stmt (*gsi)); return true; } -- 2.20.1