Re: [PATCH] vect: Multistep float->int conversion only with no trapping math

2024-08-20 Thread Juergen Christ
Am Tue, Aug 20, 2024 at 02:51:02PM +0200 schrieb Richard Biener:
> On Tue, Aug 20, 2024 at 11:16 AM Juergen Christ  wrote:
> >
> > Am Tue, Aug 20, 2024 at 10:15:22AM +0200 schrieb Richard Biener:
> > > On Fri, Aug 9, 2024 at 2:58 PM Juergen Christ  
> > > wrote:
> > > >
> > > > Am Thu, Aug 08, 2024 at 02:06:44PM +0200 schrieb Richard Biener:
> > > > > On Mon, Aug 5, 2024 at 4:02 PM Juergen Christ  
> > > > > wrote:
> > > > > >
> > > > > > Am Mon, Aug 05, 2024 at 01:00:31PM +0200 schrieb Richard Biener:
> > > > > > > On Fri, Aug 2, 2024 at 2:43 PM Juergen Christ 
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > Do not convert floats to ints in multiple step if trapping math 
> > > > > > > > is
> > > > > > > > enabled.  This might hide some inexact signals.
> > > > > > > >
> > > > > > > > Also use correct sign (the sign of the target integer type) for 
> > > > > > > > the
> > > > > > > > intermediate steps.  This only affects undefined behaviour 
> > > > > > > > (casting
> > > > > > > > floats to unsigned datatype where the float is negative).
> > > > > > > >
> > > > > > > > gcc/ChangeLog:
> > > > > > > >
> > > > > > > > * tree-vect-stmts.cc (vectorizable_conversion): 
> > > > > > > > multi-step
> > > > > > > >   float to int conversion only with trapping math and 
> > > > > > > > correct
> > > > > > > >   sign.
> > > > > > > >
> > > > > > > > Signed-off-by: Juergen Christ 
> > > > > > > >
> > > > > > > > Bootstrapped and tested on x84 and s390.  Ok for trunk?
> > > > > > > >
> > > > > > > > ---
> > > > > > > >  gcc/tree-vect-stmts.cc | 8 +---
> > > > > > > >  1 file changed, 5 insertions(+), 3 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > > > > index fdcda0d2abae..2ddd13383193 100644
> > > > > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > > > > @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo,
> > > > > > > > break;
> > > > > > > >
> > > > > > > >   cvt_type
> > > > > > > > -   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > > > > > (rhs_mode), 0);
> > > > > > > > +   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > > > > > (rhs_mode),
> > > > > > > > + TYPE_UNSIGNED 
> > > > > > > > (lhs_type));
> > > > > > >
> > > > > > > But lhs_type should be a float type here, the idea that for a
> > > > > > > FLOAT_EXPR (int -> float)
> > > > > > > a signed integer type is the natural one to use - as it's 2x wider
> > > > > > > than the original
> > > > > > > RHS type it's signedness doesn't matter.  Note all float types 
> > > > > > > should be
> > > > > > > !TYPE_UNSIGNED so this hunk is a no-op but still less clear on 
> > > > > > > the intent IMO.
> > > > > > >
> > > > > > > Please drop it.
> > > > > >
> > > > > > Will do.  Sorry about that.
> > > > > >
> > > > > > > >   cvt_type = get_same_sized_vectype (cvt_type, 
> > > > > > > > vectype_in);
> > > > > > > >   if (cvt_type == NULL_TREE)
> > > > > > > > goto unsupported;
> > > > > > > > @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info 
> > > > > > > > *vinfo,
> > > > > > > >if (GET_MODE_SIZE (lhs_mode) >= GET_MODE

Re: [PATCH] vect: Multistep float->int conversion only with no trapping math

2024-08-20 Thread Juergen Christ
Am Tue, Aug 20, 2024 at 10:15:22AM +0200 schrieb Richard Biener:
> On Fri, Aug 9, 2024 at 2:58 PM Juergen Christ  wrote:
> >
> > Am Thu, Aug 08, 2024 at 02:06:44PM +0200 schrieb Richard Biener:
> > > On Mon, Aug 5, 2024 at 4:02 PM Juergen Christ  
> > > wrote:
> > > >
> > > > Am Mon, Aug 05, 2024 at 01:00:31PM +0200 schrieb Richard Biener:
> > > > > On Fri, Aug 2, 2024 at 2:43 PM Juergen Christ  
> > > > > wrote:
> > > > > >
> > > > > > Do not convert floats to ints in multiple step if trapping math is
> > > > > > enabled.  This might hide some inexact signals.
> > > > > >
> > > > > > Also use correct sign (the sign of the target integer type) for the
> > > > > > intermediate steps.  This only affects undefined behaviour (casting
> > > > > > floats to unsigned datatype where the float is negative).
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > * tree-vect-stmts.cc (vectorizable_conversion): multi-step
> > > > > >   float to int conversion only with trapping math and 
> > > > > > correct
> > > > > >   sign.
> > > > > >
> > > > > > Signed-off-by: Juergen Christ 
> > > > > >
> > > > > > Bootstrapped and tested on x84 and s390.  Ok for trunk?
> > > > > >
> > > > > > ---
> > > > > >  gcc/tree-vect-stmts.cc | 8 +---
> > > > > >  1 file changed, 5 insertions(+), 3 deletions(-)
> > > > > >
> > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > > index fdcda0d2abae..2ddd13383193 100644
> > > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > > @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo,
> > > > > > break;
> > > > > >
> > > > > >   cvt_type
> > > > > > -   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > > > (rhs_mode), 0);
> > > > > > +   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > > > (rhs_mode),
> > > > > > + TYPE_UNSIGNED 
> > > > > > (lhs_type));
> > > > >
> > > > > But lhs_type should be a float type here, the idea that for a
> > > > > FLOAT_EXPR (int -> float)
> > > > > a signed integer type is the natural one to use - as it's 2x wider
> > > > > than the original
> > > > > RHS type it's signedness doesn't matter.  Note all float types should 
> > > > > be
> > > > > !TYPE_UNSIGNED so this hunk is a no-op but still less clear on the 
> > > > > intent IMO.
> > > > >
> > > > > Please drop it.
> > > >
> > > > Will do.  Sorry about that.
> > > >
> > > > > >   cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
> > > > > >   if (cvt_type == NULL_TREE)
> > > > > > goto unsupported;
> > > > > > @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info *vinfo,
> > > > > >if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
> > > > > > goto unsupported;
> > > > > >
> > > > > > -  if (code == FIX_TRUNC_EXPR)
> > > > > > +  if (code == FIX_TRUNC_EXPR && !flag_trapping_math)
> > > > > > {
> > > > > >   cvt_type
> > > > > > -   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > > > (rhs_mode), 0);
> > > > > > +   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > > > (rhs_mode),
> > > > > > + TYPE_UNSIGNED 
> > > > > > (lhs_type));
> > > > >
> > > > > Here it might be relevant for correctness - we have to choose between
> > > > > sfix and ufix for the float -> [u]int conversion.
> > > > >
> > > > > Do  you have a testcase?  Shouldn't the exactness be independent of 
> > > > >

Re: [PATCH] vect: Multistep float->int conversion only with no trapping math

2024-08-09 Thread Juergen Christ
Am Thu, Aug 08, 2024 at 02:06:44PM +0200 schrieb Richard Biener:
> On Mon, Aug 5, 2024 at 4:02 PM Juergen Christ  wrote:
> >
> > Am Mon, Aug 05, 2024 at 01:00:31PM +0200 schrieb Richard Biener:
> > > On Fri, Aug 2, 2024 at 2:43 PM Juergen Christ  
> > > wrote:
> > > >
> > > > Do not convert floats to ints in multiple step if trapping math is
> > > > enabled.  This might hide some inexact signals.
> > > >
> > > > Also use correct sign (the sign of the target integer type) for the
> > > > intermediate steps.  This only affects undefined behaviour (casting
> > > > floats to unsigned datatype where the float is negative).
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * tree-vect-stmts.cc (vectorizable_conversion): multi-step
> > > >   float to int conversion only with trapping math and correct
> > > >   sign.
> > > >
> > > > Signed-off-by: Juergen Christ 
> > > >
> > > > Bootstrapped and tested on x84 and s390.  Ok for trunk?
> > > >
> > > > ---
> > > >  gcc/tree-vect-stmts.cc | 8 +---
> > > >  1 file changed, 5 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > index fdcda0d2abae..2ddd13383193 100644
> > > > --- a/gcc/tree-vect-stmts.cc
> > > > +++ b/gcc/tree-vect-stmts.cc
> > > > @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo,
> > > > break;
> > > >
> > > >   cvt_type
> > > > -   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > (rhs_mode), 0);
> > > > +   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > (rhs_mode),
> > > > + TYPE_UNSIGNED (lhs_type));
> > >
> > > But lhs_type should be a float type here, the idea that for a
> > > FLOAT_EXPR (int -> float)
> > > a signed integer type is the natural one to use - as it's 2x wider
> > > than the original
> > > RHS type it's signedness doesn't matter.  Note all float types should be
> > > !TYPE_UNSIGNED so this hunk is a no-op but still less clear on the intent 
> > > IMO.
> > >
> > > Please drop it.
> >
> > Will do.  Sorry about that.
> >
> > > >   cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
> > > >   if (cvt_type == NULL_TREE)
> > > > goto unsupported;
> > > > @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info *vinfo,
> > > >if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
> > > > goto unsupported;
> > > >
> > > > -  if (code == FIX_TRUNC_EXPR)
> > > > +  if (code == FIX_TRUNC_EXPR && !flag_trapping_math)
> > > > {
> > > >   cvt_type
> > > > -   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > (rhs_mode), 0);
> > > > +   = build_nonstandard_integer_type (GET_MODE_BITSIZE 
> > > > (rhs_mode),
> > > > + TYPE_UNSIGNED (lhs_type));
> > >
> > > Here it might be relevant for correctness - we have to choose between
> > > sfix and ufix for the float -> [u]int conversion.
> > >
> > > Do  you have a testcase?  Shouldn't the exactness be independent of the 
> > > integer
> > > type we convert to?
> >
> > I was looking at this little program which contains undefined behaviour:
> >
> > #include 
> >
> > __attribute__((noinline,noclone,noipa))
> > void
> > vec_pack_ufix_trunc_v2df (double *in, unsigned int *out);
> >
> > void
> > vec_pack_ufix_trunc_v2df (double *in, unsigned int *out)
> > {
> > out[0] = in[0];
> > out[1] = in[1];
> > out[2] = in[2];
> > out[3] = in[3];
> > }
> >
> > int main()
> > {
> > double in[] = {-1,-2,-3,-4};
> > unsigned int out[4];
> >
> > vec_pack_ufix_trunc_v2df (in, out);
> > for (int i = 0; i < 4; ++i)
> > printf("out[%d] = %u\n", i, out[i]);
> > return 0;
> > }
> >
> > On s390x, I get different results after vectorization:
> >
> > out[0]

Re: [PATCH] vect: Multistep float->int conversion only with no trapping math

2024-08-05 Thread Juergen Christ
Am Mon, Aug 05, 2024 at 01:00:31PM +0200 schrieb Richard Biener:
> On Fri, Aug 2, 2024 at 2:43 PM Juergen Christ  wrote:
> >
> > Do not convert floats to ints in multiple step if trapping math is
> > enabled.  This might hide some inexact signals.
> >
> > Also use correct sign (the sign of the target integer type) for the
> > intermediate steps.  This only affects undefined behaviour (casting
> > floats to unsigned datatype where the float is negative).
> >
> > gcc/ChangeLog:
> >
> > * tree-vect-stmts.cc (vectorizable_conversion): multi-step
> >   float to int conversion only with trapping math and correct
> >   sign.
> >
> > Signed-off-by: Juergen Christ 
> >
> > Bootstrapped and tested on x84 and s390.  Ok for trunk?
> >
> > ---
> >  gcc/tree-vect-stmts.cc | 8 +---
> >  1 file changed, 5 insertions(+), 3 deletions(-)
> >
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index fdcda0d2abae..2ddd13383193 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo,
> > break;
> >
> >   cvt_type
> > -   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 
> > 0);
> > +   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode),
> > + TYPE_UNSIGNED (lhs_type));
> 
> But lhs_type should be a float type here, the idea that for a
> FLOAT_EXPR (int -> float)
> a signed integer type is the natural one to use - as it's 2x wider
> than the original
> RHS type it's signedness doesn't matter.  Note all float types should be
> !TYPE_UNSIGNED so this hunk is a no-op but still less clear on the intent IMO.
> 
> Please drop it.

Will do.  Sorry about that.

> >   cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
> >   if (cvt_type == NULL_TREE)
> > goto unsupported;
> > @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info *vinfo,
> >if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
> > goto unsupported;
> >
> > -  if (code == FIX_TRUNC_EXPR)
> > +  if (code == FIX_TRUNC_EXPR && !flag_trapping_math)
> > {
> >   cvt_type
> > -   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 
> > 0);
> > +   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode),
> > + TYPE_UNSIGNED (lhs_type));
> 
> Here it might be relevant for correctness - we have to choose between
> sfix and ufix for the float -> [u]int conversion.
> 
> Do  you have a testcase?  Shouldn't the exactness be independent of the 
> integer
> type we convert to?

I was looking at this little program which contains undefined behaviour:

#include <stdio.h>

__attribute__((noinline,noclone,noipa))
void
vec_pack_ufix_trunc_v2df (double *in, unsigned int *out);

void
vec_pack_ufix_trunc_v2df (double *in, unsigned int *out)
{
out[0] = in[0];
out[1] = in[1];
out[2] = in[2];
out[3] = in[3];
}

int main()
{
double in[] = {-1,-2,-3,-4};
unsigned int out[4];

vec_pack_ufix_trunc_v2df (in, out);
for (int i = 0; i < 4; ++i)
printf("out[%d] = %u\n", i, out[i]);
return 0;
}

On s390x, I get different results after vectorization:

out[0] = 4294967295
out[1] = 4294967294
out[2] = 4294967293
out[3] = 4294967292

than without vectorization:

out[0] = 0
out[1] = 0
out[2] = 0
out[3] = 0

Even if this is undefined behaviour, I think it would be nice to have
consistent results here.

Also, while I added an expander to circumvent this problem in a
previous patch, reviewers requested to hide this behind trapping math.
Thus, I looked into this.

Seeing the result from the CI for aarch64, I guess there are some
tests that actually expect this vectorization to always happen even
though it might not be safe w.r.t. trapping math.

> 
> >   cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
> >   if (cvt_type == NULL_TREE)
> > goto unsupported;
> > --
> > 2.43.5
> >


[PATCH] vect: Multistep float->int conversion only with no trapping math

2024-08-02 Thread Juergen Christ
Do not convert floats to ints in multiple steps if trapping math is
enabled.  This might hide some inexact signals.

Also use correct sign (the sign of the target integer type) for the
intermediate steps.  This only affects undefined behaviour (casting
floats to unsigned datatype where the float is negative).

gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_conversion): multi-step
  float to int conversion only without trapping math and with
  correct sign.

Signed-off-by: Juergen Christ 

Bootstrapped and tested on x86 and s390.  Ok for trunk?

---
 gcc/tree-vect-stmts.cc | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index fdcda0d2abae..2ddd13383193 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo,
break;
 
  cvt_type
-   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
+   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode),
+ TYPE_UNSIGNED (lhs_type));
  cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
  if (cvt_type == NULL_TREE)
goto unsupported;
@@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info *vinfo,
   if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
goto unsupported;
 
-  if (code == FIX_TRUNC_EXPR)
+  if (code == FIX_TRUNC_EXPR && !flag_trapping_math)
{
  cvt_type
-   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
+   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode),
+ TYPE_UNSIGNED (lhs_type));
  cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
  if (cvt_type == NULL_TREE)
goto unsupported;
-- 
2.43.5



[PATCH] s390: define single step vector casts

2024-06-20 Thread Juergen Christ
Some casts were missing, leading to missed or bad vectorizations where
casting was done scalar followed by a vector creation from the
individual elements.

gcc/ChangeLog:

* config/s390/vector.md (VEC_HALF_NARROWED): New mode iterator.
(vec_half_narrowed): ditto.
(trunc2): New pattern.
(vec_pack_ufix_trunc_v2df): ditto.
(vec_pack_sfix_trunc_v2df): ditto.
(vec_unpack_sfix_trunc_lo_v4sf): ditto.
(vec_unpack_sfix_trunc_hi_v4sf): ditto.
(vec_unpack_ufix_trunc_lo_v4sf): ditto.
(vec_unpack_ufix_trunc_hi_v4sf): ditto.
(floatv2siv2sf2): ditto.
(floatunsv2siv2sf2): ditto.
(vec_unpacks_float_hi_v4si): ditto.
(vec_unpacks_float_lo_v4si): ditto.
(vec_unpacku_float_hi_v4si): ditto.
(vec_unpacku_float_lo_v4si): ditto.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-cast-single.c: New test.
* gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c: New test.

Bootstrapped and regtested on s390x.  Ok for trunk?

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/vector.md | 170 ++-
 .../gcc.target/s390/vector/vec-cast-single.c  | 271 ++
 .../s390/vector/vec_pack_ufix_trunc_v2df.c|  30 ++
 3 files changed, 463 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c
 create mode 100644 
gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 40de0c75a7cf..356f25d26deb 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -89,6 +89,8 @@
 
 (define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI])
 
+(define_mode_iterator VI_TRUNC [V2HI V2SI V2DI V4HI V4SI])
+
 ; Empty string for all but TImode.  This is used to hide the TImode
 ; expander name in case it is defined already.  See addti3 for an
 ; example.
@@ -211,6 +213,14 @@
   (V1SF "v1df") (V2SF "v2df") (V4SF "v4df")
   (V1DF "v1tf") (V2DF "v2tf")])
 
+; Vector with narrowed element size and the same number of elements.
+(define_mode_attr VEC_HALF_NARROWED [(V1HI "V1QI") (V2HI "V2QI") (V4HI "V4QI") 
(V8HI "V8QI")
+   (V1SI "V1HI") (V2SI "V2HI") (V4SI "V4HI")
+  (V1DI "V1DI") (V2DI "V2SI")])
+(define_mode_attr vec_half_narrowed [(V1HI "v1qi") (V2HI "v2qi") (V4HI "v4qi") 
(V8HI "v8qi")
+   (V1SI "v1hi") (V2SI "v2hi") (V4SI "v4hi")
+  (V1DI "v1di") (V2DI "v2si")])
+
 ; Vector with half the element size AND half the number of elements.
 (define_mode_attr vec_halfhalf
   [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
@@ -2422,6 +2432,17 @@
   operands[2] = gen_reg_rtx (V4SFmode);
 })
 
+;; vector truncate
+
+; downcasts
+
+(define_insn "trunc2"
+  [(set (match_operand: 0 "register_operand" "=v")
+(truncate: (match_operand:VI_TRUNC 1 "register_operand" 
"v")))]
+  "TARGET_VX"
+  "vpk\t %0,%1,%1"
+  [(set_attr "op_type" "VRR")])
+
 ;; vector unpack v16qi
 
 ; signed
@@ -3177,17 +3198,150 @@
   emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
   emit_insn (gen_vstlv16qi (operands[1], len, mem));
   DONE;
-});;
+})
+
+(define_expand "vec_pack_ufix_trunc_v2df"
+  [(match_operand:V4SI 0 "register_operand")
+   (match_operand:V2DF 1 "register_operand")
+   (match_operand:V2DF 2 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r1 = gen_reg_rtx (V2DImode);
+  rtx r2 = gen_reg_rtx (V2DImode);
+
+  emit_insn (gen_fixuns_truncv2dfv2di2 (r1, operands[1]));
+  emit_insn (gen_fixuns_truncv2dfv2di2 (r2, operands[2]));
+  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2));
+  DONE;
+})
+
+(define_expand "vec_pack_sfix_trunc_v2df"
+  [(match_operand:V4SI 0 "register_operand")
+   (match_operand:V2DF 1 "register_operand")
+   (match_operand:V2DF 2 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r1 = gen_reg_rtx (V2DImode);
+  rtx r2 = gen_reg_rtx (V2DImode);
+
+  emit_insn (gen_fix_truncv2dfv2di2 (r1, operands[1]));
+  emit_insn (gen_fix_truncv2dfv2di2 (r2, operands[2]));
+  emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2));
+  DONE;
+})
+
+; v4sf -> v2di
+(define_expand "vec_unpack_sfix_trunc_lo_v4sf"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SF 1 "register_operand")]
+  "TARGET_VX"
+{
+  rtx r = gen_reg_rtx(V4SImode);
+
+  emit_insn (gen_fix_truncv4sfv4si2 (r, operands

Re: [PATCH v2] s390x: Optimize vector permute with constant indexes

2024-04-09 Thread Juergen Christ
Am Tue, Apr 09, 2024 at 05:01:18PM +0200 schrieb Andreas Krebbel:
> On 4/9/24 16:31, Juergen Christ wrote:
> > Loop vectorizer can generate vector permutes with constant indexes
> > where all indexes are equal.  Optimize this case to use vector
> > replicate instead of vector permute.
> > 
> > gcc/ChangeLog:
> > 
> > * config/s390/s390.cc (expand_perm_as_replicate): Implement.
> > (vectorize_vec_perm_const_1): Call new function.
> > * config/s390/vx-builtins.md (vec_splat): Change to...
> > (@vec_splat): ...this.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > * gcc.target/s390/vector/vec-expand-replicate.c: New test.
> > 
> > Bootstrapped and regtested on s390x.  Ok for trunk?
> 
> Does this also work when using the vec_perm intrinsic or would we need to 
> define a matching RTX for
> that?

Unfortunately, it does not work with vec_perm.

> Ok. Thanks!

Pushed.

Juergen


[PATCH v2] s390x: Optimize vector permute with constant indexes

2024-04-09 Thread Juergen Christ
Loop vectorizer can generate vector permutes with constant indexes
where all indexes are equal.  Optimize this case to use vector
replicate instead of vector permute.

gcc/ChangeLog:

* config/s390/s390.cc (expand_perm_as_replicate): Implement.
(vectorize_vec_perm_const_1): Call new function.
* config/s390/vx-builtins.md (vec_splat): Change to...
(@vec_splat): ...this.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-expand-replicate.c: New test.

Bootstrapped and regtested on s390x.  Ok for trunk?

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/s390.cc   | 33 ++
 gcc/config/s390/vx-builtins.md|  2 +-
 .../s390/vector/vec-expand-replicate.c| 60 +++
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 372a23244032..3148f163627c 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17923,6 +17923,36 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct 
expand_vec_perm_d &d)
   return false;
 }
 
+static bool
+expand_perm_as_replicate (const struct expand_vec_perm_d &d)
+{
+  unsigned char i;
+  unsigned char elem;
+  rtx base = d.op0;
+  rtx insn;
+  /* Needed to silence maybe-uninitialized warning.  */
+  gcc_assert (d.nelt > 0);
+  elem = d.perm[0];
+  for (i = 1; i < d.nelt; ++i)
+if (d.perm[i] != elem)
+  return false;
+  if (!d.testing_p)
+{
+  if (elem >= d.nelt)
+   {
+ base = d.op1;
+ elem -= d.nelt;
+   }
+  insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem));
+  if (insn == NULL_RTX)
+   return false;
+  emit_insn (insn);
+  return true;
+}
+  else
+return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing;
+}
+
 /* Try to find the best sequence for the vector permute operation
described by D.  Return true if the operation could be
expanded.  */
@@ -17941,6 +17971,9 @@ vectorize_vec_perm_const_1 (const struct 
expand_vec_perm_d &d)
   if (expand_perm_as_a_vlbr_vstbr_candidate (d))
 return true;
 
+  if (expand_perm_as_replicate (d))
+return true;
+
   return false;
 }
 
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 432d81a719fc..93c0d408a43e 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -424,7 +424,7 @@
 
 
 ; Replicate from vector element
-(define_expand "vec_splat"
+(define_expand "@vec_splat"
   [(set (match_operand:V_HW  0 "register_operand"  "")
(vec_duplicate:V_HW (vec_select:
 (match_operand:V_HW 1 "register_operand"  "")
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c 
b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
new file mode 100644
index ..872b1c9321cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
@@ -0,0 +1,60 @@
+/* Check that the vectorize_vec_perm_const expander correctly deals with
+   replication.  Extracted from spec "nab".  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */
+
+typedef double POINT_T[3];
+typedef double MATRIX_T[][4];
+typedef struct {
+  POINT_T a_pos;
+} ATOM_T;
+typedef struct {
+  ATOM_T *r_atoms;
+} RESIDUE_T;
+typedef struct strand_t {
+  RESIDUE_T *s_residues;
+} STRAND_T;
+typedef struct strand_t MOLECULE_T;
+double xfm_xyz_oxyz4[4];
+MOLECULE_T add_he2o3transformmol_mol, add_he2o3transformmol_sp;
+RESIDUE_T add_he2o3transformmol_res;
+int add_he2o3transformmol_r, add_he2o3transformmol_a, add_he2o3transformmol_i;
+ATOM_T *add_he2o3transformmol_ap;
+POINT_T add_he2o3transformmol_xyz, add_he2o3transformmol_nxyz;
+static void xfm_xyz(POINT_T oxyz, MATRIX_T mat, POINT_T nxyz) {
+  int i, j;
+  double nxyz4[4];
+  for (i = 0; i < 3; i++)
+xfm_xyz_oxyz4[i] = oxyz[i];
+  xfm_xyz_oxyz4[3] = 1.0;
+  for (i = 0; i < 4; i++) {
+nxyz4[i] = 0.0;
+for (j = 0; j < 4; j++)
+  nxyz4[i] += xfm_xyz_oxyz4[j] * mat[j][i];
+  }
+  for (i = 0; i < 3; i++)
+nxyz[i] = nxyz4[i];
+}
+void add_he2o3transformmol(MATRIX_T mat, int n) {
+  for (add_he2o3transformmol_sp = add_he2o3transformmol_mol;;)
+for (add_he2o3transformmol_r = 0;;) {
+  add_he2o3transformmol_res =
+  add_he2o3transformmol_sp.s_residues[add_he2o3transformmol_r];
+  for (add_he2o3transformmol_a = 0; add_he2o3transformmol_a < n; 
add_he2o3transformmol_a++) {
+add_he2o3transformmol_ap =
+&add_he2o3transformmol_res.r_atoms[add_he2o3transformmol_a];
+for (add_he2o3transformmol_i = 0; add_he2o3transformmol_i < 3;
+ add_he2o3transformmol_i++)
+  add_he2o3transformm

Re: [PATCH] s390x: Optimize vector permute with constant indexes

2024-04-09 Thread Juergen Christ
Am Tue, Apr 09, 2024 at 11:51:00AM +0200 schrieb Stefan Schulze Frielinghaus:
> > +static bool expand_perm_as_replicate (const struct expand_vec_perm_d &d)
>^~~~
> Function names start on a new line.

Fixed

> > +{
> > +  unsigned char i;
> > +  unsigned char elem;
> > +  rtx base = d.op0;
> > +  rtx insn;
> > +  /* Needed to silence maybe-uninitialized warning.  */
> > +  gcc_assert(d.nelt > 0);
>  ~~^~~~
> Between function name and open bracket whitespace is missing.

Fixed.

> Curiously enough, the error is about d which is a reference and cannot
> be null.  If you are eager you could reduce this and open a PR.
> 
> s390.cc:17935:8: warning: ‘d’ may be used uninitialized 
> [-Wmaybe-uninitialized]
> 17935 |   elem = d.perm[0];
>   |   ~^~~

Weirdly enough it is not `d`, but `d.perm[0]` that seems to be the
problem.  But I did not reduce this.  As the assertion suggests, it is
known that all elements in d.perm in the range [0,d.nelt) are
initialized.  I would like to defer that to a time when I (hopefully)
have some more spare time.

> > +  if (expand_perm_as_replicate(d))
>  ^~~
> Between function name and open bracket whitespace is missing.

Fixed

> > diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c 
> > b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
> > new file mode 100644
> > index ..27563a00f22b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
> > @@ -0,0 +1,30 @@
> > +/* Check that the vectorize_vec_perm_const expander correctly deals with
> > +   replication.  Extracted from spec "nab".  */
> > +
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */
> > +
> > +
> > +#define REAL_T  double
> > +typedef REAL_T  MATRIX_T[ 4 ][ 4 ];
> > +
> > +int concat_mat_i, concat_mat_j;
> > +static void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3);
> > +MATRIX_T *rot4p() {
> > +  MATRIX_T mat3, mat4;
> > +  static MATRIX_T mat5;
> > +  concat_mat(mat4, mat3, mat5);
> > +}
> > +void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3) {
> > +  int k;
> > +  for (;; concat_mat_i++) {
> > +concat_mat_j = 0;
> > +for (; 4; concat_mat_j++) {
> > +  k = 0;
> > +  for (; k < 4; k++)
> > +m3[concat_mat_i][concat_mat_j] += m1[concat_mat_i][k];
> > +}
> 
> Just nitpicking, if we could come up with a test case which does not
> involve integer overflows due to non-terminating loops, I would prefer
> that.

Well, I have a version without integer overflows, but it still has
non-terminating loops...

Will send a v2,

Juergen


[PATCH] s390x: Optimize vector permute with constant indexes

2024-04-02 Thread Juergen Christ
Loop vectorizer can generate vector permutes with constant indexes
where all indexes are equal.  Optimize this case to use vector
replicate instead of vector permute.

gcc/ChangeLog:

* config/s390/s390.cc (expand_perm_as_replicate): Implement.
(vectorize_vec_perm_const_1): Call new function.
* config/s390/vx-builtins.md (vec_splat): Change to...
(@vec_splat): ...this.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-expand-replicate.c: New test.

Bootstrapped and regtested on s390x.  Ok for trunk?

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/s390.cc   | 32 +++
 gcc/config/s390/vx-builtins.md|  2 +-
 .../s390/vector/vec-expand-replicate.c| 30 +
 3 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 372a23244032..4b4014ebe444 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17923,6 +17923,35 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct 
expand_vec_perm_d &d)
   return false;
 }
 
+static bool expand_perm_as_replicate (const struct expand_vec_perm_d &d)
+{
+  unsigned char i;
+  unsigned char elem;
+  rtx base = d.op0;
+  rtx insn;
+  /* Needed to silence maybe-uninitialized warning.  */
+  gcc_assert(d.nelt > 0);
+  elem = d.perm[0];
+  for (i = 1; i < d.nelt; ++i)
+if (d.perm[i] != elem)
+  return false;
+  if (!d.testing_p)
+{
+  if (elem >= d.nelt)
+   {
+ base = d.op1;
+ elem -= d.nelt;
+   }
+  insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem));
+  if (insn == NULL_RTX)
+   return false;
+  emit_insn (insn);
+  return true;
+}
+  else
+return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing;
+}
+
 /* Try to find the best sequence for the vector permute operation
described by D.  Return true if the operation could be
expanded.  */
@@ -17941,6 +17970,9 @@ vectorize_vec_perm_const_1 (const struct 
expand_vec_perm_d &d)
   if (expand_perm_as_a_vlbr_vstbr_candidate (d))
 return true;
 
+  if (expand_perm_as_replicate(d))
+return true;
+
   return false;
 }
 
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 432d81a719fc..93c0d408a43e 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -424,7 +424,7 @@
 
 
 ; Replicate from vector element
-(define_expand "vec_splat"
+(define_expand "@vec_splat"
   [(set (match_operand:V_HW  0 "register_operand"  "")
(vec_duplicate:V_HW (vec_select:
 (match_operand:V_HW 1 "register_operand"  "")
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c 
b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
new file mode 100644
index ..27563a00f22b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
@@ -0,0 +1,30 @@
+/* Check that the vectorize_vec_perm_const expander correctly deals with
+   replication.  Extracted from spec "nab".  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */
+
+
+#define REAL_T  double
+typedef REAL_T  MATRIX_T[ 4 ][ 4 ];
+
+int concat_mat_i, concat_mat_j;
+static void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3);
+MATRIX_T *rot4p() {
+  MATRIX_T mat3, mat4;
+  static MATRIX_T mat5;
+  concat_mat(mat4, mat3, mat5);
+}
+void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3) {
+  int k;
+  for (;; concat_mat_i++) {
+concat_mat_j = 0;
+for (; 4; concat_mat_j++) {
+  k = 0;
+  for (; k < 4; k++)
+m3[concat_mat_i][concat_mat_j] += m1[concat_mat_i][k];
+}
+  }
+}
+
+/* { dg-final { scan-assembler-not "vperm" } } */
-- 
2.39.3



[PATCH] s390x: Implement vector cost model

2024-03-20 Thread Juergen Christ
Hi,

s390x used the basic cost model, which does not correctly model the cost of
register file crossing or the availability of certain instructions to
simplify reversed operations.  Implement a dedicated cost model to better
control when to vectorize.

gcc/ChangeLog:

* config/s390/s390.cc (class s390_vector_costs): Implement.
(s390_vector_costs::s390_vector_costs): Ditto.
(s390_vector_costs::add_stmt_cost): Ditto.
(s390_vectorize_create_costs): Ditto.
(TARGET_VECTORIZE_CREATE_COSTS): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/loop-1.c: New test.
* gcc.target/s390/vector/slp-1.c: New test.
* gcc.target/s390/vector/slp-2.c: New test.

Signed-off-by: Juergen Christ 

Bootstrapped and tested on s390x.  Ok for master?

---
 gcc/config/s390/s390.cc   | 127 ++
 gcc/testsuite/gcc.target/s390/vector/loop-1.c |  82 +++
 gcc/testsuite/gcc.target/s390/vector/slp-1.c  |  68 ++
 gcc/testsuite/gcc.target/s390/vector/slp-2.c  |  31 +
 4 files changed, 308 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/loop-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/slp-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/slp-2.c

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 372a23244032..b9dab1cf8a85 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -88,6 +88,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "ipa-prop.h"
 #include "ipa-fnsummary.h"
 #include "sched-int.h"
+#include "tree-vectorizer.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -4199,6 +4200,130 @@ s390_builtin_vectorization_cost (enum 
vect_cost_for_stmt type_of_cost,
 }
 }
 
+/* s390-specific vector costs */
+class s390_vector_costs : public vector_costs
+{
+  stmt_vec_info skipfinalpart;
+public:
+  s390_vector_costs (vec_info *, bool);
+
+  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info, slp_tree node,
+ tree vectype, int misalign,
+ vect_cost_model_location where) override;
+};
+
+s390_vector_costs::s390_vector_costs(vec_info *vinfo, bool costing_for_scalar)
+  : vector_costs(vinfo, costing_for_scalar)
+{
+}
+
+unsigned int
+s390_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info, slp_tree node,
+ tree vectype, int misalign,
+ vect_cost_model_location where)
+{
+  bool fp = false;
+  int costs = s390_builtin_vectorization_cost (kind, vectype, misalign);
+
+  if (vectype != NULL)
+fp = FLOAT_TYPE_P (vectype);
+
+  if ((kind == scalar_to_vec || kind == vec_construct)
+  && node
+  && SLP_TREE_DEF_TYPE (node) == vect_external_def)
+{
+  unsigned int i;
+  tree op;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+   if (TREE_CODE (op) == SSA_NAME)
+ TREE_VISITED (op) = 0;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+   {
+ if (TREE_CODE (op) != SSA_NAME
+ || TREE_VISITED (op))
+   continue;
+ TREE_VISITED (op) = 1;
+ gimple *def = SSA_NAME_DEF_STMT (op);
+ tree temp;
+ if (is_gimple_assign(def)
+ && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def))
+ && (temp = gimple_assign_rhs1(def))
+ && TREE_CODE (temp) == SSA_NAME
+ && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)),
+   TREE_TYPE (temp)))
+   def = SSA_NAME_DEF_STMT (temp);
+ if (!gimple_assign_load_p (def))
+   {
+ /* For scalar_to_vec from a fp register, we might not
+cross the register files.  So keep the penalty small.
+??? If we have to cross, we actually cross twice
+leading to a huge runtime penalty.  Should we reflect
+this here?  */
+ if (kind == scalar_to_vec && fp)
+   costs += 2;
+ else
+   costs += 3;
+   }
+   }
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+   if (TREE_CODE (op) == SSA_NAME)
+ TREE_VISITED (op) = 0;
+}
+  if (kind == scalar_stmt && stmt_info && is_gimple_assign (stmt_info->stmt))
+{
+  const gassign *assign = dyn_cast <const gassign *> (stmt_info->stmt);
+  tree comptype = NULL_TREE;
+  if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR)
+   comptype = TREE_TYPE (gimple_assign_rhs1 (assign));
+  if (gimple_assign_rhs_code (assign) == BIT_FIELD_REF)
+   comptype = TREE_TYPE (TREE_OPERAND (gimple_assign_rhs1 (ass

[PATCH] Add myself to write after approval and DCO.

2024-02-26 Thread Juergen Christ
Hello,

I have added myself to write after approval and DCO.

Thanks,

Juergen Christ

ChangeLog:

* MAINTAINERS: Add myself to write after approval and DCO.

Signed-off-by: Juergen Christ 
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index cb5a42501dd2..ca6a27b4c11b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -375,6 +375,7 @@ Dehao Chen  

 Fabien Chêne   
 Clément Chigot 
 Harshit Chopra 
+Juergen Christ 
 Tamar Christina

 Eric Christopher   
 Paul Clarke
@@ -756,6 +757,7 @@ Certificate of Origin Version 1.1.  See 
https://gcc.gnu.org/dco.html for more
 information.
 
 
+Juergen Christ  
 Robin Dapp 
 Robin Dapp 
 Michal Jires   
-- 
2.39.3



Re: [PATCH v2] Do not emulate vectors containing floats.

2024-02-23 Thread Juergen Christ
Am Fri, Feb 23, 2024 at 01:57:12PM + schrieb Sam James:
> 
> Juergen Christ  writes:
> 
> > The emulation via word mode tries to perform integer arithmetic on floating
> > point values instead of floating point arithmetic.  This leads to
> > mis-compilations.
> 
> Is the bug ref + test missing?

Sorry, forgot to add the "bootstrapped and tested on s390x and x86_64".

Not sure how to reference a bugzilla here.  There is 114075 that
should be solved with this, too.

> >
> > Failure occured on s390x on these existing test cases:
> > gcc.dg/vect/tsvc/vect-tsvc-s112.c
> > gcc.dg/vect/tsvc/vect-tsvc-s113.c
> > gcc.dg/vect/tsvc/vect-tsvc-s119.c
> > gcc.dg/vect/tsvc/vect-tsvc-s121.c
> > gcc.dg/vect/tsvc/vect-tsvc-s131.c
> > gcc.dg/vect/tsvc/vect-tsvc-s132.c
> > gcc.dg/vect/tsvc/vect-tsvc-s2233.c
> > gcc.dg/vect/tsvc/vect-tsvc-s421.c
> > gcc.dg/vect/vect-alias-check-14.c
> > gcc.target/s390/vector/partial/s390-vec-length-epil-run-1.c
> > gcc.target/s390/vector/partial/s390-vec-length-epil-run-3.c
> > gcc.target/s390/vector/partial/s390-vec-length-full-run-3.c
> >
> > gcc/ChangeLog:
> >
> > * tree-vect-stmts.cc (vectorizable_operation): Don't emulate floating
> >   point vectors
> >
> > Signed-off-by: Juergen Christ 
> > ---
> >  gcc/tree-vect-stmts.cc | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index 09749ae38174..f95ff2c2aa34 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -6756,7 +6756,8 @@ vectorizable_operation (vec_info *vinfo,
> >  those through even when the mode isn't word_mode.  For
> >  ops we have to lower the lowering code assumes we are
> >  dealing with word_mode.  */
> > -  if (((code == PLUS_EXPR || code == MINUS_EXPR || code == 
> > NEGATE_EXPR)
> > +  if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))
> > + || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
> > || !target_support_p)
> >&& maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD))
> >   /* Check only during analysis.  */
> 


[PATCH v2] Do not emulate vectors containing floats.

2024-02-23 Thread Juergen Christ
The emulation via word mode tries to perform integer arithmetic on floating
point values instead of floating point arithmetic.  This leads to
mis-compilations.

Failures occurred on s390x in these existing test cases:
gcc.dg/vect/tsvc/vect-tsvc-s112.c
gcc.dg/vect/tsvc/vect-tsvc-s113.c
gcc.dg/vect/tsvc/vect-tsvc-s119.c
gcc.dg/vect/tsvc/vect-tsvc-s121.c
gcc.dg/vect/tsvc/vect-tsvc-s131.c
gcc.dg/vect/tsvc/vect-tsvc-s132.c
gcc.dg/vect/tsvc/vect-tsvc-s2233.c
gcc.dg/vect/tsvc/vect-tsvc-s421.c
gcc.dg/vect/vect-alias-check-14.c
gcc.target/s390/vector/partial/s390-vec-length-epil-run-1.c
gcc.target/s390/vector/partial/s390-vec-length-epil-run-3.c
gcc.target/s390/vector/partial/s390-vec-length-full-run-3.c

gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_operation): Don't emulate floating
  point vectors

Signed-off-by: Juergen Christ 
---
 gcc/tree-vect-stmts.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 09749ae38174..f95ff2c2aa34 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6756,7 +6756,8 @@ vectorizable_operation (vec_info *vinfo,
 those through even when the mode isn't word_mode.  For
 ops we have to lower the lowering code assumes we are
 dealing with word_mode.  */
-  if (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))
+ || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
|| !target_support_p)
   && maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD))
  /* Check only during analysis.  */
-- 
2.39.3



[PATCH] Do not emulate vectors containing floats.

2024-02-19 Thread Juergen Christ
Fixes various test failures on s390x.

gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_operation): Don't emulate floating
  point vectors

Signed-off-by: Juergen Christ 

Regtested and bootstrapped on x86_64-pc-linux-gnu and
s390x-ibm-linux-gnu.  Okay for trunk?

---
 gcc/tree-vect-stmts.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 09749ae38174..4164f254fd6e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6756,7 +6756,8 @@ vectorizable_operation (vec_info *vinfo,
 those through even when the mode isn't word_mode.  For
 ops we have to lower the lowering code assumes we are
 dealing with word_mode.  */
-  if (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
+  if (FLOAT_MODE_P (vec_mode)
+ || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
|| !target_support_p)
   && maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD))
  /* Check only during analysis.  */
-- 
2.39.3



[PATCH] s390x: Fix PR112753

2023-11-30 Thread Juergen Christ
Commit 466b100e5fee808d77598e0f294654deec281150 introduced a bug in
s390_md_asm_adjust if vector extensions are not available.  Fix the control
flow of this function to not adjust long double values.

gcc/ChangeLog:

* config/s390/s390.cc (s390_md_asm_adjust): Fix.

gcc/testsuite/ChangeLog:

* gcc.target/s390/pr112753.c: New test.

Bootstrapped and tested on s390x.

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/s390.cc  | 4 
 gcc/testsuite/gcc.target/s390/pr112753.c | 8 
 2 files changed, 12 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/s390/pr112753.c

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 29b5dc979207..3a4d2d346f0c 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -17604,6 +17604,10 @@ s390_md_asm_adjust (vec &outputs, vec 
&inputs,
   outputs[i] = fprx2;
 }
 
+  if (!TARGET_VXE)
+/* Long doubles are stored in FPR pairs - nothing left to do.  */
+return after_md_seq;
+
   for (unsigned i = 0; i < ninputs; i++)
 {
   if (GET_MODE (inputs[i]) != TFmode)
diff --git a/gcc/testsuite/gcc.target/s390/pr112753.c 
b/gcc/testsuite/gcc.target/s390/pr112753.c
new file mode 100644
index ..7183b3f12bed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/pr112753.c
@@ -0,0 +1,8 @@
+/* This caused an ICE on s390x due to a bug in s390_md_asm_adjust when no
+   vector extension is available.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=zEC12" } */
+
+long double strtold_l_internal___x;
+void strtold_l_internal() { __asm__("" : : 
"fm"(strtold_l_internal___x)); }
-- 
2.39.3



[PATCH] s390: implement flags output

2023-11-15 Thread Juergen Christ
Implement flags output for inline assemblies.  Only use one output constraint
that captures the whole condition code.  No breakout into different condition
codes is allowed.  Also, only one condition code variable is allowed.

Add further logic to canonicalize various cases where we combine different
cases of possible condition codes.

Bootstrapped and tested on s390.  OK for mainline?

gcc/ChangeLog:

* config/s390/s390-c.cc (s390_cpu_cpp_builtins): Define
__GCC_ASM_FLAG_OUTPUTS__.
* config/s390/s390.cc (s390_canonicalize_comparison): More
UNSPEC_CC_TO_INT cases.
(s390_md_asm_adjust): Implement flags output.
* config/s390/s390.md (ccstore4): Allow mask operands.
* doc/extend.texi: Document flags output.

gcc/testsuite/ChangeLog:

* gcc.target/s390/ccor.c: New test.

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/s390-c.cc|   1 +
 gcc/config/s390/s390.cc  | 139 ++-
 gcc/config/s390/s390.md  |   8 +-
 gcc/doc/extend.texi  |   5 +
 gcc/testsuite/gcc.target/s390/ccor.c |  88 +
 5 files changed, 232 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/ccor.c

diff --git a/gcc/config/s390/s390-c.cc b/gcc/config/s390/s390-c.cc
index 269f4f8e978d..c126e6d323d7 100644
--- a/gcc/config/s390/s390-c.cc
+++ b/gcc/config/s390/s390-c.cc
@@ -409,6 +409,7 @@ s390_cpu_cpp_builtins (cpp_reader *pfile)
 cpp_define (pfile, "__LONG_DOUBLE_128__");
   cl_target_option_save (&opts, &global_options, &global_options_set);
   s390_cpu_cpp_builtins_internal (pfile, &opts, NULL);
+  cpp_define (pfile, "__GCC_ASM_FLAG_OUTPUTS__");
 }
 
 #if S390_USE_TARGET_ATTRIBUTE
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 61c5f88de8af..a19dd7849b84 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -1877,6 +1877,97 @@ s390_canonicalize_comparison (int *code, rtx *op0, rtx 
*op1,
  *code = new_code;
}
 }
+  /* Remove UNSPEC_CC_TO_INT from connectives.  This happens for
+ checks against multiple condition codes. */
+  if (GET_CODE (*op0) == AND
+  && GET_CODE (XEXP (*op0, 0)) == UNSPEC
+  && XINT (XEXP (*op0, 0), 1) == UNSPEC_CC_TO_INT
+  && XVECLEN (XEXP (*op0, 0), 0) == 1
+  && REGNO (XVECEXP (XEXP (*op0, 0), 0, 0)) == CC_REGNUM
+  && CONST_INT_P (XEXP (*op0, 1))
+  && CONST_INT_P (*op1)
+  && INTVAL (XEXP (*op0, 1)) == -3
+  && *code == EQ)
+{
+  if (INTVAL (*op1) == 0)
+   {
+ /* case cc == 0 || cc == 2 => mask = 0xa */
+ *op0 = XVECEXP (XEXP (*op0, 0), 0, 0);
+ *op1 = gen_rtx_CONST_INT (VOIDmode, 0xa);
+   }
+  else if (INTVAL (*op1) == 1)
+   {
+ /* case cc == 1 || cc == 3 => mask = 0x5 */
+ *op0 = XVECEXP (XEXP (*op0, 0), 0, 0);
+ *op1 = gen_rtx_CONST_INT (VOIDmode, 0x5);
+   }
+}
+  if (GET_CODE (*op0) == PLUS
+  && GET_CODE (XEXP (*op0, 0)) == UNSPEC
+  && XINT (XEXP (*op0, 0), 1) == UNSPEC_CC_TO_INT
+  && XVECLEN (XEXP (*op0, 0), 0) == 1
+  && REGNO (XVECEXP (XEXP (*op0, 0), 0, 0)) == CC_REGNUM
+  && CONST_INT_P (XEXP (*op0, 1))
+  && CONST_INT_P (*op1)
+  && (*code == LEU || *code == GTU))
+{
+  if (INTVAL (*op1) == 1)
+   {
+ if (INTVAL (XEXP (*op0, 1)) == -1)
+   {
+ /* case cc == 1 || cc == 2 => mask = 0x6 */
+ *op0 = XVECEXP (XEXP (*op0, 0), 0, 0);
+ *op1 = gen_rtx_CONST_INT (VOIDmode, 0x6);
+ *code = *code == GTU ? NE : EQ;
+   }
+ else if (INTVAL (XEXP (*op0, 1)) == -2)
+   {
+ /* case cc == 2 || cc == 3 => mask = 0x3 */
+ *op0 = XVECEXP (XEXP (*op0, 0), 0, 0);
+ *op1 = gen_rtx_CONST_INT (VOIDmode, 0x3);
+ *code = *code == GTU ? NE : EQ;
+   }
+   }
+  else if (INTVAL (*op1) == 2
+  && INTVAL (XEXP (*op0, 1)) == -1)
+   {
+ /* case cc == 1 || cc == 2 || cc == 3 => mask = 0x7 */
+ *op0 = XVECEXP (XEXP (*op0, 0), 0, 0);
+ *op1 = gen_rtx_CONST_INT (VOIDmode, 0x7);
+ *code = *code == GTU ? NE : EQ;
+   }
+}
+  else if (*code == LEU || *code == GTU)
+{
+  if (GET_CODE (*op0) == UNSPEC
+ && XINT (*op0, 1) == UNSPEC_CC_TO_INT
+ && XVECLEN (*op0, 0) == 1
+ && REGNO (XVECEXP (*op0, 0, 0)) == CC_REGNUM
+ && CONST_INT_P (*op1))
+   {
+ if (INTVAL (*op1) == 1)
+   {
+ /* case cc == 0 || cc == 1 => mask = 0xc */
+ *op0 = XVECEXP (*op0, 0, 0);
+ *op1 = gen_rtx_CONST_INT (VOIDmode, 0xc);
+ *code = *code =

[PATCH] s390: split int128 load

2023-11-15 Thread Juergen Christ
Issue two loads when using GPRs instead of one load-multiple.

Bootstrapped and tested on s390.  OK for mainline?

gcc/ChangeLog:

* config/s390/s390.md: Split TImode loads.

gcc/testsuite/ChangeLog:

* gcc.target/s390/int128load.c: New test.

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/s390.md|  4 
 gcc/testsuite/gcc.target/s390/int128load.c | 14 ++
 2 files changed, 14 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/int128load.c

diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 3f29ba214427..5bff69aeb350 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -1687,8 +1687,6 @@
   [(set (match_operand:TI 0 "nonimmediate_operand" "")
 (match_operand:TI 1 "general_operand" ""))]
   "TARGET_ZARCH && reload_completed
-   && !s_operand (operands[0], TImode)
-   && !s_operand (operands[1], TImode)
&& s390_split_ok_p (operands[0], operands[1], TImode, 0)"
   [(set (match_dup 2) (match_dup 4))
(set (match_dup 3) (match_dup 5))]
@@ -1703,8 +1701,6 @@
   [(set (match_operand:TI 0 "nonimmediate_operand" "")
 (match_operand:TI 1 "general_operand" ""))]
   "TARGET_ZARCH && reload_completed
-   && !s_operand (operands[0], TImode)
-   && !s_operand (operands[1], TImode)
&& s390_split_ok_p (operands[0], operands[1], TImode, 1)"
   [(set (match_dup 2) (match_dup 4))
(set (match_dup 3) (match_dup 5))]
diff --git a/gcc/testsuite/gcc.target/s390/int128load.c 
b/gcc/testsuite/gcc.target/s390/int128load.c
new file mode 100644
index ..35d5380704b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/int128load.c
@@ -0,0 +1,14 @@
+/* Check that int128 loads and stores are split.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=zEC12" } */
+
+__int128 global;
+
+void f(__int128 x)
+{
+  global = x;
+}
+
+/* { dg-final { scan-assembler-times "lg\t" 2 } } */
+/* { dg-final { scan-assembler-times "stg\t" 2 } } */
-- 
2.39.3



[PATCH] s390: Fix ICE in testcase pr89233

2023-11-15 Thread Juergen Christ
When using GNU vector extensions, an access outside of the vector size
caused an ICE on s390.  Fix this by aligning with the vec_extract
builtin, i.e., computing constant index modulo number of lanes.

Fixes testcase gcc.target/s390/pr89233.c.

Bootstrapped and tested on s390.  OK for mainline?

gcc/ChangeLog:

* config/s390/vector.md (*vec_extract): Fix.

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/vector.md | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 7d1eb36e8446..deda5990a035 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -532,12 +532,14 @@
  (match_operand:V1 "nonmemory_operand"  "v,v")
  (parallel
   [(match_operand:SI 2 "nonmemory_operand" "an,I")])))]
-  "TARGET_VX
-   && (!CONST_INT_P (operands[2])
-   || UINTVAL (operands[2]) < GET_MODE_NUNITS (mode))"
-  "@
-   vlgv\t%0,%v1,%Y2
-   vste\t%v1,%0,%2"
+  "TARGET_VX"
+  {
+if (CONST_INT_P (operands[2]))
+ operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS 
(mode) - 1));
+if (which_alternative == 0)
+  return "vlgv\t%0,%v1,%Y2";
+   return "vste\t%v1,%0,%2";
+  }
   [(set_attr "op_type" "VRS,VRX")])
 
 ; vlgvb, vlgvh, vlgvf, vlgvg
-- 
2.39.3



[PATCH] s390: fix htm-builtins test cases

2023-10-25 Thread Juergen Christ
Transactional and non-transactional stores to the same cache line cause
transactions to abort on newer generations.  Add sufficient padding to make
sure another cache line is used.

Tested on s390.

gcc/testsuite/ChangeLog:

* gcc.target/s390/htm-builtins-1.c: Fix.
* gcc.target/s390/htm-builtins-2.c: Fix.

Signed-off-by: Juergen Christ 
---
 gcc/testsuite/gcc.target/s390/htm-builtins-1.c | 4 +++-
 gcc/testsuite/gcc.target/s390/htm-builtins-2.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/s390/htm-builtins-1.c 
b/gcc/testsuite/gcc.target/s390/htm-builtins-1.c
index ff43be9fe736..4f95bf3accaa 100644
--- a/gcc/testsuite/gcc.target/s390/htm-builtins-1.c
+++ b/gcc/testsuite/gcc.target/s390/htm-builtins-1.c
@@ -53,9 +53,11 @@ __attribute__ ((aligned(256))) struct
 __attribute__ ((aligned(256))) struct
 {
   volatile uint64_t c1;
+  char pad1[256 - sizeof(uint64_t)];
   volatile uint64_t c2;
+  char pad2[256 - sizeof(uint64_t)];
   volatile uint64_t c3;
-} counters = { 0, 0, 0 };
+} counters = { 0 };
 
 /*  local helper functions - */
 
diff --git a/gcc/testsuite/gcc.target/s390/htm-builtins-2.c 
b/gcc/testsuite/gcc.target/s390/htm-builtins-2.c
index bb9d346ea560..2e838caacc8c 100644
--- a/gcc/testsuite/gcc.target/s390/htm-builtins-2.c
+++ b/gcc/testsuite/gcc.target/s390/htm-builtins-2.c
@@ -94,9 +94,11 @@ float global_float_3 = 0.0;
 __attribute__ ((aligned(256))) struct
 {
   volatile uint64_t c1;
+  char pad1[256 - sizeof(uint64_t)];
   volatile uint64_t c2;
+  char pad2[256 - sizeof(uint64_t)];
   volatile uint64_t c3;
-} counters = { 0, 0, 0 };
+} counters = { 0 };
 
 /*  local helper functions - */
 
-- 
2.39.3



[PATCH] s390: Optimize vec_cmpge followed by vec_sel

2023-07-17 Thread Juergen Christ via Gcc-patches
A vec_cmpge produces a negation.  Replace this negation by swapping the two
selection choices of a vec_sel based on the result of the vec_cmpge.

Bootstrapped and regression tested on s390x.

gcc/ChangeLog:

* config/s390/vx-builtins.md: New vsel pattern.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-cmpge.c: New test.

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/vx-builtins.md | 11 +++
 .../gcc.target/s390/vector/vec-cmpge.c | 18 ++
 2 files changed, 29 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmpge.c

diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index f4248c55d4ec..0ce3ff6ef4a6 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -530,6 +530,17 @@
   "vsel\t%v0,%1,%2,%3"
   [(set_attr "op_type" "VRR")])
 
+(define_insn "vsel_swapped"
+  [(set (match_operand:V_HW_FT   0 "register_operand" "=v")
+   (ior:V_HW_FT
+(and:V_HW_FT (not:V_HW_FT (match_operand:V_HW_FT 3 "register_operand"  
"v"))
+ (match_operand:V_HW_FT 1 "register_operand"  "v"))
+(and:V_HW_FT (match_dup 3)
+ (match_operand:V_HW_FT 2 "register_operand"  "v"))))]
+  "TARGET_VX"
+  "vsel\t%v0,%2,%1,%3"
+  [(set_attr "op_type" "VRR")])
+
 
 ; Vector sign extend to doubleword
 
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-cmpge.c 
b/gcc/testsuite/gcc.target/s390/vector/vec-cmpge.c
new file mode 100644
index ..eb188690ae41
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec-cmpge.c
@@ -0,0 +1,18 @@
+/* Check that vec_sel absorbs a negation generated by vec_cmpge.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z13" } */
+
+typedef __attribute__((vector_size(16))) unsigned char uv16qi;
+
+#include <vecintrin.h>
+
+void f(char *res, uv16qi ctrl)
+{
+  uv16qi a = vec_splat_u8(0xfe);
+  uv16qi b = vec_splat_u8(0x80);
+  uv16qi mask = vec_cmpge(ctrl, b);
+  *(uv16qi *)res = vec_sel(a, b, mask);
+}
+
+/* { dg-final { scan-assembler-not "vno\t" } } */
-- 
2.39.3



[PATCH] s390: Fix vec_init default expander

2023-07-07 Thread Juergen Christ via Gcc-patches
Do not reinitialize vector lanes to zero since they are already initialized to
zero.

Bootstrapped and regression tested on s390x.

gcc/ChangeLog:

* config/s390/s390.cc (vec_init): Fix default case

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/vec-init-3.c: New test.

Signed-off-by: Juergen Christ 
---
 gcc/config/s390/s390.cc | 11 ++-
 .../gcc.target/s390/vector/vec-init-3.c | 17 +
 2 files changed, 23 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-init-3.c

diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 505de995da87..31b646782721 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -7130,11 +7130,12 @@ s390_expand_vec_init (rtx target, rtx vals)
   if (!general_operand (elem, GET_MODE (elem)))
elem = force_reg (inner_mode, elem);
 
-  emit_insn (gen_rtx_SET (target,
- gen_rtx_UNSPEC (mode,
- gen_rtvec (3, elem,
-GEN_INT (i), target),
- UNSPEC_VEC_SET)));
+  if (elem != const0_rtx)
+   emit_insn (gen_rtx_SET (target,
+   gen_rtx_UNSPEC (mode,
+   gen_rtvec (3, elem,
+  GEN_INT (i), target),
+   UNSPEC_VEC_SET)));
 }
 }
 
diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-init-3.c 
b/gcc/testsuite/gcc.target/s390/vector/vec-init-3.c
new file mode 100644
index ..12008a963ffb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/vec-init-3.c
@@ -0,0 +1,17 @@
+/* Check that the default case of the vec_init expander does its job.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z13" } */
+
+typedef __attribute__((vector_size(16))) signed int v4si;
+
+extern v4si G;
+
+v4si
+n (signed int a)
+{
+  return G == (v4si){ a };
+}
+/* { dg-final { scan-assembler-times "vzero" 1 } } */
+/* { dg-final { scan-assembler-times "vlvgf\t" 1 } } */
+/* { dg-final { scan-assembler-not "vleif\t" } } */
-- 
2.39.3