On Thu, Jun 20, 2024 at 09:06:11AM +0200, Juergen Christ wrote: > Some casts were missing leading to missed or bad vectorizations where > casting was done scalar followed by a vector creation from the > individual elements. > > gcc/ChangeLog: > > * config/s390/vector.md (VEC_HALF_NARROWED): New mode iterator. > (vec_half_narrowed): ditto. > (trunc<VI_TRUNC:mode><vec_half_narrowed>2): New pattern. > (vec_pack_ufix_trunc_v2df): ditto. > (vec_pack_sfix_trunc_v2df): ditto. > (vec_unpack_sfix_trunc_lo_v4sf): ditto. > (vec_unpack_sfix_trunc_hi_v4sf): ditto. > (vec_unpack_ufix_trunc_lo_v4sf): ditto. > (vec_unpack_ufix_trunc_hi_v4sf): ditto. > (floatv2siv2sf2): ditto. > (floatunsv2siv2sf2): ditto. > (vec_unpacks_float_hi_v4si): ditto. > (vec_unpacks_float_lo_v4si): ditto. > (vec_unpacku_float_hi_v4si): ditto. > (vec_unpacku_float_lo_v4si): ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/s390/vector/vec-cast-single.c: New test. > * gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c: New test. > > Bootstrapped and regtested on s390x. Ok for trunk? > > Signed-off-by: Juergen Christ <jchr...@linux.ibm.com> > --- > gcc/config/s390/vector.md | 170 ++++++++++- > .../gcc.target/s390/vector/vec-cast-single.c | 271 ++++++++++++++++++ > .../s390/vector/vec_pack_ufix_trunc_v2df.c | 30 ++ > 3 files changed, 463 insertions(+), 8 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c > create mode 100644 > gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c > > diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md > index 40de0c75a7cf..356f25d26deb 100644 > --- a/gcc/config/s390/vector.md > +++ b/gcc/config/s390/vector.md > @@ -89,6 +89,8 @@ > > (define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI]) > > +(define_mode_iterator VI_TRUNC [V2HI V2SI V2DI V4HI V4SI]) > + > ; Empty string for all but TImode. This is used to hide the TImode > ; expander name in case it is defined already. See addti3 for an > ; example. 
> @@ -211,6 +213,14 @@ > (V1SF "v1df") (V2SF "v2df") (V4SF "v4df") > (V1DF "v1tf") (V2DF "v2tf")]) > > +; Vector with narrowed element size and the same number of elements. > +(define_mode_attr VEC_HALF_NARROWED [(V1HI "V1QI") (V2HI "V2QI") (V4HI > "V4QI") (V8HI "V8QI") > + (V1SI "V1HI") (V2SI "V2HI") (V4SI "V4HI") > + (V1DI "V1DI") (V2DI "V2SI")]) > +(define_mode_attr vec_half_narrowed [(V1HI "v1qi") (V2HI "v2qi") (V4HI > "v4qi") (V8HI "v8qi") > + (V1SI "v1hi") (V2SI "v2hi") (V4SI "v4hi") > + (V1DI "v1di") (V2DI "v2si")]) > + > ; Vector with half the element size AND half the number of elements. > (define_mode_attr vec_halfhalf > [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") > @@ -2422,6 +2432,17 @@ > operands[2] = gen_reg_rtx (V4SFmode); > }) > > +;; vector truncate > + > +; downcasts > + > +(define_insn "trunc<VI_TRUNC:mode><vec_half_narrowed>2" > + [(set (match_operand:<VEC_HALF_NARROWED> 0 "register_operand" "=v") > + (truncate:<VEC_HALF_NARROWED> (match_operand:VI_TRUNC 1 > "register_operand" "v")))] > + "TARGET_VX" > + "vpk<bhfgq>\t %0,%1,%1" ~~~~^~~~~ whitespace
> + [(set_attr "op_type" "VRR")]) > + > ;; vector unpack v16qi > > ; signed > @@ -3177,17 +3198,150 @@ > emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2])); > emit_insn (gen_vstlv16qi (operands[1], len, mem)); > DONE; > -});; > +}) > + > +(define_expand "vec_pack_ufix_trunc_v2df" > + [(match_operand:V4SI 0 "register_operand") > + (match_operand:V2DF 1 "register_operand") > + (match_operand:V2DF 2 "register_operand")] > + "TARGET_VX" > +{ > + rtx r1 = gen_reg_rtx (V2DImode); > + rtx r2 = gen_reg_rtx (V2DImode); > + > + emit_insn (gen_fixuns_truncv2dfv2di2 (r1, operands[1])); > + emit_insn (gen_fixuns_truncv2dfv2di2 (r2, operands[2])); > + emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2)); > + DONE; > +}) I haven't really wrapped my head around this; however, this two-step conversion could miss an IEEE-inexact-exception if a double fits into a 64-bit integer but not into a 32-bit integer. What does the IL/vectorizer say about exceptions? Is it OK to miss some, or do we have to guard this by no-trapping-math et al.? > + > +(define_expand "vec_pack_sfix_trunc_v2df" > + [(match_operand:V4SI 0 "register_operand") > + (match_operand:V2DF 1 "register_operand") > + (match_operand:V2DF 2 "register_operand")] > + "TARGET_VX" > +{ > + rtx r1 = gen_reg_rtx (V2DImode); > + rtx r2 = gen_reg_rtx (V2DImode); > + > + emit_insn (gen_fix_truncv2dfv2di2 (r1, operands[1])); > + emit_insn (gen_fix_truncv2dfv2di2 (r2, operands[2])); > + emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2)); > + DONE; > +}) same as above > + > +; v4sf -> v2di > +(define_expand "vec_unpack_sfix_trunc_lo_v4sf" > + [(match_operand:V2DI 0 "register_operand") > + (match_operand:V4SF 1 "register_operand")] > + "TARGET_VX" > +{ > + rtx r = gen_reg_rtx(V4SImode); ~~~^~~~ whitespace > + > + emit_insn (gen_fix_truncv4sfv4si2 (r, operands[1])); > + emit_insn (gen_vec_unpacks_lo_v4si (operands[0], r)); > + DONE; > +}) The wording of the internals document leaves some room for interpretation. 
When is widening supposed to happen? The point in time when widening happens makes a difference if a rounded float does not fit in a 32-bit integer but does fit in a 64-bit integer. My gut feeling is that the current implementation is correct, i.e., first converting a 32-bit float into a 32-bit integer and then extending it to a 64-bit integer. > + > +(define_expand "vec_unpack_sfix_trunc_hi_v4sf" > + [(match_operand:V2DI 0 "register_operand") > + (match_operand:V4SF 1 "register_operand")] > + "TARGET_VX" > +{ > + rtx r = gen_reg_rtx(V4SImode); ~~~^~~~ whitespace > + > + emit_insn (gen_fix_truncv4sfv4si2 (r, operands[1])); > + emit_insn (gen_vec_unpacks_hi_v4si (operands[0], r)); > + DONE; > +}) same as above > + > +(define_expand "vec_unpack_ufix_trunc_lo_v4sf" > + [(match_operand:V2DI 0 "register_operand") > + (match_operand:V4SF 1 "register_operand")] > + "TARGET_VX" > +{ > + rtx r = gen_reg_rtx(V4SImode); ~~~^~~~ whitespace > + > + emit_insn (gen_fixuns_truncv4sfv4si2 (r, operands[1])); > + emit_insn (gen_vec_unpacku_lo_v4si (operands[0], r)); > + DONE; > +}) same as above > + > +(define_expand "vec_unpack_ufix_trunc_hi_v4sf" > + [(match_operand:V2DI 0 "register_operand") > + (match_operand:V4SF 1 "register_operand")] > + "TARGET_VX" > +{ > + rtx r = gen_reg_rtx(V4SImode); ~~~^~~~ whitespace > + > + emit_insn (gen_fixuns_truncv4sfv4si2 (r, operands[1])); > + emit_insn (gen_vec_unpacku_hi_v4si (operands[0], r)); > + DONE; > +}) same as above > > +(define_insn "floatv2siv2sf2" > + [(set (match_operand:V2SF 0 "register_operand" "=v") > + (float:V2SF (match_operand:V2SI 1 "register_operand" "v")))] > + "TARGET_VXE2" > + "vcefb\t%v0,%v1,0,5" > + [(set_attr "op_type" "VRR")]) > + > +(define_insn "floatunsv2siv2sf2" > + [(set (match_operand:V2SF 0 "register_operand" "=v") > + (unsigned_float:V2SF (match_operand:V2SI 1 "register_operand" "v")))] > + "TARGET_VXE2" > + "vcelfb\t%v0,%v1,0,5" > + [(set_attr "op_type" "VRR")]) > + > +(define_expand "vec_unpacks_float_hi_v4si" > + 
[(match_operand:V2DF 0 "register_operand") > + (match_operand:V4SI 1 "register_operand")] > + "TARGET_VX" > +{ > + rtx r = gen_reg_rtx(V2DImode); ~~~^~~~ whitespace > + ~~^~~ trailing whitespace > + emit_insn (gen_vec_unpacks_hi_v4si (r, operands[1])); > + emit_insn (gen_floatv2div2df2 (operands[0], r)); > + DONE; > +}) > + > +(define_expand "vec_unpacks_float_lo_v4si" > + [(match_operand:V2DF 0 "register_operand") > + (match_operand:V4SI 1 "register_operand")] > + "TARGET_VX" > +{ > + rtx r = gen_reg_rtx(V2DImode); ~~~^~~~ whitespace > + ~~^~~ trailing whitespace > + emit_insn (gen_vec_unpacks_lo_v4si (r, operands[1])); > + emit_insn (gen_floatv2div2df2 (operands[0], r)); > + DONE; > +}) > + > +(define_expand "vec_unpacku_float_hi_v4si" > + [(match_operand:V2DF 0 "register_operand") > + (match_operand:V4SI 1 "register_operand")] > + "TARGET_VX" > +{ > + rtx r = gen_reg_rtx(V2DImode); ~~~^~~~ whitespace > + ~~^~~ trailing whitespace > + emit_insn (gen_vec_unpacku_hi_v4si (r, operands[1])); > + emit_insn (gen_floatunsv2div2df2 (operands[0], r)); > + DONE; > +}) > + > +(define_expand "vec_unpacku_float_lo_v4si" > + [(match_operand:V2DF 0 "register_operand") > + (match_operand:V4SI 1 "register_operand")] > + "TARGET_VX" > +{ > + rtx r = gen_reg_rtx(V2DImode); ~~~^~~~ whitespace > + ~~^~~ trailing whitespace > + emit_insn (gen_vec_unpacku_lo_v4si (r, operands[1])); > + emit_insn (gen_floatunsv2div2df2 (operands[0], r)); > + DONE; > +}) > > ; reduc_smin > ; reduc_smax > ; reduc_umin > ; reduc_umax > - > -; vec_pack_sfix_trunc: convert + pack ? 
> -; vec_pack_ufix_trunc > -; vec_unpacks_float_hi > -; vec_unpacks_float_lo > -; vec_unpacku_float_hi > -; vec_unpacku_float_lo > diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c > b/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c > new file mode 100644 > index 000000000000..59a154594e9f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c > @@ -0,0 +1,271 @@ > +/* Check that the single-step vector conversions work. */ > + > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mzarch -march=z15 -ftree-vectorize > -fvect-cost-model=unlimited -fdump-tree-slp-all" } */ > +/* { dg-final { scan-tree-dump-not "conversion not supported by target" > "slp" } } */ > + > +void > +extendv4hiv4si2 (short *in, int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > +void > +zero_extendv4hiv4si2 (unsigned short *in, unsigned int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > +void > +vec_unpacks_v4si (int *in, long *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > +void > +vec_unpacku_v4si (unsigned int *in, unsigned long *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +extedv2siv2di2 (int *in, long *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > +void > +zero_extedv2siv2di2 (unsigned int *in, unsigned long *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > +void > +truncv4siv4hi2_signed (int *in, short *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +truncv4siv4hi2_unsigned (unsigned int *in, unsigned short *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +truncv2div2si2_signed (long *in, int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > +void > 
+truncv2div2si2_unsigned (unsigned long *in, unsigned int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > +void > +fix_truncv4sfv4si2 (float *in, int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +fixuns_truncv4sfv4si2 (float *in, unsigned int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_pack_trunc_v2di__signed (long *in, int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_pack_trunc_v2di__unsigned (unsigned long *in, unsigned int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_pack_sfix_trunc_v2df (double *in, int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_unpack_sfix_trunc (float *in, long *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_unpack_ufix_trunc (float *in, unsigned long *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +fix_truncv2dfv2di2 (double *in, long *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > +void > +fixuns_truncv2dfv2di2 (double *in, unsigned long *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > +void > +floatv4hiv4sf2 (short *in, float *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +floatunsv4hiv4sf2 (unsigned short *in, float *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +floatv4siv4sf2 (int *in, float *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + 
out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +floatunsv4siv4sf2 (unsigned int *in, float *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_packs_float_v2di (int *in, float *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_packu_float_v2di (unsigned int *in, float *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +floatv2div2df2 (long *in, double *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > +void > +floatunsv2div2df2 (unsigned long *in, double *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > + > +void > +floatv2siv2sf2 (int *in, float *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > + > +void > +floatunsv2siv2sf2 (unsigned int *in, float *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > +} > +void > +vec_unpacks_float_hi_v4si (int *in, double *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +void > +vec_unpacku_float_hi_v4si (unsigned int *in, double *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > diff --git a/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c > b/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c > new file mode 100644 > index 000000000000..4fcfbd88abe4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c > @@ -0,0 +1,30 @@ > +/* Check that vec_pack_ufix_trunc_v2df pattern is correctly used. Even > without > + this pattern, we will vectorize this code, but produce wrong output. 
*/ > + > +/* { dg-do run } */ > +/* { dg-options "-O3 -mzarch -march=z13 -ftree-vectorize > -fvect-cost-model=unlimited" } */ > + > +__attribute__((noinline,noclone,noipa)) > +void > +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out); > + > +void > +vec_pack_ufix_trunc_v2df (double *in, unsigned int *out) > +{ > + out[0] = in[0]; > + out[1] = in[1]; > + out[2] = in[2]; > + out[3] = in[3]; > +} > + > +int main() > +{ > + double in[] = {-1,-2,-3,-4}; > + unsigned int out[4]; > + > + vec_pack_ufix_trunc_v2df (in, out); > + for (int i = 0; i < 4; ++i) > + if (out[i] != 0) > + __builtin_abort(); > + return 0; > +} > -- > 2.43.0 >