Re: [PATCH 2/2] s390: Implement reduction optabs

Stefan Schulze Frielinghaus Mon, 14 Jul 2025 06:17:26 -0700

On Thu, Jul 10, 2025 at 09:14:24AM +0200, Juergen Christ wrote:
> Implementation and tests for the standard reduction optabs.
> 
> Bootstrapped and regtested on s390.  Ok for trunk?
> 
> Signed-off-by: Juergen Christ <jchr...@linux.ibm.com>
> 
> gcc/ChangeLog:
> 
>       * config/s390/vector.md (reduc_plus_scal_<mode>): Implement.
>       (reduc_plus_scal_v2df): Implement.
>       (reduc_plus_scal_v4sf): Implement.
>       (REDUC_FMINMAX): New int iterator.
>       (reduc_fminmax_name): New int attribute.
>       (reduc_minmax): New code iterator.
>       (reduc_minmax_name): New code attribute.
>       (reduc_<reduc_fminmax_name>_scal_v2df): Implement.
>       (reduc_<reduc_fminmax_name>_scal_v4sf): Implement.
>       (reduc_<reduc_minmax_name>_scal_v2df): Implement.
>       (reduc_<reduc_minmax_name>_scal_v4sf): Implement.
>       (REDUCBIN): New code iterator.
>       (reduc_bin_insn): New code attribute.
>       (reduc_<reduc_bin_insn>_scal_v2di): Implement.
>       (reduc_<reduc_bin_insn>_scal_v4si): Implement.
>       (reduc_<reduc_bin_insn>_scal_v8hi): Implement.
>       (reduc_<reduc_bin_insn>_scal_v16qi): Implement.
> 
> gcc/testsuite/ChangeLog:
> 
>       * lib/target-supports.exp: Add s390 to vect_logical_reduc targets.
>       * gcc.target/s390/vector/reduc-binops-1.c: New test.
>       * gcc.target/s390/vector/reduc-minmax-1.c: New test.
>       * gcc.target/s390/vector/reduc-plus-1.c: New test.
> ---
>  gcc/config/s390/vector.md                     | 293 +++++++++++++++++-
>  .../gcc.target/s390/vector/reduc-binops-1.c   |  40 +++
>  .../gcc.target/s390/vector/reduc-minmax-1.c   | 234 ++++++++++++++
>  .../gcc.target/s390/vector/reduc-plus-1.c     | 152 +++++++++
>  gcc/testsuite/lib/target-supports.exp         |   4 +-
>  5 files changed, 717 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/vector/reduc-binops-1.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/vector/reduc-minmax-1.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/vector/reduc-plus-1.c
> 
> diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
> index 26753c099cda..98427b37e884 100644
> --- a/gcc/config/s390/vector.md
> +++ b/gcc/config/s390/vector.md
> @@ -3572,11 +3572,6 @@
>    "veval\t%v0,%v1,%v2,%v3,%b4"
>    [(set_attr "op_type" "VRI")])
>  
> -; reduc_smin
> -; reduc_smax
> -; reduc_umin
> -; reduc_umax
> -
>  ; vec_pack_sfix_trunc: convert + pack ?
>  ; vec_pack_ufix_trunc
>  ; vec_unpacks_float_hi
> @@ -3627,3 +3622,291 @@
>              (const_int 4)]
>             UNSPEC_FMIN))]
>    "TARGET_VXE")
> +
> +; reduc_plus
> +(define_expand "reduc_plus_scal_<mode>"
> +  [(set (match_dup 4)
> +   (unspec:V4SI [(match_operand:VI_HW_QH 1 "register_operand")
      ^
nitpicking: align with (match_dup 4)


> +              (match_dup 2)]
> +             UNSPEC_VEC_VSUM))
> +   (set (match_dup 5)
> +     (unspec:V2DI [(match_dup 4) (match_dup 3)] UNSPEC_VEC_VSUMQ))
> +   (set (match_operand:<non_vec> 0 "register_operand")
> +     (vec_select:<non_vec> (match_dup 6)
> +                           (parallel [(match_dup 7)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
> +  operands[3] = simplify_gen_subreg (V4SImode, operands[2], <MODE>mode, 0);
> +  operands[4] = gen_reg_rtx (V4SImode);
> +  operands[5] = gen_reg_rtx (V2DImode);
> +  operands[6] = simplify_gen_subreg(<MODE>mode, operands[5], V2DImode, 0);
                                      ^
nitpicking: space

> +  operands[7] = GEN_INT (16 / GET_MODE_SIZE (<non_vec>mode) - 1);
> +})
> +
> +(define_expand "reduc_plus_scal_<mode>"
> +  [(set (match_dup 3)
> +     (unspec:V2DI [(match_operand:VI_HW_SD 1 "register_operand")
        ^
nitpicking: align with (match_dup 3)

> +                (match_dup 2)]
> +               UNSPEC_VEC_VSUMQ))
> +   (set (match_operand:<non_vec> 0 "register_operand")
> +     (vec_select:<non_vec> (match_dup 4)
> +                           (parallel [(match_dup 5)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
> +  operands[3] = gen_reg_rtx (V2DImode);
> +  operands[4] = simplify_gen_subreg (<MODE>mode, operands[3], V2DImode, 0);
> +  operands[5] = GEN_INT (16 / GET_MODE_SIZE (<non_vec>mode) - 1);
> +})
> +
> +(define_expand "reduc_plus_scal_v2df"
> +  [(set (match_dup 2)
> +   (unspec:V2DF [(match_operand:V2DF 1 "register_operand")
      ^
nitpicking: align with (match_dup 2)

> +              (match_dup 1)
> +              (const_int 8)]
> +             UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3) (plus:V2DF (match_dup 1) (match_dup 2)))
> +   (set (match_operand:DF 0 "register_operand")
> +     (vec_select:DF (match_dup 3) (parallel [(const_int 0)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = gen_reg_rtx (V2DFmode);
> +  operands[3] = gen_reg_rtx (V2DFmode);
> +})
> +
> +(define_expand "reduc_plus_scal_v4sf"
> +  [(set (match_dup 2)
> +   (unspec:V4SF [(match_operand:V4SF 1 "register_operand")
      ^
nitpicking: align with (match_dup 2)

> +              (match_dup 1)
> +              (const_int 4)]
> +             UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3) (plus:V4SF (match_dup 1) (match_dup 2)))
> +   (set (match_dup 4)
> +     (unspec:V4SF [(match_dup 3) (match_dup 3) (const_int 8)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 5) (plus:V4SF (match_dup 3) (match_dup 4)))
> +   (set (match_operand:SF 0 "register_operand")
> +     (vec_select:SF (match_dup 5) (parallel [(const_int 0)])))]
> +  "TARGET_VXE"
> +{
> +  operands[2] = gen_reg_rtx (V4SFmode);
> +  operands[3] = gen_reg_rtx (V4SFmode);
> +  operands[4] = gen_reg_rtx (V4SFmode);
> +  operands[5] = gen_reg_rtx (V4SFmode);
> +})
> +
> +; reduc_fmin, reduc_fmax, reduc_smin, reduc_smax
> +
> +(define_int_iterator REDUC_FMINMAX [UNSPEC_FMAX UNSPEC_FMIN])
> +(define_int_attr reduc_fminmax_name [(UNSPEC_FMAX "fmax") (UNSPEC_FMIN 
> "fmin")])
> +(define_code_iterator reduc_minmax [smin smax])
                         ^
nitpicking: most (code) iterators are written in uppercase, a convention
we should stick to

> +(define_code_attr reduc_minmax_name [(smin "smin") (smax "smax")])
> +
> +(define_expand "reduc_<reduc_fminmax_name>_scal_v2df"
> +  [(set (match_dup 2)
> +     (unspec:V2DF [(match_operand:V2DF 1 "register_operand")
> +                   (match_dup 1)
> +                   (const_int 8)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3)
> +     (unspec:V2DF [(match_dup 1) (match_dup 2) (const_int 4)] REDUC_FMINMAX))
> +   (set (match_operand:DF 0 "register_operand" "")
> +     (vec_select:DF (match_dup 3) (parallel [(const_int 0)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = gen_reg_rtx (V2DFmode);
> +  operands[3] = gen_reg_rtx (V2DFmode);
> +})
> +
> +(define_expand "reduc_<reduc_fminmax_name>_scal_v4sf"
> +  [(set (match_dup 2)
> +     (unspec:V4SF [(match_operand:V4SF 1 "register_operand")
> +                   (match_dup 1)
> +                   (const_int 4)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3)
> +     (unspec:V4SF [(match_dup 1) (match_dup 2) (const_int 4)] REDUC_FMINMAX))
> +   (set (match_dup 4)
> +     (unspec:V4SF [(match_dup 3)
> +                  (match_dup 3)
                     ^
> +                  (const_int 8)]
                     ^
> +                 UNSPEC_VEC_SLDBYTE))
                    ^
nitpicking: indent

> +   (set (match_dup 5)
> +     (unspec:V4SF [(match_dup 3) (match_dup 4) (const_int 4)] REDUC_FMINMAX))
> +   (set (match_operand:SF 0 "register_operand")
> +     (vec_select:SF (match_dup 5) (parallel [(const_int 0)])))]
> +   "TARGET_VXE"
> +{
> +  operands[2] = gen_reg_rtx (V4SFmode);
> +  operands[3] = gen_reg_rtx (V4SFmode);
> +  operands[4] = gen_reg_rtx (V4SFmode);
> +  operands[5] = gen_reg_rtx (V4SFmode);
> +})
> +
> +(define_expand "reduc_<reduc_minmax_name>_scal_v2df"
> +  [(set (match_dup 2)
> +     (unspec:V2DF [(match_operand:V2DF 1 "register_operand")
> +                   (match_dup 1)
> +                   (const_int 8)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3)
> +     (reduc_minmax:V2DF (match_dup 1) (match_dup 2)))
> +   (set (match_operand:DF 0 "register_operand" "")
> +     (vec_select:DF (match_dup 3) (parallel [(const_int 0)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = gen_reg_rtx (V2DFmode);
> +  operands[3] = gen_reg_rtx (V2DFmode);
> +})
> +
> +(define_expand "reduc_<reduc_minmax_name>_scal_v4sf"
> +  [(set (match_dup 2)
> +     (unspec:V4SF [(match_operand:V4SF 1 "register_operand")
> +                   (match_dup 1)
> +                   (const_int 4)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3)
> +     (reduc_minmax:V4SF (match_dup 1) (match_dup 2)))
> +   (set (match_dup 4)
> +     (unspec:V4SF [(match_dup 3)
> +                   (match_dup 3)
> +                   (const_int 8)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 5)
> +     (reduc_minmax:V4SF (match_dup 3) (match_dup 4)))
> +   (set (match_operand:SF 0 "register_operand" "")
> +     (vec_select:SF (match_dup 5) (parallel [(const_int 0)])))]
> +   "TARGET_VXE"
> +{
> +  operands[2] = gen_reg_rtx (V4SFmode);
> +  operands[3] = gen_reg_rtx (V4SFmode);
> +  operands[4] = gen_reg_rtx (V4SFmode);
> +  operands[5] = gen_reg_rtx (V4SFmode);
> +})
> +
> +; reduce_and, reduc_ior, reduc_xor
> +; reduc_smin, reduc_smax, reduc_umin, reduc_umax
> +
> +(define_code_iterator REDUCBIN [and xor ior smin smax umin umax])
> +(define_code_attr reduc_bin_insn [(and "and") (xor "xor") (ior "ior")
> +                               (smin "smin") (smax "smax")
> +                               (umin "umin") (umax "umax")])
> +
> +(define_expand "reduc_<reduc_bin_insn>_scal_v2di"
> +  [(set (match_dup 2)
> +     (unspec:V2DI [(match_operand:V2DI 1 "register_operand")
> +                   (match_dup 1)
> +                   (const_int 8)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3)
> +     (REDUCBIN:V2DI (match_dup 1) (match_dup 2)))
> +   (set (match_operand:DI 0 "register_operand" "")
> +     (vec_select:DI (match_dup 3) (parallel [(const_int 0)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = gen_reg_rtx (V2DImode);
> +  operands[3] = gen_reg_rtx (V2DImode);
> +})
> +
> +(define_expand "reduc_<reduc_bin_insn>_scal_v4si"
> +  [(set (match_dup 2)
> +     (unspec:V4SI [(match_operand:V4SI 1 "register_operand")
> +                   (match_dup 1)
> +                   (const_int 4)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3)
> +     (REDUCBIN:V4SI (match_dup 1) (match_dup 2)))
> +   (set (match_dup 4)
> +     (unspec:V4SI [(match_dup 3)
> +                   (match_dup 3)
> +                   (const_int 8)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 5)
> +     (REDUCBIN:V4SI (match_dup 3) (match_dup 4)))
> +   (set (match_operand:SI 0 "register_operand" "")
> +     (vec_select:SI (match_dup 5) (parallel [(const_int 0)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = gen_reg_rtx (V4SImode);
> +  operands[3] = gen_reg_rtx (V4SImode);
> +  operands[4] = gen_reg_rtx (V4SImode);
> +  operands[5] = gen_reg_rtx (V4SImode);
> +})
> +
> +(define_expand "reduc_<reduc_bin_insn>_scal_v8hi"
> +  [(set (match_dup 2)
> +     (unspec:V8HI [(match_operand:V8HI 1 "register_operand")
> +                   (match_dup 1)
> +                   (const_int 2)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3)
> +     (REDUCBIN:V8HI (match_dup 1) (match_dup 2)))
> +   (set (match_dup 4)
> +     (unspec:V8HI [(match_dup 3)
> +                   (match_dup 3)
> +                   (const_int 4)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 5)
> +     (REDUCBIN:V8HI (match_dup 3) (match_dup 4)))
> +   (set (match_dup 6)
> +     (unspec:V8HI [(match_dup 5)
> +                   (match_dup 5)
> +                   (const_int 8)]
> +                  UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 7)
> +     (REDUCBIN:V8HI (match_dup 5) (match_dup 6)))
> +   (set (match_operand:HI 0 "register_operand" "")
> +     (vec_select:HI (match_dup 7) (parallel [(const_int 0)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = gen_reg_rtx (V8HImode);
> +  operands[3] = gen_reg_rtx (V8HImode);
> +  operands[4] = gen_reg_rtx (V8HImode);
> +  operands[5] = gen_reg_rtx (V8HImode);
> +  operands[6] = gen_reg_rtx (V8HImode);
> +  operands[7] = gen_reg_rtx (V8HImode);
> +})
> +
> +(define_expand "reduc_<reduc_bin_insn>_scal_v16qi"
> +  [(set (match_dup 2)
> +     (unspec:V16QI [(match_operand:V16QI 1 "register_operand")
> +                    (match_dup 1)
> +                    (const_int 1)]
> +                   UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 3)
> +     (REDUCBIN:V16QI (match_dup 1) (match_dup 2)))
> +   (set (match_dup 4)
> +     (unspec:V16QI [(match_dup 3)
> +                    (match_dup 3)
> +                    (const_int 2)]
> +                   UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 5)
> +     (REDUCBIN:V16QI (match_dup 3) (match_dup 4)))
> +   (set (match_dup 6)
> +     (unspec:V16QI [(match_dup 5)
> +                    (match_dup 5)
> +                    (const_int 4)]
> +                   UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 7)
> +     (REDUCBIN:V16QI (match_dup 5) (match_dup 6)))
> +   (set (match_dup 8)
> +     (unspec:V16QI [(match_dup 7)
> +                    (match_dup 7)
> +                    (const_int 8)]
> +                   UNSPEC_VEC_SLDBYTE))
> +   (set (match_dup 9)
> +     (REDUCBIN:V16QI (match_dup 7) (match_dup 8)))
> +   (set (match_operand:QI 0 "register_operand" "")
> +     (vec_select:QI (match_dup 9) (parallel [(const_int 0)])))]
> +  "TARGET_VX"
> +{
> +  operands[2] = gen_reg_rtx (V16QImode);
> +  operands[3] = gen_reg_rtx (V16QImode);
> +  operands[4] = gen_reg_rtx (V16QImode);
> +  operands[5] = gen_reg_rtx (V16QImode);
> +  operands[6] = gen_reg_rtx (V16QImode);
> +  operands[7] = gen_reg_rtx (V16QImode);
> +  operands[8] = gen_reg_rtx (V16QImode);
> +  operands[9] = gen_reg_rtx (V16QImode);
> +})
> diff --git a/gcc/testsuite/gcc.target/s390/vector/reduc-binops-1.c 
> b/gcc/testsuite/gcc.target/s390/vector/reduc-binops-1.c
> new file mode 100644
> index 000000000000..efd3294a7350
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/vector/reduc-binops-1.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mzarch -march=z13 -ftree-vectorize 
> -fdump-tree-optimized" } */
> +
> +#define T(X,N)                                  \
> +  unsigned X                                    \
> +  reduce_and_##X (unsigned X *in)               \
> +  {                                             \
> +  unsigned X acc = (unsigned X)-1;              \
> +  for (int i = 0; i < N; i++)                   \
> +    acc &= in[i];                               \
> +  return acc;                                   \
> +  }                                             \
> +  unsigned X                                    \
> +  reduce_ior_##X (unsigned X *in)               \
> +  {                                             \
> +  unsigned X acc = 0;                           \
> +  for (int i = 0; i < N; i++)                   \
> +    acc |= in[i];                               \
> +  return acc;                                   \
> +  }                                             \
> +  unsigned X                                    \
> +  redue_xor_##X (unsigned X *in)                \
> +  {                                             \
> +  unsigned X acc = 0;                           \
> +  for (int i = 0; i < N; i++)                   \
> +    acc ^= in[i];                               \
> +  return acc;                                   \
> +  }
> +
> +T(char,16)
> +
> +T(short, 8)
> +
> +T(int,4)
> +
> +T(long,4)
> +
> +/* { dg-final { scan-tree-dump-times "\.REDUC_AND" 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "\.REDUC_IOR" 4 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "\.REDUC_XOR" 4 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/s390/vector/reduc-minmax-1.c 
> b/gcc/testsuite/gcc.target/s390/vector/reduc-minmax-1.c
> new file mode 100644
> index 000000000000..f23e96f79fe0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/vector/reduc-minmax-1.c
> @@ -0,0 +1,234 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mzarch -march=z16 -ftree-vectorize 
> -fdump-tree-optimized" } */
                                       ^
If the architecture is fixed in the testcase, we typically test for the
first one where the feature is supported.  In this case that would be z14.
Another option would be to not fix the architecture and use something
like dg-require-effective-target s390_vxe and then test with multiple
target boards if you want to make sure that the behaviour is correct
across multiple architectures.

> +
> +#define MAX(a, b) ((a) > (b) ? (a) : (b))
> +#define MIN(a, b) ((a) > (b) ? (b) : (a))
> +
> +/* unsigned integers */
> +
> +unsigned char
> +reduce_umax_char (unsigned char *p)
> +{
> +  unsigned char res = p[0];
> +  for (int i = 0; i < 16; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +unsigned char
> +reduce_umin_char (unsigned char *p)
> +{
> +  unsigned char res = p[0];
> +  for (int i = 0; i < 16; i++)
> +    res = MIN (res, p[i]);
> +  return res;
> +}
> +
> +unsigned short
> +reduce_umax_short (unsigned short *p)
> +{
> +  unsigned short res = p[0];
> +  for (int i = 0; i < 8; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +unsigned short
> +reduce_umin_short (unsigned short *p)
> +{
> +  unsigned short res = p[0];
> +  for (int i = 0; i < 8; i++)
> +    res = MIN (res, p[i]);
> +  return res;
> +}
> +
> +unsigned int
> +reduce_umax_int (unsigned int* p)
> +{
> +  unsigned int res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +unsigned int
> +reduce_umin_int (unsigned int* p)
> +{
> +  unsigned int res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MIN(res, p[i]);
> +  return res;
> +}
> +
> +unsigned long
> +reduce_umax_long (unsigned long* p)
> +{
> +  unsigned long res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +unsigned long
> +reduce_umin_long (unsigned long* p)
> +{
> +  unsigned long res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MIN(res, p[i]);
> +  return res;
> +}
> +
> +/* signed integers */
> +
> +signed char
> +reduce_smax_char (signed char *p)
> +{
> +  signed char res = p[0];
> +  for (int i = 0; i < 16; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +signed char
> +reduce_smin_char (signed char *p)
> +{
> +  signed char res = p[0];
> +  for (int i = 0; i < 16; i++)
> +    res = MIN (res, p[i]);
> +  return res;
> +}
> +
> +signed short
> +reduce_smax_short (signed short *p)
> +{
> +  signed short res = p[0];
> +  for (int i = 0; i < 8; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +signed short
> +reduce_smin_short (signed short *p)
> +{
> +  signed short res = p[0];
> +  for (int i = 0; i < 8; i++)
> +    res = MIN (res, p[i]);
> +  return res;
> +}
> +
> +signed int
> +reduce_smax_int (signed int* p)
> +{
> +  signed int res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +signed int
> +reduce_smin_int (signed int* p)
> +{
> +  signed int res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MIN(res, p[i]);
> +  return res;
> +}
> +
> +signed long
> +reduce_smax_long (signed long* p)
> +{
> +  signed long res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +signed long
> +reduce_smin_long (signed long* p)
> +{
> +  signed long res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MIN(res, p[i]);
> +  return res;
> +}
> +
> +float
> +__attribute__((optimize("Ofast")))
> +reduce_smax_float (float* p)
> +{
> +  float res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +float
> +__attribute__((optimize("Ofast")))
> +reduce_smin_float (float* p)
> +{
> +  float res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MIN (res, p[i]);
> +  return res;
> +}
> +
> +double
> +__attribute__((optimize("Ofast")))
> +reduce_smax_double (double* p)
> +{
> +  double res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MAX (res, p[i]);
> +  return res;
> +}
> +
> +double
> +__attribute__((optimize("Ofast")))
> +reduce_smin_double (double* p)
> +{
> +  double res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = MIN (res, p[i]);
> +  return res;
> +}
> +
> +float
> +reduce_fmax_float (float* p)
> +{
> +  float res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = __builtin_fmaxf (res, p[i]);
> +  return res;
> +}
> +
> +float
> +reduce_fmin_float (float* p)
> +{
> +  float res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = __builtin_fminf (res, p[i]);
> +  return res;
> +}
> +
> +double
> +reduce_fmax_double (double* p)
> +{
> +  double res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = __builtin_fmax (res, p[i]);
> +  return res;
> +}
> +
> +double
> +reduce_fmin_double (double* p)
> +{
> +  double res = p[0];
> +  for (int i = 0; i != 4; i++)
> +    res = __builtin_fmin (res, p[i]);
> +  return res;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.REDUC_MAX" 10 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "\.REDUC_MIN" 10 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "\.REDUC_FMAX" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "\.REDUC_FMIN" 2 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/s390/vector/reduc-plus-1.c 
> b/gcc/testsuite/gcc.target/s390/vector/reduc-plus-1.c
> new file mode 100644
> index 000000000000..ddbab23fecc6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/vector/reduc-plus-1.c
> @@ -0,0 +1,152 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mzarch -march=z14 -ftree-vectorize 
> -fdump-tree-optimized" } */
> +/* { dg-do run { target { s390_z14_hw } } } */
> +
> +/* signed integers */
> +
> +signed char
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_char (signed char* p)
> +{
> +  signed char sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +short
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_short (short* p)
> +{
> +  short sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +int
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_int (int* p)
> +{
> +  int sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +long
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_long (long* p)
> +{
> +  long sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +/* unsigned integers */
> +
> +unsigned char
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_uchar (unsigned char* p)
> +{
> +  unsigned char sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +unsigned short
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_ushort (unsigned short* p)
> +{
> +  unsigned short sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +unsigned int
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_uint (unsigned int* p)
> +{
> +  unsigned int sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +unsigned long
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_ulong (unsigned long* p)
> +{
> +  unsigned long sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +/* floating point */
> +
> +float
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_float (float* p)
> +{
> +  float sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +double
> +__attribute__((noipa, optimize("Ofast")))
> +reduce_add_double (double* p)
> +{
> +  double sum = 0;
> +  for (int i = 0; i != 16; i++)
> +    sum += p[i];
> +  return sum;
> +}
> +
> +int
> +main()
> +{
> +  signed char chararr[] = 
> {-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16};
> +  signed short shortarr[] = 
> {-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16};
> +  signed int intarr[] = 
> {-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16};
> +  signed long longarr[] = 
> {-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16};
> +
> +  unsigned char uchararr[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
> +  unsigned short ushortarr[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
> +  unsigned int uintarr[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
> +  unsigned long ulongarr[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
> +  
    ^
Trailing whitespace.

> +  float floatarr[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
> +  double doublearr[] = 
> {-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16};
> +
> +  if (reduce_add_char (chararr) != (-136 & 0xff))
> +    __builtin_abort();
> +  if (reduce_add_short (shortarr) != -136)
> +    __builtin_abort();
> +  if (reduce_add_int (intarr) != -136)
> +    __builtin_abort();
> +  if (reduce_add_long (longarr) != -136)
> +    __builtin_abort();
> +
> +  if (reduce_add_uchar (uchararr) != 136)
> +    __builtin_abort();
> +  if (reduce_add_ushort (ushortarr) != 136)
> +    __builtin_abort();
> +  if (reduce_add_uint (uintarr) != 136)
> +    __builtin_abort();
> +  if (reduce_add_ulong (ulongarr) != 136)
> +    __builtin_abort();
> +
> +  if (reduce_add_float (floatarr) != 136)
> +    __builtin_abort();
> +  if (reduce_add_double (doublearr) != -136)
> +    __builtin_abort();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.REDUC_PLUS" 10 "optimized" } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp 
> b/gcc/testsuite/lib/target-supports.exp
> index 956bc0bc7ca4..48c1be73e92a 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -9944,7 +9944,9 @@ proc check_effective_target_vect_logical_reduc { } {
>                  || [istarget amdgcn-*-*]
>                  || [check_effective_target_riscv_v]
>                  || [check_effective_target_loongarch_sx]
> -                || [check_effective_target_x86]}]
> +                || [check_effective_target_x86]
> +                || ([istarget s390*-*-*]
> +                    && [check_effective_target_s390_vx])}]

It should be enough to test for check_effective_target_s390_vx, since that
already includes the check for [istarget s390*-*-*].

Ok with those changes.

Thanks,
Stefan

>  }
>  
>  # Return 1 if the target supports the fold_extract_last optab.
> -- 
> 2.43.5
>

Re: [PATCH 2/2] s390: Implement reduction optabs

Reply via email to