This patch adds a series of built-in functions to allow users to write code to do a number of simple operations where the loop is done using the __vector_pair type. The __vector_pair type is an opaque type. These built-in functions keep the two 128-bit vectors within the __vector_pair together, and split the operation after register allocation.
This patch provides vector pair built-in functions to do a horizontal add on vector pair elements. Only floating point and 64-bit horizontal adds are provided in this patch. I have built and tested these patches on: * A little endian power10 server using --with-cpu=power10 * A little endian power9 server using --with-cpu=power9 * A big endian power9 server using --with-cpu=power9. Can I check this patch into the master branch after the preceding patches have been checked in? 2023-11-08 Michael Meissner <meiss...@linux.ibm.com> gcc/ * config/rs6000/rs6000-builtins.def (__builtin_vpair_f32_add_elements): New built-in function. (__builtin_vpair_f64_add_elements): Likewise. (__builtin_vpair_i64_add_elements): Likewise. (__builtin_vpair_i64u_add_elements): Likewise. * config/rs6000/vector-pair.md (UNSPEC_VPAIR_REDUCE_PLUS_F32): New unspec. (UNSPEC_VPAIR_REDUCE_PLUS_F64): Likewise. (UNSPEC_VPAIR_REDUCE_PLUS_I64): Likewise. (vpair_reduc_plus_scale_v8sf): New insn. (vpair_reduc_plus_scale_v4df): Likewise. (vpair_reduc_plus_scale_v4di): Likewise. * doc/extend.texi (__builtin_vpair_f32_add_elements): Document. (__builtin_vpair_f64_add_elements): Likewise. (__builtin_vpair_i64_add_elements): Likewise. gcc/testsuite/ * gcc.target/powerpc/vector-pair-16.c: New test. 
--- gcc/config/rs6000/rs6000-builtins.def | 12 +++ gcc/config/rs6000/vector-pair.md | 94 +++++++++++++++++++ gcc/doc/extend.texi | 3 + .../gcc.target/powerpc/vector-pair-16.c | 45 +++++++++ 4 files changed, 154 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-16.c diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index fbd416ceb87..b9a16c01420 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -4145,6 +4145,9 @@ v256 __builtin_vpair_f32_add (v256, v256); VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair} + float __builtin_vpair_f32_add_elements (v256); + VPAIR_F32_ADD_ELEMENTS vpair_reduc_plus_scale_v8sf {mma,pair} + v256 __builtin_vpair_f32_assemble (vf, vf); VPAIR_F32_ASSEMBLE vpair_assemble_v8sf {mma,pair} @@ -4180,6 +4183,9 @@ v256 __builtin_vpair_f64_add (v256, v256); VPAIR_F64_ADD vpair_add_v4df3 {mma,pair} + double __builtin_vpair_f64_add_elements (v256); + VPAIR_F64_ADD_ELEMENTS vpair_reduc_plus_scale_v4df {mma,pair} + v256 __builtin_vpair_f64_assemble (vd, vd); VPAIR_F64_ASSEMBLE vpair_assemble_v4df {mma,pair} @@ -4375,6 +4381,9 @@ v256 __builtin_vpair_f64_assemble (vd, vd); v256 __builtin_vpair_i64_add (v256, v256); VPAIR_I64_ADD vpair_add_v4di3 {mma,pair} + long long __builtin_vpair_i64_add_elements (v256); + VPAIR_I64_ADD_ELEMENTS vpair_reduc_plus_scale_v4di {mma,pair,no32bit} + v256 __builtin_vpair_i64_and (v256, v256); VPAIR_I64_AND vpair_and_v4di3 {mma,pair} @@ -4408,6 +4417,9 @@ v256 __builtin_vpair_f64_assemble (vd, vd); v256 __builtin_vpair_i64_xor (v256, v256); VPAIR_I64_XOR vpair_xor_v4di3 {mma,pair} + unsigned long long __builtin_vpair_i64u_add_elements (v256); + VPAIR_I64U_ADD_ELEMENTS vpair_reduc_plus_scale_v4di {mma,pair,no32bit} + v256 __builtin_vpair_i64u_assemble (vull, vull); VPAIR_I64U_ASSEMBLE vpair_assemble_v4di {mma,pair} diff --git a/gcc/config/rs6000/vector-pair.md b/gcc/config/rs6000/vector-pair.md index 
f6d0b2a39fc..b5e9330e71f 100644 --- a/gcc/config/rs6000/vector-pair.md +++ b/gcc/config/rs6000/vector-pair.md @@ -35,6 +35,9 @@ (define_c_enum "unspec" UNSPEC_VPAIR_V4DI UNSPEC_VPAIR_ZERO UNSPEC_VPAIR_SPLAT + UNSPEC_VPAIR_REDUCE_PLUS_F32 + UNSPEC_VPAIR_REDUCE_PLUS_F64 + UNSPEC_VPAIR_REDUCE_PLUS_I64 ]) ;; Iterator doing unary/binary arithmetic on vector pairs @@ -577,6 +580,67 @@ (define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4" } [(set_attr "length" "8")]) + +;; Add all elements in a pair of V4SF vectors. +(define_insn_and_split "vpair_reduc_plus_scale_v8sf" + [(set (match_operand:SF 0 "vsx_register_operand" "=wa") + (unspec:SF [(match_operand:OO 1 "vsx_register_operand" "v")] + UNSPEC_VPAIR_REDUCE_PLUS_F32)) + (clobber (match_scratch:V4SF 2 "=&v")) + (clobber (match_scratch:V4SF 3 "=&v"))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(pc)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx tmp1 = operands[2]; + rtx tmp2 = operands[3]; + unsigned r = reg_or_subregno (op1); + rtx op1_hi = gen_rtx_REG (V4SFmode, r); + rtx op1_lo = gen_rtx_REG (V4SFmode, r + 1); + + emit_insn (gen_addv4sf3 (tmp1, op1_hi, op1_lo)); + emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (8))); + emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2)); + emit_insn (gen_altivec_vsldoi_v4sf (tmp1, tmp2, tmp2, GEN_INT (4))); + emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2)); + emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2)); + DONE; +} + [(set_attr "length" "24")]) + +;; Add all elements in a pair of V2DF vectors (3 insns after the split). +(define_insn_and_split "vpair_reduc_plus_scale_v4df" + [(set (match_operand:DF 0 "vsx_register_operand" "=&wa") + (unspec:DF [(match_operand:OO 1 "vsx_register_operand" "wa")] + UNSPEC_VPAIR_REDUCE_PLUS_F64)) + (clobber (match_scratch:DF 2 "=&wa")) + (clobber (match_scratch:V2DF 3 "=&wa"))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(set (match_dup 3) + (plus:V2DF (match_dup 4) + (match_dup 5))) + (set (match_dup 2) + (vec_select:DF (match_dup 3) + (parallel 
[(match_dup 6)]))) + (set (match_dup 0) + (plus:DF (match_dup 7) + (match_dup 2)))] +{ + unsigned reg1 = reg_or_subregno (operands[1]); + unsigned reg3 = reg_or_subregno (operands[3]); + + operands[4] = gen_rtx_REG (V2DFmode, reg1); + operands[5] = gen_rtx_REG (V2DFmode, reg1 + 1); + operands[6] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0); + operands[7] = gen_rtx_REG (DFmode, reg3); +} + [(set_attr "length" "12")]) + ;; Vector pair integer negate support. (define_insn_and_split "vpair_neg_<vp_pmode>2" @@ -786,3 +850,33 @@ (define_insn_and_split "*vpair_nor_<vp_pmode>_2" DONE; } [(set_attr "length" "8")]) + +;; Add all elements in a pair of V2DI vectors +(define_insn_and_split "vpair_reduc_plus_scale_v4di" + [(set (match_operand:DI 0 "gpc_reg_operand" "=&r") + (unspec:DI [(match_operand:OO 1 "altivec_register_operand" "v")] + UNSPEC_VPAIR_REDUCE_PLUS_I64)) + (clobber (match_scratch:V2DI 2 "=&v")) + (clobber (match_scratch:DI 3 "=&r"))] + "TARGET_MMA && TARGET_POWERPC64" + "#" + "&& reload_completed" + [(set (match_dup 2) + (plus:V2DI (match_dup 4) + (match_dup 5))) + (set (match_dup 3) + (vec_select:DI (match_dup 2) + (parallel [(const_int 0)]))) + (set (match_dup 0) + (vec_select:DI (match_dup 2) + (parallel [(const_int 1)]))) + (set (match_dup 0) + (plus:DI (match_dup 0) + (match_dup 3)))] +{ + unsigned reg1 = reg_or_subregno (operands[1]); + + operands[4] = gen_rtx_REG (V2DImode, reg1); + operands[5] = gen_rtx_REG (V2DImode, reg1 + 1); +} + [(set_attr "length" "16")]) diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 600e2c393db..0e6e74b8087 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -21399,6 +21399,7 @@ The following built-in functions operate on pairs of @smallexample __vector_pair __builtin_vpair_f32_abs (__vector_pair); __vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair); +float __builtin_vpair_f32_add_elements (__vector_pair); __vector_pair __builtin_vpair_f32_assemble (vector float, vector float); vector float __builtin_vpair_f32_extract_vector 
(__vector_pair, int); __vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair); @@ -21416,6 +21417,7 @@ The following built-in functions operate on pairs of @smallexample __vector_pair __builtin_vpair_f64_abs (__vector_pair); __vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair); +double __builtin_vpair_f64_add_elements (__vector_pair); __vector_pair __builtin_vpair_f64_assemble (vector double, vector double); vector double __builtin_vpair_f64_extract_vector (__vector_pair, int); __vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair); @@ -21432,6 +21434,7 @@ The following built-in functions operate on pairs of @smallexample __vector_pair __builtin_vpair_i64_add (__vector_pair, __vector_pair); +long long __builtin_vpair_i64_add_elements (__vector_pair); __vector_pair __builtin_vpair_i64_and (__vector_pair, __vector_pair); __vector_pair __builtin_vpair_i64_assemble (vector long long, vector long long); diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-16.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-16.c new file mode 100644 index 00000000000..a8c206c4093 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-16.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test vector pair built-in functions to do a horizontal add of the + elements. */ + +float +f32_add_elements (__vector_pair *p) +{ + /* 1 lxvp, 1 xvaddsp, 2 vsldoi, 2 xvaddsp, 1 xscvspdp. */ + return __builtin_vpair_f32_add_elements (*p); +} + +double +f64_add_elements (__vector_pair *p) +{ + /* 1 lxvp, 1 xvadddp, 1 xxpermdi, 1 fadd/xsadddp. */ + return __builtin_vpair_f64_add_elements (*p); +} + +long long +i64_add_elements (__vector_pair *p) +{ + /* 1 lxvp, 1 vaddudm, 1 mfvsrld, 1 mfvsrd, 1 add. 
*/ + return __builtin_vpair_i64_add_elements (*p); +} + +unsigned long long +i64u_add_elements (__vector_pair *p) +{ + /* 1 lxvp, 1 vaddudm, 1 mfvsrld, 1 mfvsrd, 1 add. */ + return __builtin_vpair_i64u_add_elements (*p); +} + +/* { dg-final { scan-assembler-times {\mfadd\M|\mxsadddp\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mlxvp\M} 4 } } */ +/* { dg-final { scan-assembler-times {\mmfvsrd\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mmfvsrld\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mvaddudm\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mvsldoi\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxscvspdp\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mxvadddp\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mxvaddsp\M} 3 } } */ +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */ -- 2.41.0 -- Michael Meissner, IBM PO Box 98, Ayer, Massachusetts, USA, 01432 email: meiss...@linux.ibm.com