This patch adds a series of built-in functions to allow users to write code to do a number of simple operations where the loop is done using the __vector_pair type. The __vector_pair type is an opaque type. These built-in functions keep the two 128-bit vectors within the __vector_pair together, and split the operation after register allocation.
This patch provides vector pair operations for 32-bit floating point and 64-bit floating point. I have built and tested these patches on: * A little endian power10 server using --with-cpu=power10 * A little endian power9 server using --with-cpu=power9 * A big endian power9 server using --with-cpu=power9. Can I check this patch into the master branch? 2023-11-09 Michael Meissner <meiss...@linux.ibm.com> gcc/ * config/rs6000/rs6000-builtins.def (__builtin_vpair_f32_*): Add vector pair built-in functions for float. (__builtin_vpair_f64_*): Add vector pair built-in functions for double. * config/rs6000/rs6000-protos.h (split_unary_vector_pair): Add declaration. (split_binary_vector_pair): Likewise. (split_fma_vector_pair): Likewise. * config/rs6000/rs6000.cc (split_unary_vector_pair): New helper function for vector pair built-in functions. (split_binary_vector_pair): Likewise. (split_fma_vector_pair): Likewise. * config/rs6000/rs6000.md (toplevel): Include vector-pair.md. * config/rs6000/t-rs6000 (MD_INCLUDES): Add vector-pair.md. * config/rs6000/vector-pair.md: New file. * doc/extend.texi (PowerPC Vector Pair Built-in Functions): Document the floating point and general vector pair built-in functions. gcc/testsuite/ * gcc.target/powerpc/vector-pair-1.c: New test. * gcc.target/powerpc/vector-pair-2.c: New test. * gcc.target/powerpc/vector-pair-3.c: New test. * gcc.target/powerpc/vector-pair-4.c: New test. 
--- gcc/config/rs6000/rs6000-builtins.def | 52 +++ gcc/config/rs6000/rs6000-protos.h | 5 + gcc/config/rs6000/rs6000.cc | 74 ++++ gcc/config/rs6000/rs6000.md | 1 + gcc/config/rs6000/t-rs6000 | 1 + gcc/config/rs6000/vector-pair.md | 329 ++++++++++++++++++ gcc/doc/extend.texi | 46 +++ .../gcc.target/powerpc/vector-pair-1.c | 135 +++++++ .../gcc.target/powerpc/vector-pair-2.c | 134 +++++++ .../gcc.target/powerpc/vector-pair-3.c | 60 ++++ .../gcc.target/powerpc/vector-pair-4.c | 60 ++++ 11 files changed, 897 insertions(+) create mode 100644 gcc/config/rs6000/vector-pair.md create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-1.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-2.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-3.c create mode 100644 gcc/testsuite/gcc.target/powerpc/vector-pair-4.c diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index ce40600e803..89b248b50ef 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -4131,3 +4131,55 @@ void __builtin_vsx_stxvp (v256, unsigned long, const v256 *); STXVP nothing {mma,pair} + +;; vector pair built-in functions for 8 32-bit float values + + v256 __builtin_vpair_f32_abs (v256); + VPAIR_F32_ABS vpair_abs_v8sf2 {mma,pair} + + v256 __builtin_vpair_f32_add (v256, v256); + VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair} + + v256 __builtin_vpair_f32_fma (v256, v256, v256); + VPAIR_F32_FMA vpair_fma_v8sf4 {mma,pair} + + v256 __builtin_vpair_f32_max (v256, v256); + VPAIR_F32_MAX vpair_smax_v8sf3 {mma,pair} + + v256 __builtin_vpair_f32_min (v256, v256); + VPAIR_F32_MIN vpair_smin_v8sf3 {mma,pair} + + v256 __builtin_vpair_f32_mul (v256, v256); + VPAIR_F32_MUL vpair_mul_v8sf3 {mma,pair} + + v256 __builtin_vpair_f32_neg (v256); + VPAIR_F32_NEG vpair_neg_v8sf2 {mma,pair} + + v256 __builtin_vpair_f32_sub (v256, v256); + VPAIR_F32_SUB vpair_sub_v8sf3 {mma,pair} + +;; vector pair built-in functions for 4 
64-bit double values + + v256 __builtin_vpair_f64_abs (v256); + VPAIR_F64_ABS vpair_abs_v4df2 {mma,pair} + + v256 __builtin_vpair_f64_add (v256, v256); + VPAIR_F64_ADD vpair_add_v4df3 {mma,pair} + + v256 __builtin_vpair_f64_fma (v256, v256, v256); + VPAIR_F64_FMA vpair_fma_v4df4 {mma,pair} + + v256 __builtin_vpair_f64_max (v256, v256); + VPAIR_F64_MAX vpair_smax_v4df3 {mma,pair} + + v256 __builtin_vpair_f64_min (v256, v256); + VPAIR_F64_MIN vpair_smin_v4df3 {mma,pair} + + v256 __builtin_vpair_f64_mul (v256, v256); + VPAIR_F64_MUL vpair_mul_v4df3 {mma,pair} + + v256 __builtin_vpair_f64_neg (v256); + VPAIR_F64_NEG vpair_neg_v4df2 {mma,pair} + + v256 __builtin_vpair_f64_sub (v256, v256); + VPAIR_F64_SUB vpair_sub_v4df3 {mma,pair} diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index f70118ea40f..bbd899d7562 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -138,6 +138,11 @@ extern void rs6000_emit_swsqrt (rtx, rtx, bool); extern void output_toc (FILE *, rtx, int, machine_mode); extern void rs6000_fatal_bad_address (rtx); extern rtx create_TOC_reference (rtx, rtx); +extern void split_unary_vector_pair (machine_mode, rtx [], rtx (*)(rtx, rtx)); +extern void split_binary_vector_pair (machine_mode, rtx [], + rtx (*)(rtx, rtx, rtx)); +extern void split_fma_vector_pair (machine_mode, rtx [], + rtx (*)(rtx, rtx, rtx, rtx)); extern void rs6000_split_multireg_move (rtx, rtx); extern void rs6000_emit_le_vsx_permute (rtx, rtx, machine_mode); extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index db60d3ca960..99352400197 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -27396,6 +27396,80 @@ rs6000_split_logical (rtx operands[3], return; } +/* Split a unary vector pair insn into two separate vector insns. */ + +void +split_unary_vector_pair (machine_mode mode, /* vector mode. 
*/ + rtx operands[], /* dest, src. */ + rtx (*func)(rtx, rtx)) /* create insn. */ +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + machine_mode orig_mode = GET_MODE (op0); + + rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0); + rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0); + rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16); + rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16); + + emit_insn (func (reg0_vector0, reg1_vector0)); + emit_insn (func (reg0_vector1, reg1_vector1)); + return; +} + +/* Split a binary vector pair insn into two separate vector insns. */ + +void +split_binary_vector_pair (machine_mode mode, /* vector mode. */ + rtx operands[], /* dest, src1, src2. */ + rtx (*func)(rtx, rtx, rtx)) /* create insn. */ +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + machine_mode orig_mode = GET_MODE (op0); + + rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0); + rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0); + rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0); + rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16); + rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16); + rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16); + + emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0)); + emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1)); + return; +} + +/* Split a fused multiply-add vector pair insn into two separate vector + insns. */ + +void +split_fma_vector_pair (machine_mode mode, /* vector mode. */ + rtx operands[], /* dest, src1, src2, src3. */ + rtx (*func)(rtx, rtx, rtx, rtx)) /* create insn. 
*/ +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + rtx op3 = operands[3]; + machine_mode orig_mode = GET_MODE (op0); + + rtx reg0_vector0 = simplify_gen_subreg (mode, op0, orig_mode, 0); + rtx reg1_vector0 = simplify_gen_subreg (mode, op1, orig_mode, 0); + rtx reg2_vector0 = simplify_gen_subreg (mode, op2, orig_mode, 0); + rtx reg3_vector0 = simplify_gen_subreg (mode, op3, orig_mode, 0); + + rtx reg0_vector1 = simplify_gen_subreg (mode, op0, orig_mode, 16); + rtx reg1_vector1 = simplify_gen_subreg (mode, op1, orig_mode, 16); + rtx reg2_vector1 = simplify_gen_subreg (mode, op2, orig_mode, 16); + rtx reg3_vector1 = simplify_gen_subreg (mode, op3, orig_mode, 16); + + emit_insn (func (reg0_vector0, reg1_vector0, reg2_vector0, reg3_vector0)); + emit_insn (func (reg0_vector1, reg1_vector1, reg2_vector1, reg3_vector1)); + return; +} + /* Emit instructions to move SRC to DST. Called by splitters for multi-register moves. It will emit at most one instruction for each register that is accessed; that is, it won't emit li/lis pairs diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index dcf1f3526f5..5a17adc1bc3 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -15767,6 +15767,7 @@ (define_insn "hashchk" (include "vsx.md") (include "altivec.md") (include "mma.md") +(include "vector-pair.md") (include "dfp.md") (include "crypto.md") (include "htm.md") diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index f183b42ce1d..5fc89499795 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -128,6 +128,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \ $(srcdir)/config/rs6000/vsx.md \ $(srcdir)/config/rs6000/altivec.md \ $(srcdir)/config/rs6000/mma.md \ + $(srcdir)/config/rs6000/vector-pair.md \ $(srcdir)/config/rs6000/crypto.md \ $(srcdir)/config/rs6000/htm.md \ $(srcdir)/config/rs6000/dfp.md \ diff --git a/gcc/config/rs6000/vector-pair.md 
b/gcc/config/rs6000/vector-pair.md new file mode 100644 index 00000000000..2dcac6a31e2 --- /dev/null +++ b/gcc/config/rs6000/vector-pair.md @@ -0,0 +1,329 @@ +;; Vector pair arithmetic support. +;; Copyright (C) 2020-2023 Free Software Foundation, Inc. +;; Contributed by Peter Bergner <berg...@linux.ibm.com> and +;; Michael Meissner <meiss...@linux.ibm.com> +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. +;; +;; This file adds support for doing vector operations on pairs of vector +;; registers. Most of the instructions use vector pair instructions to load +;; and possibly store registers, but splitting the operation after register +;; allocation to do 2 separate operations. The second scheduler pass can +;; interleave other instructions between these pairs of instructions if +;; possible. 
+ +(define_c_enum "unspec" + [UNSPEC_VPAIR_V4DF + UNSPEC_VPAIR_V8SF + ]) + +;; Iterator doing unary/binary arithmetic on vector pairs +(define_code_iterator VP_FP_UNARY [abs neg]) +(define_code_iterator VP_FP_BINARY [minus mult plus smin smax]) + +;; Return the insn name from the VP_* code iterator +(define_code_attr vp_insn [(abs "abs") + (minus "sub") + (mult "mul") + (neg "neg") + (plus "add") + (smin "smin") + (smax "smax") + (xor "xor")]) + +;; Iterator for creating the unspecs for vector pair built-ins +(define_int_iterator VP_FP [UNSPEC_VPAIR_V4DF + UNSPEC_VPAIR_V8SF]) + +;; Map VP_* to vector mode of the arguments after they are split +(define_int_attr VP_VEC_MODE [(UNSPEC_VPAIR_V4DF "V2DF") + (UNSPEC_VPAIR_V8SF "V4SF")]) + +;; Map VP_* to a lower case name to identify the vector pair. +(define_int_attr vp_pmode [(UNSPEC_VPAIR_V4DF "v4df") + (UNSPEC_VPAIR_V8SF "v8sf")]) + +;; Map VP_* to a lower case name to identify the vector after the vector pair +;; has been split. +(define_int_attr vp_vmode [(UNSPEC_VPAIR_V4DF "v2df") + (UNSPEC_VPAIR_V8SF "v4sf")]) + + +;; Vector pair floating point unary operations +(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>2" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO [(VP_FP_UNARY:OO + (match_operand:OO 1 "vsx_register_operand" "wa"))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_unary_vector_pair (<VP_VEC_MODE>mode, operands, + gen_<vp_insn><vp_vmode>2); + DONE; +} + [(set_attr "length" "8")]) + +;; Optimize vector pair negate of absolute value +(define_insn_and_split "vpair_nabs_<vp_pmode>2" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(abs:OO (match_operand:OO 1 "vsx_register_operand" "wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_unary_vector_pair (<VP_VEC_MODE>mode, operands, + gen_vsx_nabs<vp_vmode>2); + DONE; +} + [(set_attr "length" "8")]) + 
+;; Vector pair floating binary operations +(define_insn_and_split "vpair_<vp_insn>_<vp_pmode>3" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa") + (unspec:OO [(VP_FP_BINARY:OO + (match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:OO 2 "vsx_register_operand" "wa"))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_binary_vector_pair (<VP_VEC_MODE>mode, operands, + gen_<vp_insn><vp_vmode>3); + DONE; +} + [(set_attr "length" "8")]) + +;; Vector pair fused multiply-add floating point operations +(define_insn_and_split "vpair_fma_<vp_pmode>4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(fma:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_fma_vector_pair (<VP_VEC_MODE>mode, operands, + gen_fma<vp_vmode>4); + DONE; +} + [(set_attr "length" "8")]) + +(define_insn_and_split "vpair_fms_<vp_pmode>4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(fma:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (unspec:OO + [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_fma_vector_pair (<VP_VEC_MODE>mode, operands, + gen_fms<vp_vmode>4); + DONE; +} + [(set_attr "length" "8")]) + +(define_insn_and_split "vpair_nfma_<vp_pmode>4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(fma:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + 
split_fma_vector_pair (<VP_VEC_MODE>mode, operands, + gen_nfma<vp_vmode>4); + DONE; +} + [(set_attr "length" "8")]) + +(define_insn_and_split "vpair_nfms_<vp_pmode>4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(fma:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0") + (unspec:OO + [(neg:OO (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + VP_FP))] + "TARGET_MMA" + "#" + "&& reload_completed" + [(const_int 0)] +{ + split_fma_vector_pair (<VP_VEC_MODE>mode, operands, + gen_nfms<vp_vmode>4); + DONE; +} + [(set_attr "length" "8")]) + +;; Optimize vector pair (a * b) + c into vector pair fma (a, b, c). +(define_insn_and_split "*vpair_fma_fpcontract_<vp_pmode>4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(plus:OO + (unspec:OO + [(mult:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0"))] + VP_FP) + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:OO + [(fma:OO + (match_dup 1) + (match_dup 2) + (match_dup 3))] + VP_FP))] +{ +} + [(set_attr "length" "8")]) + +;; Optimize vector pair (a * b) - c into vector pair fma (a, b, -c) +(define_insn_and_split "*vpair_fms_fpcontract_<vp_pmode>4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(minus:OO + (unspec:OO + [(mult:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0"))] + VP_FP) + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:OO + [(fma:OO + (match_dup 1) + (match_dup 2) + (unspec:OO + [(neg:OO + (match_dup 3))] + VP_FP))] + VP_FP))] +{ +} + [(set_attr "length" 
"8")]) + + +;; Optimize vector pair -((a * b) + c) into vector pair -fma (a, b, c). +(define_insn_and_split "*vpair_nfma_fpcontract_<vp_pmode>4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(plus:OO + (unspec:OO + [(mult:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0"))] + VP_FP) + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:OO + [(neg:OO + (unspec:OO + [(fma:OO + (match_dup 1) + (match_dup 2) + (match_dup 3))] + VP_FP))] + VP_FP))] +{ +} + [(set_attr "length" "8")]) + +;; Optimize vector pair -((a * b) - c) into vector pair -fma (a, b, -c) +(define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4" + [(set (match_operand:OO 0 "vsx_register_operand" "=wa,wa") + (unspec:OO + [(neg:OO + (unspec:OO + [(minus:OO + (unspec:OO + [(mult:OO + (match_operand:OO 1 "vsx_register_operand" "%wa,wa") + (match_operand:OO 2 "vsx_register_operand" "wa,0"))] + VP_FP) + (match_operand:OO 3 "vsx_register_operand" "0,wa"))] + VP_FP))] + VP_FP))] + "TARGET_MMA && flag_fp_contract_mode == FP_CONTRACT_FAST" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:OO + [(neg:OO + (unspec:OO + [(fma:OO + (match_dup 1) + (match_dup 2) + (unspec:OO + [(neg:OO + (match_dup 3))] + VP_FP))] + VP_FP))] + VP_FP))] +{ +} + [(set_attr "length" "8")]) diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 7cdfdf8c83b..a830ad06b90 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -15038,6 +15038,7 @@ instructions, but allow the compiler to schedule those calls. 
* NDS32 Built-in Functions:: * Nvidia PTX Built-in Functions:: * Basic PowerPC Built-in Functions:: +* PowerPC Vector Pair Built-in Functions Available on ISA 3.1:: * PowerPC AltiVec/VSX Built-in Functions:: * PowerPC Hardware Transactional Memory Built-in Functions:: * PowerPC Atomic Memory Operation Functions:: @@ -21368,6 +21369,51 @@ int vec_any_le (vector unsigned __int128, vector unsigned __int128); @end smallexample +@node PowerPC Vector Pair Built-in Functions Available on ISA 3.1 +@subsection PowerPC Vector Pair Built-in Functions Available on ISA 3.1 + +GCC provides functions to speed up processing by using the type +@code{__vector_pair} to hold two 128-bit vectors on processors that +support ISA 3.1 (power10). The @code{__vector_pair} type and the +vector pair built-in functions require the MMA instruction set +(@option{-mmma}) to be enabled, which is on by default for +@option{-mcpu=power10}. + +By default, @code{__vector_pair} types are loaded into vectors with a +single load vector pair instruction. The processing for the built-in +function is done as two separate vector instructions on each of the +two 128-bit vectors stored in the vector pair. The +@code{__vector_pair} type is usually stored with a single vector pair +store instruction. 
+ +The following built-in functions operate on pairs of +@code{vector float} values: + +@smallexample +__vector_pair __builtin_vpair_f32_abs (__vector_pair); +__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_max (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_min (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_mul (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f32_neg (__vector_pair); +__vector_pair __builtin_vpair_f32_sub (__vector_pair, __vector_pair); +@end smallexample + +The following built-in functions operate on pairs of +@code{vector double} values: + +@smallexample +__vector_pair __builtin_vpair_f64_abs (__vector_pair); +__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_mul (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_neg (__vector_pair); +__vector_pair __builtin_vpair_f64_max (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_min (__vector_pair, __vector_pair); +__vector_pair __builtin_vpair_f64_sub (__vector_pair, __vector_pair); +@end smallexample + @node PowerPC Hardware Transactional Memory Built-in Functions @subsection PowerPC Hardware Transactional Memory Built-in Functions GCC provides two interfaces for accessing the Hardware Transactional diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c new file mode 100644 index 00000000000..e74840cebc0 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-1.c @@ -0,0 +1,135 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector built-in code generates the expected 
instructions for + vector pairs with 4 double elements. */ + +void +test_add (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvadddp, 1 stxvp. */ + *dest = __builtin_vpair_f64_add (*x, *y); +} + +void +test_sub (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvsubdp, 1 stxvp. */ + *dest = __builtin_vpair_f64_sub (*x, *y); +} + +void +test_multiply (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmuldp, 1 stxvp. */ + *dest = __builtin_vpair_f64_mul (*x, *y); +} + +void +test_min (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmindp, 1 stxvp. */ + *dest = __builtin_vpair_f64_min (*x, *y); +} + +void +test_max (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmaxdp, 1 stxvp. */ + *dest = __builtin_vpair_f64_max (*x, *y); +} + +void +test_negate (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvnegdp, 1 stxvp. */ + *dest = __builtin_vpair_f64_neg (*x); +} + +void +test_abs (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvabsdp, 1 stxvp. */ + *dest = __builtin_vpair_f64_abs (*x); +} + +void +test_negative_abs (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvnabsdp, 1 stxvp. */ + __vector_pair ab = __builtin_vpair_f64_abs (*x); + *dest = __builtin_vpair_f64_neg (ab); +} + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */ + *dest = __builtin_vpair_f64_fma (*x, *y, *z); +} + +void +test_fms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f64_neg (*z); + *dest = __builtin_vpair_f64_fma (*x, *y, n); +} + +void +test_nfma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. 
*/ + __vector_pair w = __builtin_vpair_f64_fma (*x, *y, *z); + *dest = __builtin_vpair_f64_neg (w); +} + +void +test_nfms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f64_neg (*z); + __vector_pair w = __builtin_vpair_f64_fma (*x, *y, n); + *dest = __builtin_vpair_f64_neg (w); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 25 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 12 } } */ +/* { dg-final { scan-assembler-times {\mxvabsdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvadddp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmaxdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmindp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmuldp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnabsdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnegdp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvsubdp\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c new file mode 100644 index 00000000000..2facb727053 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-2.c @@ -0,0 +1,134 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* Test whether the vector built-in code generates the expected instructions for + vector pairs with 8 float elements. */ + +void +test_add (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvaddsp, 1 stxvp. 
*/ + *dest = __builtin_vpair_f32_add (*x, *y); +} + +void +test_sub (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvsubsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_sub (*x, *y); +} + +void +test_multiply (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmulsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_mul (*x, *y); +} + +void +test_max (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvmaxsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_max (*x, *y); +} + +void +test_min (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y) +{ + /* 2 lxvp, 2 xvminsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_min (*x, *y); +} + +void +test_negate (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvnegsp, 1 stxvp. */ + *dest = __builtin_vpair_f32_neg (*x); +} + +void +test_abs (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvabssp, 1 stxvp. */ + *dest = __builtin_vpair_f32_abs (*x); +} + +void +test_negative_abs (__vector_pair *dest, + __vector_pair *x) +{ + /* 1 lxvp, 2 xvnabssp, 1 stxvp. */ + __vector_pair ab = __builtin_vpair_f32_abs (*x); + *dest = __builtin_vpair_f32_neg (ab); +} + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. */ + *dest = __builtin_vpair_f32_fma (*x, *y, *z); +} + +void +test_fms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f32_neg (*z); + *dest = __builtin_vpair_f32_fma (*x, *y, n); +} + +void +test_nfma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. 
*/ + __vector_pair w = __builtin_vpair_f32_fma (*x, *y, *z); + *dest = __builtin_vpair_f32_neg (w); +} + +void +test_nfms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */ + __vector_pair n = __builtin_vpair_f32_neg (*z); + __vector_pair w = __builtin_vpair_f32_fma (*x, *y, n); + *dest = __builtin_vpair_f32_neg (w); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 25 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 12 } } */ +/* { dg-final { scan-assembler-times {\mxvabssp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvaddsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmaxsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvminsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmulsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnabssp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnegsp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c new file mode 100644 index 00000000000..65bfc44f85d --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-3.c @@ -0,0 +1,60 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */ + +/* Test whether the vector built-in code combines multiply, add/subtract, and + negate operations to the appropriate fused multiply-add instruction for + vector pairs with 4 double elements. */ + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. 
*/ + __vector_pair m = __builtin_vpair_f64_mul (*x, *y); + *dest = __builtin_vpair_f64_add (m, *z); +} + +void +test_fms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */ + __vector_pair m = __builtin_vpair_f64_mul (*x, *y); + *dest = __builtin_vpair_f64_sub (m, *z); +} + +void +test_nfma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */ + __vector_pair m = __builtin_vpair_f64_mul (*x, *y); + __vector_pair w = __builtin_vpair_f64_add (m, *z); + *dest = __builtin_vpair_f64_neg (w); +} + +void +test_nfms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */ + __vector_pair m = __builtin_vpair_f64_mul (*x, *y); + __vector_pair w = __builtin_vpair_f64_sub (m, *z); + *dest = __builtin_vpair_f64_neg (w); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c new file mode 100644 index 00000000000..b62871be1fd --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vector-pair-4.c @@ -0,0 +1,60 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -Ofast" } */ + +/* Test whether the vector built-in code combines multiply, add/subtract, and + negate operations to the appropriate fused multiply-add instruction for + vector pairs with 8 float elements. 
*/ + +void +test_fma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. */ + __vector_pair m = __builtin_vpair_f32_mul (*x, *y); + *dest = __builtin_vpair_f32_add (m, *z); +} + +void +test_fms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. */ + __vector_pair m = __builtin_vpair_f32_mul (*x, *y); + *dest = __builtin_vpair_f32_sub (m, *z); +} + +void +test_nfma (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */ + __vector_pair m = __builtin_vpair_f32_mul (*x, *y); + __vector_pair w = __builtin_vpair_f32_add (m, *z); + *dest = __builtin_vpair_f32_neg (w); +} + +void +test_nfms (__vector_pair *dest, + __vector_pair *x, + __vector_pair *y, + __vector_pair *z) +{ + /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */ + __vector_pair m = __builtin_vpair_f32_mul (*x, *y); + __vector_pair w = __builtin_vpair_f32_sub (m, *z); + *dest = __builtin_vpair_f32_neg (w); +} + +/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */ +/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */ +/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */ -- 2.41.0 -- Michael Meissner, IBM PO Box 98, Ayer, Massachusetts, USA, 01432 email: meiss...@linux.ibm.com