https://gcc.gnu.org/g:5fc9771c32162e44542a798f0f4ccd4ee9f4a8ba
commit 5fc9771c32162e44542a798f0f4ccd4ee9f4a8ba Author: Michael Meissner <[email protected]> Date: Fri Oct 17 13:04:46 2025 -0400 Add unary, fma 16-bit floating point vector optimization. 2025-10-17 Michael Meissner <[email protected]> gcc/ * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): Add support for vectorizing unary and fma 16-bit floating point. (fp16_vectorization): Likewise. * config/rs6000/float16.md (neg<mode>2, VFP16_HW iterator): Likewise. (*xor<mode>3, VFP16_HW iterator): Likewise. (abs<mode>2, VFP16_HW iterator): Likewise. (*andc<mode>3, VFP16_HW iterator): Likewise. (*neg_<fp16_names><mode>3): Likewise. (*abs_<fp16_names><mode>3): Likewise. (fma<mode>4): Likewise. (*fms<mode>4): Likewise. (*nfma<mode>4): Likewise. (*nfms<mode>4): Likewise. * config/rs6000/rs6000-protos.h (FP16_ABS_BINARY): Likewise. (FP16_NEG_BINARY): Likewise. Diff: --- gcc/config/rs6000/float16.cc | 27 +++++ gcc/config/rs6000/float16.md | 202 ++++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000-protos.h | 2 + 3 files changed, 231 insertions(+) diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc index 3fb61e9e4621..80f580388e57 100644 --- a/gcc/config/rs6000/float16.cc +++ b/gcc/config/rs6000/float16.cc @@ -88,6 +88,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode, n_opts = 3; break; + case FP16_ABS_BINARY: + case FP16_NEG_BINARY: default: gcc_unreachable (); } @@ -174,6 +176,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode, } break; + case FP16_ABS_BINARY: + case FP16_NEG_BINARY: default: gcc_unreachable (); } @@ -212,6 +216,7 @@ fp16_vectorization (enum rtx_code icode, { gcc_assert (can_create_pseudo_p ()); + enum rtx_code unary_op = UNKNOWN; machine_mode result_mode = GET_MODE (result); rtx op_orig[3] = { op1, op2, op3 }; rtx op_hi[3]; @@ -226,6 +231,16 @@ fp16_vectorization (enum rtx_code icode, n_opts = 2; break; + case FP16_NEG_BINARY: + n_opts = 2; + unary_op = NEG; + break; + + case 
FP16_ABS_BINARY: + n_opts = 2; + unary_op = ABS; + break; + case FP16_FMA: case FP16_FMS: case FP16_NFMA: @@ -274,6 +289,8 @@ fp16_vectorization (enum rtx_code icode, switch (subtype) { case FP16_BINARY: + case FP16_NEG_BINARY: + case FP16_ABS_BINARY: emit_insn (gen_rtx_SET (result_hi, gen_rtx_fmt_ee (icode, V4SFmode, op_hi[0], @@ -322,6 +339,16 @@ fp16_vectorization (enum rtx_code icode, gcc_unreachable (); } + /* Add any unary operator modifications. */ + if (unary_op != UNKNOWN) + { + emit_insn (gen_rtx_SET (result_hi, + gen_rtx_fmt_e (unary_op, V4SFmode, result_hi))); + + emit_insn (gen_rtx_SET (result_lo, + gen_rtx_fmt_e (unary_op, V4SFmode, result_lo))); + } + /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector. */ if (result_mode == V8HFmode) emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo)); diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md index 1ea070c4486e..cb684a41ec47 100644 --- a/gcc/config/rs6000/float16.md +++ b/gcc/config/rs6000/float16.md @@ -706,6 +706,104 @@ }) ;; Add vectorization support for 16-bit floating point. 
+
+;; Negate vector bfloat16/float16 by XORing each element with -0.0.
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+        (neg:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+        (match_dup 3))
+   (set (match_dup 0)
+        (xor:VFP16_HW (match_dup 1)
+                      (match_dup 2)))]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  /* Operand 3 is a vector of -0.0, i.e. just the sign bit of each element.
+     Parse outside gcc_assert, which need not evaluate its argument.  */
+  REAL_VALUE_TYPE dconst;
+  int status = real_from_string (&dconst, "-0.0");
+  gcc_assert (status == 0);
+
+  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
+  rtx vneg0 = gen_const_vec_duplicate (<MODE>mode, neg0);
+
+  /* Unless prefixed instructions are available, load the mask from the
+     constant pool.  */
+  if (!TARGET_PREFIXED)
+    vneg0 = force_const_mem (<MODE>mode, vneg0);
+
+  operands[3] = vneg0;
+}
+  [(set_attr "type" "veclogical")
+   (set_attr "length" "16")])
+
+;; XOR used to negate a 16-bit floating point type
+
+(define_insn "*xor<mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+        (xor:VFP16_HW (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
+                      (match_operand:VFP16_HW 2 "vsx_register_operand" "wa")))]
+  ""
+  "xxlxor %x0,%x1,%x2"
+  [(set_attr "type" "veclogical")])
+
+;; 16-bit floating point vector absolute value (clear each sign bit).
+
+(define_insn_and_split "abs<mode>2"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+        (abs:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+        (match_dup 3))
+   (set (match_dup 0)
+        (and:VFP16_HW (not:VFP16_HW (match_dup 2))
+                      (match_dup 1)))]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  /* Operand 3 is a vector of -0.0, i.e. just the sign bit of each element.
+     Parse outside gcc_assert, which need not evaluate its argument.  */
+  REAL_VALUE_TYPE dconst;
+  int status = real_from_string (&dconst, "-0.0");
+  gcc_assert (status == 0);
+
+  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
+  rtx vneg0 = gen_const_vec_duplicate (<MODE>mode, neg0);
+
+  /* Unless prefixed instructions are available, load the mask from the
+     constant pool.  */
+  if (!TARGET_PREFIXED)
+    vneg0 = force_const_mem (<MODE>mode, vneg0);
+
+  operands[3] = vneg0;
+}
+  [(set_attr "type" "veclogical")
+   (set_attr "length" "16")])
+
+;; ANDC used to clear the sign bit of a 16-bit floating point type
+;; for absolute value.  The NOT operand is first, as canonical RTL requires.
+
+(define_insn "*andc<mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+        (and:VFP16_HW
+         (not:VFP16_HW (match_operand:VFP16_HW 2 "vsx_register_operand" "wa"))
+         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))]
+  ""
+  "xxlandc %x0,%x1,%x2"
+  [(set_attr "type" "veclogical")])
+
 ;; Binary operators being vectorized.
 (define_insn_and_split "<fp16_names><mode>3"
   [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
@@ -722,6 +820,110 @@
   DONE;
 })
 
+;; Negative of binary operators being vectorized.
+(define_insn_and_split "*neg_<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+        (neg:VFP16_HW
+         (FP16_BINARY_OP:VFP16_HW
+          (match_operand:VFP16_HW 1 "vsx_register_operand")
+          (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
+                      NULL_RTX, FP16_NEG_BINARY);
+  DONE;
+})
+
+;; Absolute value of binary operators being vectorized.
+(define_insn_and_split "*abs_<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+        (abs:VFP16_HW
+         (FP16_BINARY_OP:VFP16_HW
+          (match_operand:VFP16_HW 1 "vsx_register_operand")
+          (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
+                      NULL_RTX, FP16_ABS_BINARY);
+  DONE;
+})
+
+;; FMA operations being vectorized.
+(define_insn_and_split "fma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+        (fma:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")
+         (match_operand:VFP16_HW 3 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                      operands[3], FP16_FMA);  /* (a * b) + c.  */
+  DONE;
+})
+
+(define_insn_and_split "*fms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+        (fma:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")
+         (neg:VFP16_HW
+          (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                      operands[3], FP16_FMS);  /* (a * b) - c.  */
+  DONE;
+})
+
+(define_insn_and_split "*nfma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+        (neg:VFP16_HW
+         (fma:VFP16_HW
+          (match_operand:VFP16_HW 1 "vsx_register_operand")
+          (match_operand:VFP16_HW 2 "vsx_register_operand")
+          (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                      operands[3], FP16_NFMA);  /* - ((a * b) + c).  */
+  DONE;
+})
+
+(define_insn_and_split "*nfms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+        (neg:VFP16_HW
+         (fma:VFP16_HW
+          (match_operand:VFP16_HW 1 "vsx_register_operand")
+          (match_operand:VFP16_HW 2 "vsx_register_operand")
+          (neg:VFP16_HW
+           (match_operand:VFP16_HW 3 "vsx_register_operand")))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                      operands[3], FP16_NFMS);  /* - ((a * b) - c).  */
+  DONE;
+})
+
+
 
 ;; If we do multiple __bfloat16 operations, between the first and
 ;; second operation, GCC will want to convert the first operation from
diff --git a/gcc/config/rs6000/rs6000-protos.h
b/gcc/config/rs6000/rs6000-protos.h index 3c4d21299e1a..ee96e0e50a0e 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -263,6 +263,8 @@ extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *); /* Optimize bfloat16 and float16 operations. */ enum fp16_operation { FP16_BINARY, /* Bfloat16/float16 binary op. */ + FP16_ABS_BINARY, /* abs (binary op). */ + FP16_NEG_BINARY, /* - (binary op). */ FP16_FMA, /* (a * b) + c. */ FP16_FMS, /* (a * b) - c. */ FP16_NFMA, /* - ((a * b) + c). */
