https://gcc.gnu.org/g:a512ee583cd1f2e8792a2556c300c95f78fc3086
commit a512ee583cd1f2e8792a2556c300c95f78fc3086 Author: Michael Meissner <[email protected]> Date: Wed Oct 1 19:15:43 2025 -0400 Attempt to add bfloat16 operations. 2025-10-01 Michael Meissner <[email protected]> gcc/ * config/rs6000/rs6000.cc (output_vec_const_move): Add support for 16-bit floating point and vector 16-bit floating point modes. (reg_offset_addressing_ok_p): Likewise. (rs6000_legitimate_offset_address_p): Likewise. (legitimate_lo_sum_address_p): Likewise. (rs6000_const_vec): Likewise. (rs6000_emit_move): Likewise. * config/rs6000/rs6000.h (TARGET_BFLOAT16_HW): New macro. (TARGET_FLOAT16_HW): Likewise. (FP16_HW_SCALAR_MODE_P): Likewise. * config/rs6000/rs6000.md (UNSPEC_XVCVBF16SPN_BF): New unspec. (UNSPEC_XVCVSPBF16_BF): Likewise. (UNSPEC_XXSPLTW_BF): Likewise. (FP16_HW): Use TARGET_BFLOAT16_HW and TARGET_FLOAT16_HW. (BF_OPS): New code iterator. (BF_OPS_NAME): New code attribute. (<BF_OPS_NAME>bf3): New insns for bfloat16 arithmetic operations. (xxspltw_bf): New insn. (xvcvbf16spn_bf): Likewise. (xvcvspbf16_bf): Likewise. (extendhf<mode>2): Use TARGET_FLOAT16_HW. (trunc<mode>hf2): Likewise. (neg<mode>2, FP16 iterator): New insns. (xor<mode>3, FP16 iterator): Likewise. (extendbf<mode>2): Use TARGET_BFLOAT16_HW. (trunc<mode>bf2): Likewise. 
Diff: --- gcc/config/rs6000/rs6000.cc | 27 ++++++++ gcc/config/rs6000/rs6000.h | 10 +++ gcc/config/rs6000/rs6000.md | 162 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 192 insertions(+), 7 deletions(-) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index a6aef3e9be38..4a0eb26f41aa 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -6881,6 +6881,8 @@ output_vec_const_move (rtx *operands) return "vspltisw %0,%1"; case E_V8HImode: + case E_V8HFmode: + case E_V8BFmode: return "vspltish %0,%1"; case E_V16QImode: @@ -8747,6 +8749,8 @@ reg_offset_addressing_ok_p (machine_mode mode) { case E_V16QImode: case E_V8HImode: + case E_V8HFmode: + case E_V8BFmode: case E_V4SFmode: case E_V4SImode: case E_V2DFmode: @@ -8765,6 +8769,13 @@ reg_offset_addressing_ok_p (machine_mode mode) return mode_supports_dq_form (mode); break; + /* For 16-bit floating point types, do not allow offset addressing, since + it is assumed that most of the use will be in vector registers, and we + only have reg+reg addressing for 16-bit modes. */ + case E_BFmode: + case E_HFmode: + return false; + /* The vector pair/quad types support offset addressing if the underlying vectors support offset addressing. */ case E_OOmode: @@ -9055,6 +9066,13 @@ rs6000_legitimate_offset_address_p (machine_mode mode, rtx x, extra = 0; switch (mode) { + /* For 16-bit floating point types, do not allow offset addressing, since + it is assumed that most of the use will be in vector registers, and we + only have reg+reg addressing for 16-bit modes. 
*/ + case E_BFmode: + case E_HFmode: + return false; + case E_DFmode: case E_DDmode: case E_DImode: @@ -9156,6 +9174,11 @@ macho_lo_sum_memory_operand (rtx x, machine_mode mode) static bool legitimate_lo_sum_address_p (machine_mode mode, rtx x, int strict) { + /* For 16-bit floating point types, do not allow offset addressing, since + it is assumed that most of the use will be in vector registers, and we + only have reg+reg addressing for 16-bit modes. */ + if (FP16_SCALAR_MODE_P (mode)) + return false; if (GET_CODE (x) != LO_SUM) return false; if (!REG_P (XEXP (x, 0))) @@ -10852,6 +10875,8 @@ rs6000_const_vec (machine_mode mode) subparts = 4; break; case E_V8HImode: + case E_V8HFmode: + case E_V8BFmode: subparts = 8; break; case E_V16QImode: @@ -11307,6 +11332,8 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode) case E_V16QImode: case E_V8HImode: + case E_V8HFmode: + case E_V8BFmode: case E_V4SFmode: case E_V4SImode: case E_V2DFmode: diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 2c98d1b98493..51b205c480b4 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -343,11 +343,21 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || ((MODE) == TDmode) \ || (!TARGET_FLOAT128_TYPE && FLOAT128_IEEE_P (MODE))) +/* Do we have conversion support in hardware for the 16-bit floating point? */ +#define TARGET_BFLOAT16_HW (TARGET_BFLOAT16 && TARGET_POWER10) +#define TARGET_FLOAT16_HW (TARGET_FLOAT16 && TARGET_POWER9) + /* Is this a valid 16-bit scalar floating point mode? */ #define FP16_SCALAR_MODE_P(MODE) \ (((MODE) == HFmode && TARGET_FLOAT16) \ || ((MODE) == BFmode && TARGET_BFLOAT16)) +/* Is this a valid 16-bit scalar floating point mode that has hardware + conversions? */ +#define FP16_HW_SCALAR_MODE_P(MODE) \ + (((MODE) == HFmode && TARGET_FLOAT16_HW) \ + || ((MODE) == BFmode && TARGET_BFLOAT16_HW)) + /* Return true for floating point that does not use a vector register. 
*/ #define SCALAR_FLOAT_MODE_NOT_VECTOR_P(MODE) \ (SCALAR_FLOAT_MODE_P (MODE) && !FLOAT128_VECTOR_P (MODE)) diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 86687187137a..774b058a8e18 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -174,6 +174,9 @@ UNSPEC_FMAX UNSPEC_FMIN UNSPEC_V8BF_SHIFT_LEFT_32BIT + UNSPEC_XVCVBF16SPN_BF + UNSPEC_XVCVSPBF16_BF + UNSPEC_XXSPLTW_BF ]) ;; @@ -866,13 +869,24 @@ ;; Mode iterator for 16-bit floating modes on machines with hardware ;; support. -(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16 && TARGET_POWER10") - (HF "TARGET_FLOAT16 && TARGET_POWER9")]) +(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW") + (HF "TARGET_FLOAT16_HW")]) ;; Mode iterator for floating point modes other than SF/DFmode that we ;; convert to/from _Float16 (HFmode) via DFmode. (define_mode_iterator FP16_CONVERT [TF KF IF SD DD TD]) +;; Code iterator giving the basic operations for bfloat16 floating point +;; operations +(define_code_iterator BF_OPS [plus minus mult div]) + +;; Code attribute that gives the standard name for the bfloat16 +;; operations done via V4SF vector +(define_code_attr BF_OPS_NAME [(plus "add") + (minus "sub") + (mult "mul") + (div "div")]) + (include "darwin.md") ;; Start with fixed-point load and store insns. Here we put only the more @@ -5865,6 +5879,100 @@ "xxsel %x0,%x4,%x3,%x1" [(set_attr "type" "vecmove")]) + +;; Bfloat16 floating point operations. We convert the 16-bit scalar to a +;; V4SF vector, do the operation, and then convert the value back to +;; 16-bit format. We only care about the 2nd element that the scalar +;; value in it. For plus, minus, and mult the other 3 elements can be +;; 0. This means we can combine a load (which sets the other bits to +;; 0) with the conversion to vector. For divide, the divisor must not +;; be 0, so we use a splat operation to guarantee that we are not +;; dividing by 0. 
+
+(define_insn_and_split "<BF_OPS_NAME>bf3"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+        (BF_OPS:BF (match_operand:BF 1 "vsx_register_operand" "wa")
+                   (match_operand:BF 2 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:V4SF 3 "=&wa"))
+   (clobber (match_scratch:V4SF 4 "=&wa"))
+   (clobber (match_scratch:V4SF 5 "=&wa"))]
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx tmp0 = operands[3];
+  rtx tmp1 = operands[4];
+  rtx tmp2 = operands[5];
+
+  /* Replace any scratch operand that is still a SCRATCH rtx with a newly
+     generated V4SFmode register.  */
+  if (GET_CODE (tmp0) == SCRATCH)
+    tmp0 = gen_reg_rtx (V4SFmode);
+
+  if (GET_CODE (tmp1) == SCRATCH)
+    tmp1 = gen_reg_rtx (V4SFmode);
+
+  if (GET_CODE (tmp2) == SCRATCH)
+    tmp2 = gen_reg_rtx (V4SFmode);
+
+  /* Convert operand1 to V4SFmode format.  */
+  emit_insn (gen_xxspltw_bf (tmp1, op1));
+  emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1));
+
+  /* Convert operand2 to V4SFmode format.  */
+  emit_insn (gen_xxspltw_bf (tmp2, op2));
+  emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2));
+
+  /* Do the operation in V4SFmode.  */
+  emit_insn (gen_<BF_OPS_NAME>v4sf3 (tmp0, tmp1, tmp2));
+
+  /* Convert V4SF result back to scalar mode.  */
+  emit_insn (gen_xvcvspbf16_bf (op0, tmp0));
+  DONE;
+}
+  [(set_attr "type" "vecperm")
+   (set_attr "length" "24")])
+
+;; Duplicate a BF value so it can be used for xvcvbf16spn.  Because
+;; xvcvbf16spn only uses the even elements, we can use xxspltw instead
+;; of vspltw.
+
+(define_insn "xxspltw_bf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+        (unspec:V4SF [(match_operand:BF 1 "vsx_register_operand" "wa")]
+                     UNSPEC_XXSPLTW_BF))]
+  "TARGET_BFLOAT16_HW"
+  "xxspltw %x0,%x1,1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert a bfloat16 floating point scalar that has been splatted to
+;; V4SFmode.
+ +(define_insn "xvcvbf16spn_bf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_XVCVBF16SPN_BF))] + "TARGET_BFLOAT16_HW" + "xvcvbf16spn %x0,%x1" + [(set_attr "type" "vecperm")]) + +;; Convert a V4SFmode vector back to 16-bit floating point scalar. We +;; only care about the 2nd V4SFmode element, which is the element we +;; converted the 16-bit scalar (4th element) to V4SFmode to do the +;; operation, and converted it back. + +(define_insn "xvcvspbf16_bf" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (unspec:BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_XVCVSPBF16_BF))] + "TARGET_BFLOAT16_HW" + "xvcvspbf16 %x0,%x1" + [(set_attr "type" "vecperm")]) + ;; Convert IEEE 16-bit floating point to/from other floating point modes. @@ -5872,7 +5980,7 @@ [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa") (float_extend:SFDF (match_operand:HF 1 "vsx_register_operand" "wa")))] - "TARGET_FLOAT16 && TARGET_POWER9" + "TARGET_FLOAT16_HW" "xscvhpdp %x0,%x1" [(set_attr "type" "fpsimple")]) @@ -5880,10 +5988,50 @@ [(set (match_operand:HF 0 "vsx_register_operand" "=wa") (float_truncate:HF (match_operand:SFDF 1 "vsx_register_operand" "wa")))] - "TARGET_FLOAT16 && TARGET_POWER9" + "TARGET_FLOAT16_HW" "xscvdphp %x0,%x1" [(set_attr "type" "fpsimple")]) +;; Negate 16-bit floating point by XOR with -0.0. We only do this on +;; power10, since we can easily load up -0.0 via XXSPLTIW. 
+ +(define_insn_and_split "neg<mode>2" + [(set (match_operand:FP16 0 "register_operand" "=wa,wr") + (neg:FP16 (match_operand:FP16 1 "register_operand" "wa,wr"))) + (clobber (match_scratch:FP16 2 "=&wa,&r"))] + "TARGET_POWER10 && TARGET_PREFIXED" + "#" + "&& 1" + [(set (match_dup 2) + (match_dup 3)) + (set (match_dup 0) + (xor:FP16 (match_dup 1) + (match_dup 2)))] +{ + REAL_VALUE_TYPE dconst; + + gcc_assert (real_from_string (&dconst, "-0.0") == 0); + + if (GET_CODE (operands[2]) == SCRATCH) + operands[2] = gen_reg_rtx (<MODE>mode); + + operands[3] = const_double_from_real_value (dconst, <MODE>mode); +} + [(set_attr "type" "veclogical,integer") + (set_attr "length" "16")]) + +;; XOR used to negate a 16-bit floating point type + +(define_insn "xor<mode>3" + [(set (match_operand:FP16 0 "register_operand" "=wa,wr") + (xor:FP16 (match_operand:FP16 1 "register_operand" "wa,wr") + (match_operand:FP16 2 "register_operand" "wa,wr")))] + "TARGET_POWER10 && TARGET_PREFIXED" + "@ + xxlxor %x0,%x1,%x2 + xor %0,%1,%2" + [(set_attr "type" "veclogical,integer")]) + ;; Convert BFmode to SFmode/DFmode. ;; 3 instructions are generated: ;; VSPLTH -- duplicate BFmode into all elements @@ -5894,7 +6042,7 @@ (float_extend:SFDF (match_operand:BF 1 "vsx_register_operand" "v"))) (clobber (match_scratch:V8BF 2 "=v"))] - "TARGET_BFLOAT16 && TARGET_POWER10" + "TARGET_BFLOAT16_HW" "#" "&& 1" [(pc)] @@ -5930,7 +6078,7 @@ [(set (match_operand:V8BF 0 "register_operand" "=wa") (unspec:V8BF [(match_operand:BF 1 "register_operand" "wa")] UNSPEC_V8BF_SHIFT_LEFT_32BIT))] - "TARGET_BFLOAT16" + "TARGET_BFLOAT16_HW" "xxsldwi %x0,%x1,%x1,1" [(set_attr "type" "vecperm")]) @@ -5944,7 +6092,7 @@ (float_truncate:BF (match_operand:SFDF 1 "vsx_register_operand" "wa"))) (clobber (match_scratch:V4SF 2 "=wa"))] - "TARGET_BFLOAT16 && TARGET_POWER10" + "TARGET_BFLOAT16_HW" "#" "&& 1" [(pc)]
