https://gcc.gnu.org/g:a512ee583cd1f2e8792a2556c300c95f78fc3086

commit a512ee583cd1f2e8792a2556c300c95f78fc3086
Author: Michael Meissner <[email protected]>
Date:   Wed Oct 1 19:15:43 2025 -0400

    Attempt to add bfloat16 operations.
    
    2025-10-01  Michael Meissner  <[email protected]>
    
    gcc/
    
            * config/rs6000/rs6000.cc (output_vec_const_move): Add support for
            16-bit floating point and vector 16-bit floating point modes.
            (reg_offset_addressing_ok_p): Likewise.
            (rs6000_legitimate_offset_address_p): Likewise.
            (legitimate_lo_sum_address_p): Likewise.
            (rs6000_const_vec): Likewise.
            (rs6000_emit_move): Likewise.
            * config/rs6000/rs6000.h (TARGET_BFLOAT16_HW): New macro.
            (TARGET_FLOAT16_HW): Likewise.
            (FP16_HW_SCALAR_MODE_P): Likewise.
            * config/rs6000/rs6000.md (UNSPEC_XVCVBF16SPN_BF): New unspec.
            (UNSPEC_XVCVSPBF16_BF): Likewise.
            (UNSPEC_XXSPLTW_BF): Likewise.
            (FP16_HW): Use TARGET_BFLOAT16_HW and TARGET_FLOAT16_HW.
            (BF_OPS): New code iterator.
            (BF_OPS_NAME): New code attribute.
            (<BF_OPS_NAME>bf3): New insns for bfloat16 arithmetic operations.
            (xxspltw_bf): New insn.
            (xvcvbf16spn_bf): Likewise.
            (xvcvspbf16_bf): Likewise.
            (extendhf<mode>2): Use TARGET_FLOAT16_HW.
            (trunc<mode>hf2): Likewise.
            (neg<mode>2, FP16 iterator): New insns.
            (xor<mode>3, FP16 iterator): Likewise.
            (extendbf<mode>2): Use TARGET_BFLOAT16_HW.
            (trunc<mode>bf2): Likewise.

Diff:
---
 gcc/config/rs6000/rs6000.cc |  27 ++++++++
 gcc/config/rs6000/rs6000.h  |  10 +++
 gcc/config/rs6000/rs6000.md | 162 ++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 192 insertions(+), 7 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index a6aef3e9be38..4a0eb26f41aa 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -6881,6 +6881,8 @@ output_vec_const_move (rtx *operands)
          return "vspltisw %0,%1";
 
        case E_V8HImode:
+       case E_V8HFmode:
+       case E_V8BFmode:
          return "vspltish %0,%1";
 
        case E_V16QImode:
@@ -8747,6 +8749,8 @@ reg_offset_addressing_ok_p (machine_mode mode)
     {
     case E_V16QImode:
     case E_V8HImode:
+    case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V4SImode:
     case E_V2DFmode:
@@ -8765,6 +8769,13 @@ reg_offset_addressing_ok_p (machine_mode mode)
        return mode_supports_dq_form (mode);
       break;
 
+      /* For 16-bit floating point types, do not allow offset addressing, since
+        it is assumed that most of the use will be in vector registers, and we
+        only have reg+reg addressing for 16-bit modes.  */
+    case E_BFmode:
+    case E_HFmode:
+      return false;
+
       /* The vector pair/quad types support offset addressing if the
         underlying vectors support offset addressing.  */
     case E_OOmode:
@@ -9055,6 +9066,13 @@ rs6000_legitimate_offset_address_p (machine_mode mode, rtx x,
   extra = 0;
   switch (mode)
     {
+      /* For 16-bit floating point types, do not allow offset addressing, since
+        it is assumed that most of the use will be in vector registers, and we
+        only have reg+reg addressing for 16-bit modes.  */
+    case E_BFmode:
+    case E_HFmode:
+      return false;
+
     case E_DFmode:
     case E_DDmode:
     case E_DImode:
@@ -9156,6 +9174,11 @@ macho_lo_sum_memory_operand (rtx x, machine_mode mode)
 static bool
 legitimate_lo_sum_address_p (machine_mode mode, rtx x, int strict)
 {
+  /* For 16-bit floating point types, do not allow offset addressing, since
+     it is assumed that most of the use will be in vector registers, and we
+     only have reg+reg addressing for 16-bit modes.  */
+  if (FP16_SCALAR_MODE_P (mode))
+    return false;
   if (GET_CODE (x) != LO_SUM)
     return false;
   if (!REG_P (XEXP (x, 0)))
@@ -10852,6 +10875,8 @@ rs6000_const_vec (machine_mode mode)
       subparts = 4;
       break;
     case E_V8HImode:
+    case E_V8HFmode:
+    case E_V8BFmode:
       subparts = 8;
       break;
     case E_V16QImode:
@@ -11307,6 +11332,8 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
 
     case E_V16QImode:
     case E_V8HImode:
+    case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V4SImode:
     case E_V2DFmode:
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 2c98d1b98493..51b205c480b4 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -343,11 +343,21 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
    || ((MODE) == TDmode)                                               \
    || (!TARGET_FLOAT128_TYPE && FLOAT128_IEEE_P (MODE)))
 
+/* Is there hardware conversion support for each 16-bit floating point type?  */
+#define TARGET_BFLOAT16_HW     (TARGET_BFLOAT16 && TARGET_POWER10)
+#define TARGET_FLOAT16_HW      (TARGET_FLOAT16 && TARGET_POWER9)
+
 /* Is this a valid 16-bit scalar floating point mode?  */
 #define FP16_SCALAR_MODE_P(MODE)                                       \
   (((MODE) == HFmode && TARGET_FLOAT16)                                        \
    || ((MODE) == BFmode && TARGET_BFLOAT16))
 
+/* Is this a valid 16-bit scalar floating point mode that also has hardware
+   conversion support (see TARGET_FLOAT16_HW and TARGET_BFLOAT16_HW)?  */
+#define FP16_HW_SCALAR_MODE_P(MODE)                                    \
+  (((MODE) == HFmode && TARGET_FLOAT16_HW)                             \
+   || ((MODE) == BFmode && TARGET_BFLOAT16_HW))
+
 /* Return true for floating point that does not use a vector register.  */
 #define SCALAR_FLOAT_MODE_NOT_VECTOR_P(MODE)                           \
   (SCALAR_FLOAT_MODE_P (MODE) && !FLOAT128_VECTOR_P (MODE))
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 86687187137a..774b058a8e18 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -174,6 +174,9 @@
    UNSPEC_FMAX
    UNSPEC_FMIN
    UNSPEC_V8BF_SHIFT_LEFT_32BIT
+   UNSPEC_XVCVBF16SPN_BF
+   UNSPEC_XVCVSPBF16_BF
+   UNSPEC_XXSPLTW_BF
   ])
 
 ;;
@@ -866,13 +869,24 @@
 
 ;; Mode iterator for 16-bit floating modes on machines with hardware
 ;; support.
-(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16 && TARGET_POWER10")
-                              (HF "TARGET_FLOAT16 && TARGET_POWER9")])
+(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW")
+                              (HF "TARGET_FLOAT16_HW")])
 
 ;; Mode iterator for floating point modes other than SF/DFmode that we
 ;; convert to/from _Float16 (HFmode) via DFmode.
 (define_mode_iterator FP16_CONVERT [TF KF IF SD DD TD])
 
+;; Code iterator giving the basic arithmetic operations that are provided
+;; for bfloat16 floating point.
+(define_code_iterator BF_OPS [plus minus mult div])
+
+;; Code attribute that gives the standard insn name prefix for each bfloat16
+;; operation.  The operations themselves are done via a V4SF vector.
+(define_code_attr BF_OPS_NAME [(plus  "add")
+                              (minus "sub")
+                              (mult  "mul")
+                              (div   "div")])
+
 (include "darwin.md")
 
 ;; Start with fixed-point load and store insns.  Here we put only the more
@@ -5865,6 +5879,100 @@
   "xxsel %x0,%x4,%x3,%x1"
   [(set_attr "type" "vecmove")])
 
+
+;; Bfloat16 floating point operations.  We convert the 16-bit scalar to a
+;; V4SF vector, do the operation, and then convert the value back to
+;; 16-bit format.  We only care about the 2nd element, which has the
+;; scalar value in it.  For plus, minus, and mult the other 3 elements
+;; could be 0, which would allow combining a load (which sets the other
+;; bits to 0) with the conversion to vector.  For divide, the divisor
+;; must not be 0, so a splat is needed to guarantee that we are not
+;; dividing by 0.  At present the splat is used for all 4 operations.
+
+(define_insn_and_split "<BF_OPS_NAME>bf3"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (BF_OPS:BF (match_operand:BF 1 "vsx_register_operand" "wa")
+                  (match_operand:BF 2 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:V4SF 3 "=&wa"))
+   (clobber (match_scratch:V4SF 4 "=&wa"))
+   (clobber (match_scratch:V4SF 5 "=&wa"))]
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx tmp0 = operands[3];
+  rtx tmp1 = operands[4];
+  rtx tmp2 = operands[5];
+
+  /* Any scratch that is still a SCRATCH rtx at split time is replaced with
+     a new V4SFmode pseudo register.  */
+  if (GET_CODE (tmp0) == SCRATCH)
+    tmp0 = gen_reg_rtx (V4SFmode);
+
+  if (GET_CODE (tmp1) == SCRATCH)
+    tmp1 = gen_reg_rtx (V4SFmode);
+
+  if (GET_CODE (tmp2) == SCRATCH)
+    tmp2 = gen_reg_rtx (V4SFmode);
+
+  /* Convert operand1 to V4SFmode format.  */
+  emit_insn (gen_xxspltw_bf (tmp1, op1));
+  emit_insn (gen_xvcvbf16spn_bf (tmp1, tmp1));
+
+  /* Convert operand2 to V4SFmode format.  */
+  emit_insn (gen_xxspltw_bf (tmp2, op2));
+  emit_insn (gen_xvcvbf16spn_bf (tmp2, tmp2));
+
+  /* Do the operation in V4SFmode.  */
+  emit_insn (gen_<BF_OPS_NAME>v4sf3 (tmp0, tmp1, tmp2));
+
+  /* Convert V4SF result back to scalar mode.  */
+  emit_insn (gen_xvcvspbf16_bf (op0, tmp0));
+  DONE;
+}
+  [(set_attr "type" "vecperm")
+   (set_attr "length" "24")])
+
+;; Duplicate word 1 of the register holding a BF value so it can be used
+;; for xvcvbf16spn.  Because xvcvbf16spn only uses the even elements, we
+;; can use xxspltw instead of vspltw.
+
+(define_insn "xxspltw_bf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:BF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_XXSPLTW_BF))]
+  "TARGET_BFLOAT16_HW"
+  "xxspltw %x0,%x1,1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert a bfloat16 floating point scalar, which has already been
+;; splatted by xxspltw_bf, to V4SFmode.
+
+(define_insn "xvcvbf16spn_bf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_XVCVBF16SPN_BF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvbf16spn %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert a V4SFmode vector back to a 16-bit floating point scalar.  We
+;; only care about the 2nd V4SFmode element, which is the element that the
+;; 16-bit scalar (4th element) was converted into in order to do the
+;; operation.
+
+(define_insn "xvcvspbf16_bf"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (unspec:BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                  UNSPEC_XVCVSPBF16_BF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvspbf16 %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
 
 ;; Convert IEEE 16-bit floating point to/from other floating point modes.
 
@@ -5872,7 +5980,7 @@
   [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa")
        (float_extend:SFDF
         (match_operand:HF 1 "vsx_register_operand" "wa")))]
-  "TARGET_FLOAT16 && TARGET_POWER9"
+  "TARGET_FLOAT16_HW"
   "xscvhpdp %x0,%x1"
   [(set_attr "type" "fpsimple")])
 
@@ -5880,10 +5988,50 @@
   [(set (match_operand:HF 0 "vsx_register_operand" "=wa")
        (float_truncate:HF
         (match_operand:SFDF 1 "vsx_register_operand" "wa")))]
-  "TARGET_FLOAT16 && TARGET_POWER9"
+  "TARGET_FLOAT16_HW"
   "xscvdphp %x0,%x1"
   [(set_attr "type" "fpsimple")])
 
+;; Negate 16-bit floating point by XOR with -0.0.  We only do this on
+;; power10, since we can easily load up -0.0 via XXSPLTIW.
+
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:FP16 0 "register_operand" "=wa,wr")
+       (neg:FP16 (match_operand:FP16 1 "register_operand" "wa,wr")))
+   (clobber (match_scratch:FP16 2 "=&wa,&r"))]
+  "TARGET_POWER10 && TARGET_PREFIXED"
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+       (match_dup 3))
+   (set (match_dup 0)
+       (xor:FP16 (match_dup 1)
+                 (match_dup 2)))]
+{
+  REAL_VALUE_TYPE dconst;
+
+  /* Keep the call out of gcc_assert; the assert may not evaluate it.  */
+  int ok = real_from_string (&dconst, "-0.0");
+  gcc_assert (ok == 0);
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+  operands[3] = const_double_from_real_value (dconst, <MODE>mode);
+}
+  [(set_attr "type" "veclogical,integer")
+   (set_attr "length" "16")])
+
+;; XOR used by neg<mode>2 to negate a 16-bit floating point type.
+
+(define_insn "xor<mode>3"
+  [(set (match_operand:FP16 0 "register_operand" "=wa,wr")
+       (xor:FP16 (match_operand:FP16 1 "register_operand" "wa,wr")
+                 (match_operand:FP16 2 "register_operand" "wa,wr")))]
+  "TARGET_POWER10 && TARGET_PREFIXED"
+  "@
+   xxlxor %x0,%x1,%x2
+   xor %0,%1,%2"
+  [(set_attr "type" "veclogical,integer")])
+
 ;; Convert BFmode to SFmode/DFmode.
 ;; 3 instructions are generated:
 ;;     VSPLTH          -- duplicate BFmode into all elements
@@ -5894,7 +6042,7 @@
        (float_extend:SFDF
         (match_operand:BF 1 "vsx_register_operand" "v")))
    (clobber (match_scratch:V8BF 2 "=v"))]
-  "TARGET_BFLOAT16 && TARGET_POWER10"
+  "TARGET_BFLOAT16_HW"
   "#"
   "&& 1"
   [(pc)]
@@ -5930,7 +6078,7 @@
   [(set (match_operand:V8BF 0 "register_operand" "=wa")
         (unspec:V8BF [(match_operand:BF 1 "register_operand" "wa")]
                      UNSPEC_V8BF_SHIFT_LEFT_32BIT))]
-  "TARGET_BFLOAT16"
+  "TARGET_BFLOAT16_HW"
   "xxsldwi %x0,%x1,%x1,1"
   [(set_attr "type" "vecperm")])
 
@@ -5944,7 +6092,7 @@
        (float_truncate:BF
         (match_operand:SFDF 1 "vsx_register_operand" "wa")))
    (clobber (match_scratch:V4SF 2 "=wa"))]
-  "TARGET_BFLOAT16 && TARGET_POWER10"
+  "TARGET_BFLOAT16_HW"
   "#"
   "&& 1"
   [(pc)]

Reply via email to