[gcc(refs/users/meissner/heads/work222-float)] Add unary, fma 16-bit floating point vector optimization.

Michael Meissner via Gcc-cvs Fri, 17 Oct 2025 10:05:29 -0700

https://gcc.gnu.org/g:5fc9771c32162e44542a798f0f4ccd4ee9f4a8ba


commit 5fc9771c32162e44542a798f0f4ccd4ee9f4a8ba
Author: Michael Meissner <[email protected]>
Date:   Fri Oct 17 13:04:46 2025 -0400

    Add unary, fma 16-bit floating point vector optimization.
    
    2025-10-17  Michael Meissner  <[email protected]>
    
    gcc/
    
            * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): Add support for
            vectorizing unary and fma 16-bit floating point.
            (fp16_vectorization): Likewise.
            * config/rs6000/float16.md (neg<mode>2, VFP16_HW iterator): Likewise.
            (*xor<mode>3, VFP16_HW iterator): Likewise.
            (abs<mode>2, VFP16_HW iterator): Likewise.
            (*andc<mode>3, VFP16_HW iterator): Likewise.
            (*neg_<fp16_names><mode>3): Likewise.
            (*abs_<fp16_names><mode>3): Likewise.
            (fma<mode>4): Likewise.
            (*fms<mode>4): Likewise.
            (*nfma<mode>4): Likewise.
            (*nfms<mode>4): Likewise.
            * config/rs6000/rs6000-protos.h (FP16_ABS_BINARY): Likewise.
            (FP16_NEG_BINARY): Likewise.

Diff:
---
 gcc/config/rs6000/float16.cc      |  27 +++++
 gcc/config/rs6000/float16.md      | 202 ++++++++++++++++++++++++++++++++++++++
 gcc/config/rs6000/rs6000-protos.h |   2 +
 3 files changed, 231 insertions(+)

diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc
index 3fb61e9e4621..80f580388e57 100644
--- a/gcc/config/rs6000/float16.cc
+++ b/gcc/config/rs6000/float16.cc
@@ -88,6 +88,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
       n_opts = 3;
       break;
 
+    case FP16_ABS_BINARY:
+    case FP16_NEG_BINARY:
     default:
       gcc_unreachable ();
     }
@@ -174,6 +176,8 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
       }
       break;
 
+    case FP16_ABS_BINARY:
+    case FP16_NEG_BINARY:
     default:
       gcc_unreachable ();
     }
@@ -212,6 +216,7 @@ fp16_vectorization (enum rtx_code icode,
 {
   gcc_assert (can_create_pseudo_p ());
 
+  enum rtx_code unary_op = UNKNOWN;
   machine_mode result_mode = GET_MODE (result);
   rtx op_orig[3] = { op1, op2, op3 };
   rtx op_hi[3];
@@ -226,6 +231,16 @@ fp16_vectorization (enum rtx_code icode,
       n_opts = 2;
       break;
 
+    case FP16_NEG_BINARY:
+      n_opts = 2;
+      unary_op = NEG;
+      break;
+
+    case FP16_ABS_BINARY:
+      n_opts = 2;
+      unary_op = ABS;
+      break;
+
     case FP16_FMA:
     case FP16_FMS:
     case FP16_NFMA:
@@ -274,6 +289,8 @@ fp16_vectorization (enum rtx_code icode,
   switch (subtype)
     {
     case FP16_BINARY:
+    case FP16_NEG_BINARY:
+    case FP16_ABS_BINARY:
       emit_insn (gen_rtx_SET (result_hi,
                              gen_rtx_fmt_ee (icode, V4SFmode,
                                              op_hi[0],
@@ -322,6 +339,16 @@ fp16_vectorization (enum rtx_code icode,
       gcc_unreachable ();
     }
 
+  /* Add any unary operator modifications.  */
+  if (unary_op != UNKNOWN)
+    {
+      emit_insn (gen_rtx_SET (result_hi,
+                             gen_rtx_fmt_e (unary_op, V4SFmode, result_hi)));
+
+      emit_insn (gen_rtx_SET (result_lo,
+                             gen_rtx_fmt_e (unary_op, V4SFmode, result_lo)));
+    }
+
   /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector.  */
   if (result_mode == V8HFmode)
     emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index 1ea070c4486e..cb684a41ec47 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -706,6 +706,104 @@
 })
 
 ;; Add vectorization support for 16-bit floating point.
+
+;; Negate vector bfloat16/float16
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+       (neg:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+       (match_dup 3))
+   (set (match_dup 0)
+       (xor:VFP16_HW (match_dup 1)
+                     (match_dup 2)))]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  REAL_VALUE_TYPE dconst;
+
+  gcc_assert (real_from_string (&dconst, "-0.0") == 0);
+
+  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
+  rtvec v = rtvec_alloc (8);
+
+  for (size_t i = 0; i < 8; i++)
+  RTVEC_ELT (v, i) = neg0;
+
+  rtx vneg0 = gen_rtx_CONST_VECTOR (<MODE>mode, v);
+  if (!TARGET_PREFIXED)
+    vneg0 = force_const_mem (<MODE>mode, vneg0);
+
+  operands[3] = vneg0;
+}
+  [(set_attr "type" "veclogical")
+   (set_attr "length" "16")])
+
+;; XOR used to negate a 16-bit floating point type
+
+(define_insn "*xor<mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+       (xor:VFP16_HW (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
+                     (match_operand:VFP16_HW 2 "vsx_register_operand" "wa")))]
+  ""
+  "xxlxor %x0,%x1,%x2"
+  [(set_attr "type" "veclogical")])
+
+;; 16-bit floating point vector absolute value
+
+(define_insn_and_split "abs<mode>2"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand" "=wa")
+       (abs:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:VFP16_HW 2 "=&wa"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+       (match_dup 3))
+   (set (match_dup 0)
+       (and:VFP16_HW (match_dup 1)
+                     (not:VFP16_HW (match_dup 2))))]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  REAL_VALUE_TYPE dconst;
+
+  gcc_assert (real_from_string (&dconst, "-0.0") == 0);
+
+  rtx neg0 = const_double_from_real_value (dconst, <VEC_base>mode);
+  rtvec v = rtvec_alloc (8);
+
+  for (size_t i = 0; i < 8; i++)
+  RTVEC_ELT (v, i) = neg0;
+
+  rtx vneg0 = gen_rtx_CONST_VECTOR (<MODE>mode, v);
+  if (!TARGET_PREFIXED)
+    vneg0 = force_const_mem (<MODE>mode, vneg0);
+
+  operands[3] = vneg0;
+}
+  [(set_attr "type" "veclogical")
+   (set_attr "length" "16")])
+
+;; ANDC used to clear the sign bit of a 16-bit floating point type
+;; for absolute value.
+
+(define_insn "*andc<mode>3"
+  [(set (match_operand:VFP16_HW 0 "gpc_reg_operand" "=wa")
+       (and:VFP16_HW (match_operand:VFP16_HW 1 "gpc_reg_operand" "wa")
+                     (not:VFP16_HW
+                      (match_operand:VFP16_HW 2 "gpc_reg_operand" "wa"))))]
+  ""
+  "xxlandc %x0,%x1,%x2"
+  [(set_attr "type" "veclogical")])
+
 ;; Binary operators being vectorized.
 (define_insn_and_split "<fp16_names><mode>3"
   [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
@@ -722,6 +820,110 @@
   DONE;
 })
 
+;; Negative of binary operators being vectorized.
+(define_insn_and_split "*neg_<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (neg:VFP16_HW
+        (FP16_BINARY_OP:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
+                     NULL_RTX, FP16_NEG_BINARY);
+  DONE;
+})
+
+;; Absolute value of binary operators being vectorized.
+(define_insn_and_split "*abs_<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (abs:VFP16_HW
+        (FP16_BINARY_OP:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2],
+                     NULL_RTX, FP16_ABS_BINARY);
+  DONE;
+})
+
+;; FMA operations being vectorized.
+(define_insn_and_split "fma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (fma:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")
+        (match_operand:VFP16_HW 3 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_FMA);
+  DONE;
+})
+
+(define_insn_and_split "*fms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (fma:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")
+        (neg:VFP16_HW
+         (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_FMS);
+  DONE;
+})
+
+(define_insn_and_split "*nfma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (neg:VFP16_HW
+        (fma:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")
+         (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_NFMA);
+  DONE;
+})
+
+(define_insn_and_split "*nfms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (neg:VFP16_HW
+        (fma:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")
+         (neg:VFP16_HW
+          (match_operand:VFP16_HW 3 "vsx_register_operand")))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_NFMS);
+  DONE;
+})
+
+
 
 ;; If we do multiple __bfloat16 operations, between the first and
 ;; second operation, GCC will want to convert the first operation from
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 3c4d21299e1a..ee96e0e50a0e 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -263,6 +263,8 @@ extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
 /* Optimize bfloat16 and float16 operations.  */
 enum fp16_operation {
   FP16_BINARY,                         /* Bfloat16/float16 binary op.  */
+  FP16_ABS_BINARY,                     /* abs (binary op).  */
+  FP16_NEG_BINARY,                     /* - (binary op).  */
   FP16_FMA,                            /* (a * b) + c.  */
   FP16_FMS,                            /* (a * b) - c.  */
   FP16_NFMA,                           /* - ((a * b) + c).  */

[gcc(refs/users/meissner/heads/work222-float)] Add unary, fma 16-bit floating point vector optimization.

Reply via email to