https://gcc.gnu.org/g:d095370c890d2d51479de908bd6e5d342214612f

commit d095370c890d2d51479de908bd6e5d342214612f
Author: Michael Meissner <[email protected]>
Date:   Mon Nov 3 19:54:47 2025 -0500

    Add changes from future patch submission.
    
    2025-11-03  Michael Meissner  <[email protected]>
    
    gcc/
    
            * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): Move to be
            after fp16_vectorization.
        * config/rs6000/rs6000-protos.h (bfloat16_operation_as_v4sf):
        Likewise.

Diff:
---
 gcc/config/rs6000/float16.cc      | 285 +++++++++++++++++++-------------------
 gcc/config/rs6000/rs6000-protos.h |   4 +-
 2 files changed, 144 insertions(+), 145 deletions(-)

diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc
index b2a389270136..2c7b6278a16a 100644
--- a/gcc/config/rs6000/float16.cc
+++ b/gcc/config/rs6000/float16.cc
@@ -42,49 +42,43 @@
 #include "common/common-target.h"
 #include "rs6000-internal.h"
 
-/* Expand a bfloat16 scalar floating point operation:
+/* Expand a 16-bit vector operation:
 
    ICODE:   Operation to perform.
    RESULT:  Result of the operation.
    OP1:     Input operand1.
    OP2:     Input operand2.
    OP3:     Input operand3 or NULL_RTX.
-   SUBTYPE: Describe the operation.
-
-   The operation is done as a V4SFmode vector operation.  This is because
-   converting BFmode from a scalar BFmode to SFmode to do the operation and
-   back again takes quite a bit of time.  GCC will only generate the native
-   operation if -Ofast is used.  The float16.md code that calls this function
-   adds various combine operations to do the operation in V4SFmode instead of
-   SFmode.  */
+   SUBTYPE: Describe the operation.  */
        
 void
-bfloat16_operation_as_v4sf (enum rtx_code icode,
-                           rtx result,
-                           rtx op1,
-                           rtx op2,
-                           rtx op3,
-                           enum fp16_operation subtype)
+fp16_vectorization (enum rtx_code icode,
+                   rtx result,
+                   rtx op1,
+                   rtx op2,
+                   rtx op3,
+                   enum fp16_operation subtype)
 {
   gcc_assert (can_create_pseudo_p ());
 
-  rtx result_v4sf = gen_reg_rtx (V4SFmode);
-  rtx ops_orig[3] = { op1, op2, op3 };
-  rtx ops_v4sf[3];
+  machine_mode result_mode = GET_MODE (result);
+  rtx op_orig[3] = { op1, op2, op3 };
+  rtx op_hi[3];
+  rtx op_lo[3];
+  rtx result_hi;
+  rtx result_lo;
   size_t n_opts;
 
   switch (subtype)
     {
     case FP16_BINARY:
       n_opts = 2;
-      gcc_assert (op3 == NULL_RTX);
       break;
 
     case FP16_FMA:
     case FP16_FMS:
     case FP16_NFMA:
     case FP16_NFMS:
-      gcc_assert (icode == FMA);
       n_opts = 3;
       break;
 
@@ -92,65 +86,52 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
       gcc_unreachable ();
     }
 
+  /* Allocate 2 temporaries for the results and the input operands.  */
+  result_hi = gen_reg_rtx (V4SFmode);
+  result_lo = gen_reg_rtx (V4SFmode);
+
   for (size_t i = 0; i < n_opts; i++)
     {
-      rtx op = ops_orig[i];
-      rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode);
-
-      gcc_assert (op != NULL_RTX);
-
-      /* Remove truncation/extend added.  */
-      if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE)
-       op = XEXP (op, 0);
+      gcc_assert (op_orig[i] != NULL_RTX);
+      op_hi[i] = gen_reg_rtx (V4SFmode);       /* high register.  */
+      op_lo[i] = gen_reg_rtx (V4SFmode);       /* low register.  */
 
-      /* Convert operands to V4SFmode format.  We use SPLAT for registers to
-        get the value into the upper 32-bits.  We can use XXSPLTW to splat
-        words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the
-        odd half-words, and XXSPLTW can operate on all VSX registers instead
-        of just the Altivec registers.  Using SPLAT instead of a shift also
-        insure that other bits are not a signalling NaN.  If we are using
-        XXSPLTIW or XXSPLTIB to load the constant the other bits are
-        duplicated.  */
+      rtx interleave_hi = gen_reg_rtx (result_mode);
+      rtx interleave_lo = gen_reg_rtx (result_mode);
+      rtx orig = op_orig[i];
 
-      if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode))
-       emit_move_insn (tmp, CONST0_RTX (V4SFmode));
+      rs6000_expand_interleave (interleave_hi, orig, orig, !BYTES_BIG_ENDIAN);
+      rs6000_expand_interleave (interleave_lo, orig, orig,  BYTES_BIG_ENDIAN);
 
-      else if (GET_MODE (op) == BFmode)
+      if (result_mode == V8HFmode)
        {
-         emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op)));
-         emit_insn (gen_xvcvbf16spn_bf (tmp, tmp));
+         emit_insn (gen_xvcvhpsp_v8hf (op_hi[i], interleave_hi));
+         emit_insn (gen_xvcvhpsp_v8hf (op_lo[i], interleave_lo));
        }
 
-      else if (GET_MODE (op) == SFmode)
+      else if (result_mode == V8BFmode)
        {
-         if (GET_CODE (op) == CONST_DOUBLE)
-           {
-             rtvec v = rtvec_alloc (4);
-
-             for (size_t i = 0; i < 4; i++)
-               RTVEC_ELT (v, i) = op;
-
-             emit_insn (gen_rtx_SET (tmp,
-                                     gen_rtx_CONST_VECTOR (V4SFmode, v)));
-           }
-
-         else
-           emit_insn (gen_vsx_splat_v4sf (tmp,
-                                          force_reg (SFmode, op)));
+         emit_insn (gen_xvcvbf16spn_v8bf (op_hi[i], interleave_hi));
+         emit_insn (gen_xvcvbf16spn_v8bf (op_lo[i], interleave_lo));
        }
 
       else
        gcc_unreachable ();
     }
 
-  /* Do the operation in V4SFmode.  */
+  /* Do 2 sets of V4SFmode operations.  */
   switch (subtype)
     {
     case FP16_BINARY:
-      emit_insn (gen_rtx_SET (result_v4sf,
+      emit_insn (gen_rtx_SET (result_hi,
                              gen_rtx_fmt_ee (icode, V4SFmode,
-                                             ops_v4sf[0],
-                                             ops_v4sf[1])));
+                                             op_hi[0],
+                                             op_hi[1])));
+
+      emit_insn (gen_rtx_SET (result_lo,
+                             gen_rtx_fmt_ee (icode, V4SFmode,
+                                             op_lo[0],
+                                             op_lo[1])));
       break;
 
     case FP16_FMA:
@@ -158,19 +139,31 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
     case FP16_NFMA:
     case FP16_NFMS:
       {
-       rtx op1 = ops_v4sf[0];
-       rtx op2 = ops_v4sf[1];
-       rtx op3 = ops_v4sf[2];
+       rtx op1_hi = op_hi[0];
+       rtx op2_hi = op_hi[1];
+       rtx op3_hi = op_hi[2];
+
+       rtx op1_lo = op_lo[0];
+       rtx op2_lo = op_lo[1];
+       rtx op3_lo = op_lo[2];
 
        if (subtype == FP16_FMS || subtype == FP16_NFMS)
-         op3 = gen_rtx_NEG (V4SFmode, op3);
+         {
+           op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
+           op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
+         }
 
-       rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3);
+       rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
+       rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
 
        if (subtype == FP16_NFMA || subtype == FP16_NFMS)
-         op_fma = gen_rtx_NEG (V4SFmode, op_fma);
+         {
+           op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
+           op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
+         }
 
-       emit_insn (gen_rtx_SET (result_v4sf, op_fma));
+       emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
+       emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
       }
       break;
 
@@ -178,58 +171,62 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
       gcc_unreachable ();
     }
 
-  /* Convert V4SF result back to scalar mode.  */
-  if (GET_MODE (result) == BFmode)
-    emit_insn (gen_xvcvspbf16_bf (result, result_v4sf));
+  /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector.  */
+  if (result_mode == V8HFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
 
-  else if (GET_MODE (result) == SFmode)
-    {
-      rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
-      emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element));
-    }
+  else if (result_mode == V8BFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf_v8bf (result, result_hi, result_lo));
 
   else
     gcc_unreachable ();
-}
 
+  return;
+}
 
-/* Expand a 16-bit vector operation:
+/* Expand a bfloat16 scalar floating point operation:
 
    ICODE:   Operation to perform.
    RESULT:  Result of the operation.
    OP1:     Input operand1.
    OP2:     Input operand2.
    OP3:     Input operand3 or NULL_RTX.
-   SUBTYPE: Describe the operation.  */
+   SUBTYPE: Describe the operation.
+
+   The operation is done as a V4SFmode vector operation.  This is because
+   converting a scalar BFmode value to SFmode to do the operation and
+   back again takes quite a bit of time.  GCC will only generate the native
+   operation if -Ofast is used.  The float16.md code that calls this function
+   adds various combine operations to do the operation in V4SFmode instead of
+   SFmode.  */
        
 void
-fp16_vectorization (enum rtx_code icode,
-                   rtx result,
-                   rtx op1,
-                   rtx op2,
-                   rtx op3,
-                   enum fp16_operation subtype)
+bfloat16_operation_as_v4sf (enum rtx_code icode,
+                           rtx result,
+                           rtx op1,
+                           rtx op2,
+                           rtx op3,
+                           enum fp16_operation subtype)
 {
   gcc_assert (can_create_pseudo_p ());
 
-  machine_mode result_mode = GET_MODE (result);
-  rtx op_orig[3] = { op1, op2, op3 };
-  rtx op_hi[3];
-  rtx op_lo[3];
-  rtx result_hi;
-  rtx result_lo;
+  rtx result_v4sf = gen_reg_rtx (V4SFmode);
+  rtx ops_orig[3] = { op1, op2, op3 };
+  rtx ops_v4sf[3];
   size_t n_opts;
 
   switch (subtype)
     {
     case FP16_BINARY:
       n_opts = 2;
+      gcc_assert (op3 == NULL_RTX);
       break;
 
     case FP16_FMA:
     case FP16_FMS:
     case FP16_NFMA:
     case FP16_NFMS:
+      gcc_assert (icode == FMA);
       n_opts = 3;
       break;
 
@@ -237,52 +234,65 @@ fp16_vectorization (enum rtx_code icode,
       gcc_unreachable ();
     }
 
-  /* Allocate 2 temporaries for the results and the input operands.  */
-  result_hi = gen_reg_rtx (V4SFmode);
-  result_lo = gen_reg_rtx (V4SFmode);
-
   for (size_t i = 0; i < n_opts; i++)
     {
-      gcc_assert (op_orig[i] != NULL_RTX);
-      op_hi[i] = gen_reg_rtx (V4SFmode);       /* high register.  */
-      op_lo[i] = gen_reg_rtx (V4SFmode);       /* low register.  */
+      rtx op = ops_orig[i];
+      rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode);
 
-      rtx interleave_hi = gen_reg_rtx (result_mode);
-      rtx interleave_lo = gen_reg_rtx (result_mode);
-      rtx orig = op_orig[i];
+      gcc_assert (op != NULL_RTX);
 
-      rs6000_expand_interleave (interleave_hi, orig, orig, !BYTES_BIG_ENDIAN);
-      rs6000_expand_interleave (interleave_lo, orig, orig,  BYTES_BIG_ENDIAN);
+      /* Remove any truncation/extension that was added.  */
+      if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE)
+       op = XEXP (op, 0);
 
-      if (result_mode == V8HFmode)
+      /* Convert operands to V4SFmode format.  We use SPLAT for registers to
+        get the value into the upper 32-bits.  We can use XXSPLTW to splat
+        words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the
+        odd half-words, and XXSPLTW can operate on all VSX registers instead
+        of just the Altivec registers.  Using SPLAT instead of a shift also
+        ensures that the other bits are not a signalling NaN.  If we are using
+        XXSPLTIW or XXSPLTIB to load the constant the other bits are
+        duplicated.  */
+
+      if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode))
+       emit_move_insn (tmp, CONST0_RTX (V4SFmode));
+
+      else if (GET_MODE (op) == BFmode)
        {
-         emit_insn (gen_xvcvhpsp_v8hf (op_hi[i], interleave_hi));
-         emit_insn (gen_xvcvhpsp_v8hf (op_lo[i], interleave_lo));
+         emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op)));
+         emit_insn (gen_xvcvbf16spn_bf (tmp, tmp));
        }
 
-      else if (result_mode == V8BFmode)
+      else if (GET_MODE (op) == SFmode)
        {
-         emit_insn (gen_xvcvbf16spn_v8bf (op_hi[i], interleave_hi));
-         emit_insn (gen_xvcvbf16spn_v8bf (op_lo[i], interleave_lo));
+         if (GET_CODE (op) == CONST_DOUBLE)
+           {
+             rtvec v = rtvec_alloc (4);
+
+             for (size_t i = 0; i < 4; i++)
+               RTVEC_ELT (v, i) = op;
+
+             emit_insn (gen_rtx_SET (tmp,
+                                     gen_rtx_CONST_VECTOR (V4SFmode, v)));
+           }
+
+         else
+           emit_insn (gen_vsx_splat_v4sf (tmp,
+                                          force_reg (SFmode, op)));
        }
 
       else
        gcc_unreachable ();
     }
 
-  /* Do 2 sets of V4SFmode operations.  */
+  /* Do the operation in V4SFmode.  */
   switch (subtype)
     {
     case FP16_BINARY:
-      emit_insn (gen_rtx_SET (result_hi,
-                             gen_rtx_fmt_ee (icode, V4SFmode,
-                                             op_hi[0],
-                                             op_hi[1])));
-
-      emit_insn (gen_rtx_SET (result_lo,
+      emit_insn (gen_rtx_SET (result_v4sf,
                              gen_rtx_fmt_ee (icode, V4SFmode,
-                                             op_lo[0],
-                                             op_lo[1])));
+                                             ops_v4sf[0],
+                                             ops_v4sf[1])));
       break;
 
     case FP16_FMA:
@@ -290,31 +300,19 @@ fp16_vectorization (enum rtx_code icode,
     case FP16_NFMA:
     case FP16_NFMS:
       {
-       rtx op1_hi = op_hi[0];
-       rtx op2_hi = op_hi[1];
-       rtx op3_hi = op_hi[2];
-
-       rtx op1_lo = op_lo[0];
-       rtx op2_lo = op_lo[1];
-       rtx op3_lo = op_lo[2];
+       rtx op1 = ops_v4sf[0];
+       rtx op2 = ops_v4sf[1];
+       rtx op3 = ops_v4sf[2];
 
        if (subtype == FP16_FMS || subtype == FP16_NFMS)
-         {
-           op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
-           op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
-         }
+         op3 = gen_rtx_NEG (V4SFmode, op3);
 
-       rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
-       rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
+       rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3);
 
        if (subtype == FP16_NFMA || subtype == FP16_NFMS)
-         {
-           op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
-           op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
-         }
+         op_fma = gen_rtx_NEG (V4SFmode, op_fma);
 
-       emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
-       emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
+       emit_insn (gen_rtx_SET (result_v4sf, op_fma));
       }
       break;
 
@@ -322,15 +320,16 @@ fp16_vectorization (enum rtx_code icode,
       gcc_unreachable ();
     }
 
-  /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector.  */
-  if (result_mode == V8HFmode)
-    emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
+  /* Convert V4SF result back to scalar mode.  */
+  if (GET_MODE (result) == BFmode)
+    emit_insn (gen_xvcvspbf16_bf (result, result_v4sf));
 
-  else if (result_mode == V8BFmode)
-    emit_insn (gen_vec_pack_trunc_v4sf_v8bf (result, result_hi, result_lo));
+  else if (GET_MODE (result) == SFmode)
+    {
+      rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
+      emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element));
+    }
 
   else
     gcc_unreachable ();
-
-  return;
 }
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 3c4d21299e1a..001dc1fc7f4b 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -269,10 +269,10 @@ enum fp16_operation {
   FP16_NFMS                            /* - ((a * b) - c).  */
 };
 
-extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx,
-                                       enum fp16_operation);
 extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx,
                                enum fp16_operation);
+extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx,
+                                       enum fp16_operation);
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE
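
For reference, both functions in the moved code implement the same
widen-operate-narrow model for 16-bit floats.  A bfloat16 value is the
upper 16 bits of an IEEE-754 binary32 (1 sign, 8 exponent, 7 mantissa
bits), so each element is widened to SFmode, the arithmetic is done
there, and the result is narrowed back.  Below is a minimal scalar
sketch of that model; the helper names (bf16_to_sf, sf_to_bf16,
bf16_add) are hypothetical, not names from the GCC sources, and plain
truncation stands in for whatever rounding the hardware narrowing
conversion (xvcvspbf16) performs.

#include <cstdint>
#include <cstring>

/* Widen: place the 16 bfloat16 bits in the high half of a 32-bit
   pattern; the low 16 mantissa bits become zero.  */
static float
bf16_to_sf (uint16_t bf)
{
  uint32_t bits = (uint32_t) bf << 16;
  float f;
  std::memcpy (&f, &bits, sizeof (f));
  return f;
}

/* Narrow: keep the upper 16 bits.  This truncates, where the real
   instruction may round.  */
static uint16_t
sf_to_bf16 (float f)
{
  uint32_t bits;
  std::memcpy (&bits, &f, sizeof (bits));
  return (uint16_t) (bits >> 16);
}

/* One lane of what fp16_vectorization expands for a V8BFmode PLUS:
   widen both inputs, add in SFmode, narrow the sum.  */
static uint16_t
bf16_add (uint16_t a, uint16_t b)
{
  return sf_to_bf16 (bf16_to_sf (a) + bf16_to_sf (b));
}

fp16_vectorization does this eight lanes at a time: it interleaves the
V8HFmode/V8BFmode input with itself into high and low halves, converts
each half to a V4SFmode vector (xvcvhpsp/xvcvbf16spn), performs the
operation on both halves, and packs the two V4SFmode results back into
one 16-bit vector via vec_pack_trunc.  bfloat16_operation_as_v4sf
instead splats the scalar across a V4SFmode vector, operates, and
converts or extracts the single result back out.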
