/* X86_TUNE_PREFER_BCST_FROM_INTEGER: Enable broadcast from integer for
   128/256/512-bit vectors.  If disabled, the move will be done by
   broadcast/load from the constant pool.

   broadcast from integer:
      mov    $0xa,%eax
      vmovd  %eax,%xmm0
      vpbroadcastd %xmm0,%xmm0

   broadcast/load from constant pool:
      vpbroadcastd CST.0(%rip), %xmm0  */

The tune is on by default.

gcc/ChangeLog:

        PR target/123631
        * config/i386/i386-expand.cc (ix86_vector_duplicate_value):
        Don't force CONST_INT to reg when !TARGET_PREFER_BCST_FROM_INTEGER,
        force it to mem instead.
        * config/i386/i386.h (TARGET_PREFER_BCST_FROM_INTEGER): New macro.
        * config/i386/x86-tune.def
        (X86_TUNE_PREFER_BCST_FROM_INTEGER): New tune.
---
 gcc/config/i386/i386-expand.cc | 17 +++++++++++++----
 gcc/config/i386/i386.h         |  3 +++
 gcc/config/i386/x86-tune.def   | 15 +++++++++++++++
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index d6525ddcdd0..a82bb4399c9 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -17361,12 +17361,21 @@ ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
       machine_mode innermode = GET_MODE_INNER (mode);
       rtx reg;
 
-      /* If that fails, force VAL into a register.  */
+      /* If that fails, force VAL into a register or mem.  */
 
       start_sequence ();
-      reg = force_reg (innermode, val);
-      if (GET_MODE (reg) != innermode)
-       reg = gen_lowpart (innermode, reg);
+
+      if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
+         && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
+         && GET_MODE_BITSIZE(mode) >= 128)
+       reg = validize_mem (force_const_mem (innermode, val));
+      else
+       {
+         reg = force_reg (innermode, val);
+         if (GET_MODE (reg) != innermode)
+           reg = gen_lowpart (innermode, reg);
+       }
+
       SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
       seq = end_sequence ();
       if (seq)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 71bacc22052..888edfed88f 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -409,6 +409,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
        ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES_FROM_VEC]
 #define TARGET_INTER_UNIT_CONVERSIONS \
        ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS]
+#define TARGET_PREFER_BCST_FROM_INTEGER \
+  ix86_tune_features[X86_TUNE_PREFER_BCST_FROM_INTEGER]
+
 #define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
 #define TARGET_SCHEDULE                ix86_tune_features[X86_TUNE_SCHEDULE]
 #define TARGET_USE_BT          ix86_tune_features[X86_TUNE_USE_BT]
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index a1944620daf..53cf1a19433 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -488,6 +488,21 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
 DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
           ~(m_AMDFAM10 | m_BDVER))
 
+/* X86_TUNE_PREFER_BCST_FROM_INTEGER: Enable broadcast from integer for
+   128/256/512-bit vectors.  If disabled, the move will be done by
+   broadcast/load from the constant pool.
+
+   broadcast from integer:
+      mov    $0xa,%eax
+      vmovd  %eax,%xmm0
+      vpbroadcastd %xmm0,%xmm0
+
+   broadcast/load from constant pool:
+      vpbroadcastd CST.0(%rip), %xmm0  */
+
+DEF_TUNE (X86_TUNE_PREFER_BCST_FROM_INTEGER, "prefer_bcst_from_integer",
+          m_ALL)
+
 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
    fp converts to destination register.  */
 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
-- 
2.34.1

Reply via email to