/* X86_TUNE_PREFER_BCST_FROM_INTEGER: Enable broadcast from integer for
128/256/512-bit vectors.  If disabled, the move will be done by
broadcast/load from the constant pool.
broadcast from integer:
mov $0xa,%eax
vmovd %eax,%xmm0
vpbroadcastd %xmm0,%xmm0
broadcast/load from constant pool:
vpbroadcastd CST.0(%rip), %xmm0 */
The tune is on by default.
gcc/ChangeLog:
PR target/123631
* config/i386/i386-expand.cc (ix86_vector_duplicate_value):
Don't force CONST_INT to reg if !TARGET_PREFER_BCST_FROM_INTEGER,
force it to mem instead.
* config/i386/i386.h (TARGET_PREFER_BCST_FROM_INTEGER): New macro.
* config/i386/x86-tune.def
(X86_TUNE_PREFER_BCST_FROM_INTEGER): New tune.
---
gcc/config/i386/i386-expand.cc | 17 +++++++++++++----
gcc/config/i386/i386.h | 3 +++
gcc/config/i386/x86-tune.def | 15 +++++++++++++++
3 files changed, 31 insertions(+), 4 deletions(-)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index d6525ddcdd0..a82bb4399c9 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -17361,12 +17361,21 @@ ix86_vector_duplicate_value (machine_mode mode, rtx
target, rtx val)
machine_mode innermode = GET_MODE_INNER (mode);
rtx reg;
- /* If that fails, force VAL into a register. */
+ /* If that fails, force VAL into a register or mem. */
start_sequence ();
- reg = force_reg (innermode, val);
- if (GET_MODE (reg) != innermode)
- reg = gen_lowpart (innermode, reg);
+
+ if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
+ && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
+ && GET_MODE_BITSIZE(mode) >= 128)
+ reg = validize_mem (force_const_mem (innermode, val));
+ else
+ {
+ reg = force_reg (innermode, val);
+ if (GET_MODE (reg) != innermode)
+ reg = gen_lowpart (innermode, reg);
+ }
+
SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
seq = end_sequence ();
if (seq)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 71bacc22052..888edfed88f 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -409,6 +409,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES_FROM_VEC]
#define TARGET_INTER_UNIT_CONVERSIONS \
ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS]
+#define TARGET_PREFER_BCST_FROM_INTEGER \
+ ix86_tune_features[X86_TUNE_PREFER_BCST_FROM_INTEGER]
+
#define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
#define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE]
#define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT]
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index a1944620daf..53cf1a19433 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -488,6 +488,21 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC,
"inter_unit_moves_from_vec",
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
~(m_AMDFAM10 | m_BDVER))
+/* X86_TUNE_PREFER_BCST_FROM_INTEGER: Enable broadcast from integer for
+ 128/256/512-bit vectors.  If disabled, the move will be done by
+ broadcast/load from the constant pool.
+
+ broadcast from integer:
+ mov $0xa,%eax
+ vmovd %eax,%xmm0
+ vpbroadcastd %xmm0,%xmm0
+
+ broadcast/load from constant pool:
+ vpbroadcastd CST.0(%rip), %xmm0 */
+
+DEF_TUNE (X86_TUNE_PREFER_BCST_FROM_INTEGER, "prefer_bcst_from_integer",
+ m_ALL)
+
/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
fp converts to destination register. */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,
"split_mem_opnd_for_fp_converts",
--
2.34.1