On Wed, Jan 21, 2026 at 9:24 AM liuhongt <[email protected]> wrote:
>
> /* X86_TUNE_PREFER_BCST_FROM_INTEGER: Enable broadcast from integer for
> 128/256/512-bit vectors.  If disabled, the move will be done by a
> broadcast/load from the constant pool.
>
> broadcast from integer:
> mov $0xa,%eax
> vmovd %eax,%xmm0
> vpbroadcastd %xmm0,%xmm0
>
> broadcast/load from constant pool:
> vpbroadcastd CST.0(%rip), %xmm0 */
>
> The tune is on by default.
I've tested the patch with the tune off; it passed bootstrap and
regression testing, except for 77 test cases that were originally
written to check broadcast from integer.
I also tested it with SPEC2017 on both P-core and E-core (with the tune
off); there's no significant performance impact.
By default, the tune is on, so the codegen should be the same.
Ready to upstream.
>
> gcc/ChangeLog:
>
> PR target/123631
> * config/i386/i386-expand.cc (ix86_vector_duplicate_value):
> Don't force CONST_INT to reg when !TARGET_PREFER_BCST_FROM_INTEGER,
> force it to mem instead.
> * config/i386/i386.h (TARGET_PREFER_BCST_FROM_INTEGER): New macro.
> * config/i386/x86-tune.def
> (X86_TUNE_PREFER_BCST_FROM_INTEGER): New tune.
> ---
> gcc/config/i386/i386-expand.cc | 17 +++++++++++++----
> gcc/config/i386/i386.h | 3 +++
> gcc/config/i386/x86-tune.def | 15 +++++++++++++++
> 3 files changed, 31 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index d6525ddcdd0..a82bb4399c9 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -17361,12 +17361,21 @@ ix86_vector_duplicate_value (machine_mode mode, rtx
> target, rtx val)
> machine_mode innermode = GET_MODE_INNER (mode);
> rtx reg;
>
> - /* If that fails, force VAL into a register. */
> + /* If that fails, force VAL into a register or mem. */
>
> start_sequence ();
> - reg = force_reg (innermode, val);
> - if (GET_MODE (reg) != innermode)
> - reg = gen_lowpart (innermode, reg);
> +
> + if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
> + && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
> + && GET_MODE_BITSIZE(mode) >= 128)
> + reg = validize_mem (force_const_mem (innermode, val));
> + else
> + {
> + reg = force_reg (innermode, val);
> + if (GET_MODE (reg) != innermode)
> + reg = gen_lowpart (innermode, reg);
> + }
> +
> SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
> seq = end_sequence ();
> if (seq)
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 71bacc22052..888edfed88f 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -409,6 +409,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES_FROM_VEC]
> #define TARGET_INTER_UNIT_CONVERSIONS \
> ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS]
> +#define TARGET_PREFER_BCST_FROM_INTEGER \
> + ix86_tune_features[X86_TUNE_PREFER_BCST_FROM_INTEGER]
> +
> #define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
> #define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE]
> #define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT]
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index a1944620daf..53cf1a19433 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -488,6 +488,21 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC,
> "inter_unit_moves_from_vec",
> DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
> ~(m_AMDFAM10 | m_BDVER))
>
> +/* X86_TUNE_PREFER_BCST_FROM_INTEGER: Enable broadcast from integer for
> + 128/256/512-bit vectors.  If disabled, the move will be done by a
> + broadcast/load from the constant pool.
> +
> + broadcast from integer:
> + mov $0xa,%eax
> + vmovd %eax,%xmm0
> + vpbroadcastd %xmm0,%xmm0
> +
> + broadcast/load from constant pool:
> + vpbroadcastd CST.0(%rip), %xmm0 */
> +
> +DEF_TUNE (X86_TUNE_PREFER_BCST_FROM_INTEGER, "prefer_bcst_from_integer",
> + m_ALL)
> +
> /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
> fp converts to destination register. */
> DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,
> "split_mem_opnd_for_fp_converts",
> --
> 2.34.1
>
--
BR,
Hongtao