On Thu, Jan 22, 2026 at 6:37 PM Richard Biener <[email protected]> wrote:
>
> On Tue, 20 Jan 2026, liuhongt wrote:
>
> > /* X86_TUNE_PREFER_BCST_FROM_INTEGER: Enable broadcast from integer for
> >    128/256/512-bit vector, if disabled, the move will be done by
> >    broadcast/load from constant pool
> >
> >    broadcast from integer:
> >       mov    $0xa,%eax
> >       vmovd  %eax,%xmm0
> >       vpbroadcastd %xmm0,%xmm0
> >
> >    broadcast/load from constant pool:
> >       vpbroadcastd CST.0(%rip), %xmm0  */
> >
> > The tune is on by default.
>
> Thanks!
>
> I wonder if the broadcast/load from constant pool could be late
> split into the former when a GPR is available (and the tune
> indicates it's profitable).
Good point. Maybe the implementation of this tune could also be
changed to convert the load of a const_vector into a broadcast from
memory before RA, and then decide after RA whether to use a GPR to
avoid the load. I'm not sure whether doing this optimization after RA
would cause problems, though; I suspect there could be some degree of
conflict between it and RA.
>
> > gcc/ChangeLog:
> >
> >       PR target/123631
> >       * config/i386/i386-expand.cc (ix86_vector_duplicate_value):
> >       Don't force CONST_INT to reg when !TARGET_PREFER_BCST_FROM_INTEGER,
> >       force it to mem instead.
> >       * config/i386/i386.h (TARGET_PREFER_BCST_FROM_INTEGER): New macro.
> >       * config/i386/x86-tune.def
> >       (X86_TUNE_PREFER_BCST_FROM_INTEGER): New tune.
> > ---
> >  gcc/config/i386/i386-expand.cc | 17 +++++++++++++----
> >  gcc/config/i386/i386.h         |  3 +++
> >  gcc/config/i386/x86-tune.def   | 15 +++++++++++++++
> >  3 files changed, 31 insertions(+), 4 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index d6525ddcdd0..a82bb4399c9 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -17361,12 +17361,21 @@ ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
> >        machine_mode innermode = GET_MODE_INNER (mode);
> >        rtx reg;
> >
> > -      /* If that fails, force VAL into a register.  */
> > +      /* If that fails, force VAL into a register or mem.  */
> >
> >        start_sequence ();
> > -      reg = force_reg (innermode, val);
> > -      if (GET_MODE (reg) != innermode)
> > -     reg = gen_lowpart (innermode, reg);
> > +
> > +      if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
> > +       && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
> > +       && GET_MODE_BITSIZE(mode) >= 128)
> > +     reg = validize_mem (force_const_mem (innermode, val));
> > +      else
> > +     {
> > +       reg = force_reg (innermode, val);
> > +       if (GET_MODE (reg) != innermode)
> > +         reg = gen_lowpart (innermode, reg);
> > +     }
> > +
> >        SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
> >        seq = end_sequence ();
> >        if (seq)
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 71bacc22052..888edfed88f 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -409,6 +409,9 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
> >       ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES_FROM_VEC]
> >  #define TARGET_INTER_UNIT_CONVERSIONS \
> >       ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS]
> > +#define TARGET_PREFER_BCST_FROM_INTEGER \
> > +  ix86_tune_features[X86_TUNE_PREFER_BCST_FROM_INTEGER]
> > +
> >  #define TARGET_FOUR_JUMP_LIMIT       ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
> >  #define TARGET_SCHEDULE              ix86_tune_features[X86_TUNE_SCHEDULE]
> >  #define TARGET_USE_BT                ix86_tune_features[X86_TUNE_USE_BT]
> > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> > index a1944620daf..53cf1a19433 100644
> > --- a/gcc/config/i386/x86-tune.def
> > +++ b/gcc/config/i386/x86-tune.def
> > @@ -488,6 +488,21 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
> >  DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
> >            ~(m_AMDFAM10 | m_BDVER))
> >
> > +/* X86_TUNE_PREFER_BCST_FROM_INTEGER: Enable broadcast from integer for
> > +   128/256/512-bit vector, if disabled, the move will be done by
> > +   broadcast/load from constant pool
> > +
> > +   broadcast from integer:
> > +      mov    $0xa,%eax
> > +      vmovd  %eax,%xmm0
> > +      vpbroadcastd %xmm0,%xmm0
> > +
> > +   broadcast/load from constant pool:
> > +      vpbroadcastd CST.0(%rip), %xmm0  */
> > +
> > +DEF_TUNE (X86_TUNE_PREFER_BCST_FROM_INTEGER, "prefer_bcst_from_integer",
> > +          m_ALL)
> > +
> >  /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
> >     fp converts to destination register.  */
> >  DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
> >
>
> --
> Richard Biener <[email protected]>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)



-- 
BR,
Hongtao

Reply via email to