On Wed, Jul 14, 2021 at 10:34 AM liuhongt <hongtao....@intel.com> wrote: > > By optimizing vector movement to broadcast in ix86_expand_vector_move > during pass_expand, pass_reload/LRA can automatically generate an avx512 > embedded broadcast, pass_cpb is not needed. > > Considering that in the absence of avx512f, broadcast from memory is > still slightly faster than loading the entire memory, so always enable > broadcast. > > benchmark: > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast > > The performance diff > > strategy : cycles > memory : 1046611188 > memory : 1255420817 > memory : 1044720793 > memory : 1253414145 > average : 1097868397 > > broadcast : 1044430688 > broadcast : 1044477630 > broadcast : 1253554603 > broadcast : 1044561934 > average : 1096756213 > > But however broadcast has larger size. > > the size diff > > size broadcast.o > text data bss dec hex filename > 137 0 0 137 89 broadcast.o > > size memory.o > text data bss dec hex filename > 115 0 0 115 73 memory.o > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,} > > gcc/ChangeLog: > > * config/i386/i386-expand.c > (ix86_broadcast_from_integer_constant): Rename to .. > (ix86_broadcast_from_constant): .. this, and extend it to > handle float mode. > (ix86_expand_vector_move): Extend to float mode. > * config/i386/i386-features.c > (replace_constant_pool_with_broadcast): Remove. > (remove_partial_avx_dependency_gate): Ditto. > (constant_pool_broadcast): Ditto. > (class pass_constant_pool_broadcast): Ditto. > (make_pass_constant_pool_broadcast): Ditto. > (remove_partial_avx_dependency): Adjust gate. > * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast. > * config/i386/i386-protos.h > (make_pass_constant_pool_broadcast): Remove. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase. > --- > gcc/config/i386/i386-expand.c | 29 +++- > gcc/config/i386/i386-features.c | 157 +----------------- > gcc/config/i386/i386-passes.def | 1 - > gcc/config/i386/i386-protos.h | 1 - > .../gcc.target/i386/fuse-caller-save-xmm.c | 2 +- > 5 files changed, 26 insertions(+), 164 deletions(-) > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > index 69ea79e6123..ba870145acd 100644 > --- a/gcc/config/i386/i386-expand.c > +++ b/gcc/config/i386/i386-expand.c > @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[]) > emit_insn (gen_rtx_SET (op0, op1)); > } > > +/* OP is a memref of CONST_VECTOR, return scalar constant mem > + if CONST_VECTOR is a vec_duplicate, else return NULL. */ > static rtx > -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) > +ix86_broadcast_from_constant (machine_mode mode, rtx op) > { > int nunits = GET_MODE_NUNITS (mode); > if (nunits < 2) > @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode mode, > rtx op) > > /* Don't use integer vector broadcast if we can't move from GPR to SSE > register directly. */ > - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) > + if (!TARGET_INTER_UNIT_MOVES_TO_VEC > + && INTEGRAL_MODE_P (mode)) > return nullptr; > > /* Convert CONST_VECTOR to a non-standard SSE constant integer > @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode > mode, rtx op) > if (!(TARGET_AVX2 > || (TARGET_AVX > && (GET_MODE_INNER (mode) == SImode > - || GET_MODE_INNER (mode) == DImode))) > + || GET_MODE_INNER (mode) == DImode)) > + || FLOAT_MODE_P (mode)) > || standard_sse_constant_p (op, mode)) > return nullptr; > > - /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */ > - if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) > + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. > + We can still put 64-bit integer constant in memory when > + avx512 embed broadcast is available. */ > + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT > + && (!TARGET_AVX512F > + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) > return nullptr; > > if (GET_MODE_INNER (mode) == TImode) > @@ -561,17 +569,20 @@ ix86_expand_vector_move (machine_mode mode, rtx > operands[]) > > if (can_create_pseudo_p () > && GET_MODE_SIZE (mode) >= 16 > - && GET_MODE_CLASS (mode) == MODE_VECTOR_INT > + && VECTOR_MODE_P (mode) > && (MEM_P (op1) > && SYMBOL_REF_P (XEXP (op1, 0)) > && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) > { > - rtx first = ix86_broadcast_from_integer_constant (mode, op1); > + rtx first = ix86_broadcast_from_constant (mode, op1); > if (first != nullptr) > { > /* Broadcast to XMM/YMM/ZMM register from an integer > - constant. */ > - op1 = ix86_gen_scratch_sse_rtx (mode); > + constant or scalar mem. */ > + op1 = gen_reg_rtx (mode); I've changed ix86_gen_scratch_sse_rtx to gen_reg_rtx to let LRA/reload make the final decision for register usage, would that make sense?
w/o that, hard register used here will prevent LRA/reload combine (set op1 (vec_duplicate: mem)) and (add (op1, op2)) to (add (op1, vec_duplicate: mem)) > + if (FLOAT_MODE_P (mode) > + || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) > + first = force_const_mem (GET_MODE_INNER (mode), first); > bool ok = ix86_expand_vector_init_duplicate (false, mode, > op1, first); > gcc_assert (ok); > diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c > index cbd430a2ecf..d9c6652d1c9 100644 > --- a/gcc/config/i386/i386-features.c > +++ b/gcc/config/i386/i386-features.c > @@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area > (gcc::context *ctxt) > return new pass_insert_endbr_and_patchable_area (ctxt); > } > > -/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x > - with embedded broadcast. i.e.transform > - > - vpaddq .LC0(%rip), %zmm0, %zmm0 > - ret > - .LC0: > - .quad 3 > - .quad 3 > - .quad 3 > - .quad 3 > - .quad 3 > - .quad 3 > - .quad 3 > - .quad 3 > - > - to > - > - vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0 > - ret > - .LC0: > - .quad 3 */ > -static void > -replace_constant_pool_with_broadcast (rtx_insn *insn) > -{ > - subrtx_ptr_iterator::array_type array; > - FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL) > - { > - rtx *loc = *iter; > - rtx x = *loc; > - rtx broadcast_mem, vec_dup, constant, first; > - machine_mode mode; > - > - /* Constant pool. */ > - if (!MEM_P (x) > - || !SYMBOL_REF_P (XEXP (x, 0)) > - || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))) > - continue; > - > - /* Const vector. */ > - mode = GET_MODE (x); > - if (!VECTOR_MODE_P (mode)) > - return; > - constant = get_pool_constant (XEXP (x, 0)); > - if (GET_CODE (constant) != CONST_VECTOR) > - return; > - > - /* There could be some rtx like > - (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) > - but with "*.LC1" refer to V2DI constant vector. */ > - if (GET_MODE (constant) != mode) > - { > - constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); > - if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR) > - return; > - } > - first = XVECEXP (constant, 0, 0); > - > - for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) > - { > - rtx tmp = XVECEXP (constant, 0, i); > - /* Vector duplicate value. */ > - if (!rtx_equal_p (tmp, first)) > - return; > - } > - > - /* Replace with embedded broadcast. */ > - broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first); > - vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem); > - validate_change (insn, loc, vec_dup, 0); > - > - /* At most 1 memory_operand in an insn. */ > - return; > - } > -} > - > /* At entry of the nearest common dominator for basic blocks with > conversions, generate a single > vxorps %xmmN, %xmmN, %xmmN > @@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void) > if (!NONDEBUG_INSN_P (insn)) > continue; > > - /* Handle AVX512 embedded broadcast here to save compile time. */ > - if (TARGET_AVX512F) > - replace_constant_pool_with_broadcast (insn); > - > set = single_set (insn); > if (!set) > continue; > @@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void) > return 0; > } > > -static bool > -remove_partial_avx_dependency_gate () > -{ > - return (TARGET_AVX > - && TARGET_SSE_PARTIAL_REG_DEPENDENCY > - && TARGET_SSE_MATH > - && optimize > - && optimize_function_for_speed_p (cfun)); > -} > - > namespace { > > const pass_data pass_data_remove_partial_avx_dependency = > @@ -2419,7 +2330,11 @@ public: > /* opt_pass methods: */ > virtual bool gate (function *) > { > - return remove_partial_avx_dependency_gate (); > + return (TARGET_AVX > + && TARGET_SSE_PARTIAL_REG_DEPENDENCY > + && TARGET_SSE_MATH > + && optimize > + && optimize_function_for_speed_p (cfun)); > } > > virtual unsigned int execute (function *) > @@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency (gcc::context > *ctxt) > return new pass_remove_partial_avx_dependency (ctxt); > } > > -/* For const vector having one duplicated value, there's no need to put > - whole vector in the constant pool when target supports embedded > broadcast. */ > -static unsigned int > -constant_pool_broadcast (void) > -{ > - timevar_push (TV_MACH_DEP); > - rtx_insn *insn; > - > - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) > - { > - if (INSN_P (insn)) > - replace_constant_pool_with_broadcast (insn); > - } > - timevar_pop (TV_MACH_DEP); > - return 0; > -} > - > -namespace { > - > -const pass_data pass_data_constant_pool_broadcast = > -{ > - RTL_PASS, /* type */ > - "cpb", /* name */ > - OPTGROUP_NONE, /* optinfo_flags */ > - TV_MACH_DEP, /* tv_id */ > - 0, /* properties_required */ > - 0, /* properties_provided */ > - 0, /* properties_destroyed */ > - 0, /* todo_flags_start */ > - TODO_df_finish, /* todo_flags_finish */ > -}; > - > -class pass_constant_pool_broadcast : public rtl_opt_pass > -{ > -public: > - pass_constant_pool_broadcast (gcc::context *ctxt) > - : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt) > - {} > - > - /* opt_pass methods: */ > - virtual bool gate (function *) > - { > - /* Return false if rpad pass gate is true. > - replace_constant_pool_with_broadcast is called > - from both this pass and rpad pass. */ > - return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ()); > - } > - > - virtual unsigned int execute (function *) > - { > - return constant_pool_broadcast (); > - } > -}; // class pass_cpb > - > -} // anon namespace > - > -rtl_opt_pass * > -make_pass_constant_pool_broadcast (gcc::context *ctxt) > -{ > - return new pass_constant_pool_broadcast (ctxt); > -} > - > /* This compares the priority of target features in function DECL1 > and DECL2. It returns positive value if DECL1 is higher priority, > negative value if DECL2 is higher priority and 0 if they are the > diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def > index 44df00e94ac..29baf8acd0b 100644 > --- a/gcc/config/i386/i386-passes.def > +++ b/gcc/config/i386/i386-passes.def > @@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see > INSERT_PASS_BEFORE (pass_shorten_branches, 1, > pass_insert_endbr_and_patchable_area); > > INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); > - INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast); > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 51376fcc454..07ac02aff69 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -395,4 +395,3 @@ extern rtl_opt_pass > *make_pass_insert_endbr_and_patchable_area > (gcc::context *); > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > (gcc::context *); > -extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *); > diff --git a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > index 4deff93c1e8..b0d3dc38a0c 100644 > --- a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > +++ b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > @@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16))); > static v2df __attribute__((noinline)) > bar (v2df a) > { > - return a + (v2df){ 3.0, 3.0 }; > + return a + (v2df){ 3.0, 4.0 }; > } > > v2df __attribute__((noinline)) > -- > 2.18.1 > -- BR, Hongtao