On Sat, Sep 21, 2013 at 12:54 PM, Jan Hubicka <hubi...@ucw.cz> wrote: > Hi, > this is updated version of patch discussed at > http://gcc.gnu.org/ml/gcc-patches/2012-12/msg00841.html > > It makes CORE tuning to more follow the optimization guidelines. > In particular it removes some tuning flags for features I implemented years > back specifically for K7/K8 chips that ended up in Core tuning because > it was based on generic. Incrementally I plan to drop some of these from > generic, too. > > Compared to previous version of patch I left out INC_DEC change, even > though Core I7+ should resolve dependencies on partial flags correctly. > Optimization manual still seems to suggest to not use this: > > Assembly/Compiler Coding Rule 33. (M impact, H generality) > INC and DEC instructions should be replaced with ADD or SUB instructions, > because ADD and SUB overwrite all flags, whereas INC and DEC do not, therefore > creating false dependencies on earlier instructions that set the flags. > > Other change dropped is use_vector_fp_converts that seems to improve > Core performance.
I did not see this in your patch, but Wei has this tuning in this patch: http://gcc.gnu.org/ml/gcc-patches/2013-09/msg00884.html thanks, David > > I benchmarked the patch on SPEC2k and earlier it was benchmarked on 2k6 > and the performance difference seems in noise. It causes about 0.3% code > size reduction. Main motivation for the patch is to drop some codegen > oddities that do not make sense on modern chips. > > Bootstrapped/regtested x86_64-linux, will commit it shortly. > Honza > > * x86-tune.def (partial_reg_stall): Disable for CoreI7 and newer. > (sse_typeless_stores): Enable for core > (sse_load0_by_pxor): Likewise. > (four_jump_limit): Disable for core. > (pad_returns): Likewise. > (avoid_vector_decode): Likewise. > (fuse_cmp_and_branch): Enable for cores. > * i386.c (x86_accumulate_outgoing_args): Disable for cores. > Index: x86-tune.def > =================================================================== > *** x86-tune.def (revision 202812) > --- x86-tune.def (working copy) > *************** DEF_TUNE (X86_TUNE_MOVX, "movx", > *** 52,58 **** > and can happen in caller/callee saving sequences. */ > DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO) > DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", > ! m_CORE_ALL | m_GENERIC) > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall > * on 16-bit immediate moves into memory on Core2 and Corei7. */ > DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC) > --- 52,58 ---- > and can happen in caller/callee saving sequences. */ > DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO) > DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", > ! m_CORE2 | m_GENERIC) > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall > * on 16-bit immediate moves into memory on Core2 and Corei7. 
*/ > DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC) > *************** DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INS > *** 125,132 **** > maintain just lower part of scalar values in proper format leaving the > upper part undefined. */ > DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8) > ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", > m_AMD_MULTIPLE) > ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | > m_P4_NOCONA) > DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall", > m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | > m_GENERIC) > DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move", > --- 125,134 ---- > maintain just lower part of scalar values in proper format leaving the > upper part undefined. */ > DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8) > ! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", > ! m_AMD_MULTIPLE | m_CORE_ALL) > ! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", > ! m_PPRO | m_P4_NOCONA | m_CORE_ALL) > DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall", > m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | > m_GENERIC) > DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move", > *************** DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSION > *** 144,150 **** > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more > than 4 branch instructions in the 16 byte window. */ > DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", > ! m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM| m_AMD_MULTIPLE > | m_GENERIC) > DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", > m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE > --- 146,152 ---- > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more > than 4 branch instructions in the 16 byte window. */ > DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", > ! 
m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE > | m_GENERIC) > DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", > m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE > *************** DEF_TUNE (X86_TUNE_USE_BT, "use_bt", > *** 154,166 **** > DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec", > ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC)) > DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns", > ! m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC) > DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM) > DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants", > m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE > | m_ATHLON_K8 | m_GENERIC) > DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", > ! m_CORE_ALL | m_K8 | m_GENERIC) > /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode > and SImode multiply, but 386 and 486 do HImode multiply faster. */ > DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul", > --- 156,168 ---- > DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec", > ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC)) > DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns", > ! m_AMD_MULTIPLE | m_GENERIC) > DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM) > DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants", > m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE > | m_ATHLON_K8 | m_GENERIC) > DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", > ! m_K8 | m_GENERIC) > /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode > and SImode multiply, but 386 and 486 do HImode multiply faster. */ > DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul", > *************** DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, > *** 193,199 **** > /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction > with a subsequent conditional jump instruction into a single > compare-and-branch uop. */ > ! 
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER) > /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag > will impact LEA instruction selection. */ > DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM) > --- 195,201 ---- > /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction > with a subsequent conditional jump instruction into a single > compare-and-branch uop. */ > ! DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER | > m_CORE_ALL) > /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag > will impact LEA instruction selection. */ > DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM) > Index: i386.c > =================================================================== > *** i386.c (revision 202812) > --- i386.c (working copy) > *************** static unsigned int initial_ix86_arch_fe > *** 1899,1905 **** > }; > > static const unsigned int x86_accumulate_outgoing_args > ! = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | > m_GENERIC; > > static const unsigned int x86_arch_always_fancy_math_387 > = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | > m_AMD_MULTIPLE | m_GENERIC; > --- 1899,1905 ---- > }; > > static const unsigned int x86_accumulate_outgoing_args > ! = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC; > > static const unsigned int x86_arch_always_fancy_math_387 > = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | > m_AMD_MULTIPLE | m_GENERIC;