Hi, this patch enables X86_TUNE_SSE_TYPELESS_STORES for generic. This is because it is currently enabled for both Bulldozer and Cores, and I think in fact all modern chips honor it. I am not sure why it is off for Atom/Slm and Bobcat.
Second change is to enable X86_TUNE_SSE_LOAD0_BY_PXOR for Bulldozer, Bobcat and Generic. This flag is already on for Intel chips as recomended by the optimization manual. Amdfam15 manual has similar recommendation but only for avx variant. I believe it applies for pxor too and I verified it does not cause any regressions. I am now benchmarking the next two flags I dropped FIXME on - the multiply by immediate that are really specific for K8-based chips and I think it got enabled elsewhere by a mistake. Honza Bootstrapped/regtested x86_64-linux, will commit it shortly. * config/i386/x86-tune.def: Enable X86_TUNE_SSE_TYPELESS_STORES for generic, enable X86_TUNE_SSE_LOAD0_BY_PXOR for Bulldozer, Bobcat and generic. * gcc.target/i386/avx256-unaligned-store-3.c: Update template for tuning change. * gcc.target/i386/avx256-unaligned-store-1.c: Likewise. * gcc.target/i386/pr49168-1.c: Likewise. * gcc.target/i386/pr49002-2.c: Likewise. Index: testsuite/gcc.target/i386/avx256-unaligned-store-3.c =================================================================== --- testsuite/gcc.target/i386/avx256-unaligned-store-3.c (revision 203380) +++ testsuite/gcc.target/i386/avx256-unaligned-store-3.c (working copy) @@ -18,5 +18,5 @@ avx_test (void) } /* { dg-final { scan-assembler-not "avx_storeupd256" } } */ -/* { dg-final { scan-assembler "vmovupd.*\\*movv2df_internal/3" } } */ +/* { dg-final { scan-assembler "vmovups.*\\*movv2df_internal/3" } } */ /* { dg-final { scan-assembler "vextractf128" } } */ Index: testsuite/gcc.target/i386/pr49168-1.c =================================================================== --- testsuite/gcc.target/i386/pr49168-1.c (revision 203380) +++ testsuite/gcc.target/i386/pr49168-1.c (working copy) @@ -2,7 +2,8 @@ /* { dg-do compile } */ /* { dg-options "-O2 -msse2 -mtune=generic" } */ /* { dg-final { scan-assembler-not "movdqa\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */ -/* { dg-final { scan-assembler "movdqu\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */ +/* { dg-final { 
scan-assembler-not "movaps\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */ +/* { dg-final { scan-assembler "movups\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */ void flt128_va (void *mem, __float128 d) Index: testsuite/gcc.target/i386/pr49002-2.c =================================================================== --- testsuite/gcc.target/i386/pr49002-2.c (revision 203380) +++ testsuite/gcc.target/i386/pr49002-2.c (working copy) @@ -11,4 +11,5 @@ void foo(const __m128d from, __m256d *to /* Ensure we store ymm, not xmm. */ /* { dg-final { scan-assembler-not "vmovapd\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */ -/* { dg-final { scan-assembler "vmovapd\[\t \]*%ymm\[0-9\]\+,\[^,\]*" } } */ +/* { dg-final { scan-assembler-not "vmovaps\[\t \]*%xmm\[0-9\]\+,\[^,\]*" } } */ +/* { dg-final { scan-assembler "vmovaps\[\t \]*%ymm\[0-9\]\+,\[^,\]*" } } */ Index: testsuite/gcc.target/i386/avx256-unaligned-store-2.c =================================================================== --- testsuite/gcc.target/i386/avx256-unaligned-store-2.c (revision 203380) +++ testsuite/gcc.target/i386/avx256-unaligned-store-2.c (working copy) @@ -24,5 +24,5 @@ avx_test (void) } /* { dg-final { scan-assembler-not "avx_storedqu256" } } */ -/* { dg-final { scan-assembler "vmovdqu.*\\*movv16qi_internal/3" } } */ +/* { dg-final { scan-assembler "vmovups.*\\*movv16qi_internal/3" } } */ /* { dg-final { scan-assembler "vextract.128" } } */ Index: config/i386/x86-tune.def =================================================================== --- config/i386/x86-tune.def (revision 203380) +++ config/i386/x86-tune.def (working copy) @@ -221,16 +221,14 @@ DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INS upper part undefined. */ DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8) -/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. - FIXME: Shall we enable it for generic? */ +/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. 
*/ DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", - m_AMD_MULTIPLE | m_CORE_ALL) + m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC) /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to - xorps/xorpd and other variants. - FIXME: Shall we enable it buldozers and for generic? */ + xorps/xorpd and other variants. */ DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", - m_PPRO | m_P4_NOCONA | m_CORE_ALL) + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_GENERIC) /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by full sized loads. */