Hi, this patch disables use of FMA in the matrix multiplication loop for generic (i.e. x86-64-v3) and zen4 tuning. I tested this on zen4 and on a Xeon Gold 6212U.
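For reference, the attached micro-benchmark is self-contained; building it as, say, mult.c and running something along the lines of gcc -O2 -march=x86-64-v3 mult.c followed by perf stat -r 10 ./a.out reproduces the kind of numbers below (the exact flags and repeat count of the runs quoted here are not recorded, so treat these commands as an approximation).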
For Intel this is neutral both on the matrix multiplication micro-benchmark (attached) and spec2k17, where the difference was within noise for Core. On Core the micro-benchmark runs as follows:

With FMA:

     578,500,241      cycles:u                 #    3.645 GHz              ( +-  0.12% )
     753,318,477      instructions:u           #    1.30  insn per cycle   ( +-  0.00% )
     125,417,701      branches:u               #  790.227 M/sec            ( +-  0.00% )

        0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )

No FMA:

     577,573,960      cycles:u                 #    3.514 GHz              ( +-  0.15% )
     878,318,479      instructions:u           #    1.52  insn per cycle   ( +-  0.00% )
     125,417,702      branches:u               #  763.035 M/sec            ( +-  0.00% )

        0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )

So the cycle count is unchanged, and the discrete multiply + add takes the same time as FMA. While on Zen:

With FMA:

       484875179      cycles:u                 #    3.599 GHz              ( +-  0.05% )  (82.11%)
       752031517      instructions:u           #    1.55  insn per cycle
       125106525      branches:u               #  928.712 M/sec            ( +-  0.03% )  (85.09%)
          128356      branch-misses:u          #    0.10% of all branches  ( +-  0.06% )  (83.58%)

No FMA:

       375875209      cycles:u                 #    3.592 GHz              ( +-  0.08% )  (80.74%)
       875725341      instructions:u           #    2.33  insn per cycle
       124903825      branches:u               #    1.194 G/sec            ( +-  0.04% )  (84.59%)

        0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )

The difference is that Core understands that fmadd does not need all three operands to start the computation, while Zen cores do not. Since this is a noticeable win on Zen and no loss on Core, it seems like a good default for generic. (An illustrative sketch of the two loop shapes is appended after the patch.)

I plan to commit the patch next week if there are no complaints.

Honza

#include <stdio.h>
#include <time.h>

#define SIZE 1000

float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];

void init(void)
{
   int i, j;
   for(i=0; i<SIZE; ++i)
   {
      for(j=0; j<SIZE; ++j)
      {
         a[i][j] = (float)i + j;
         b[i][j] = (float)i - j;
         c[i][j] = 0.0f;
      }
   }
}

void mult(void)
{
   int i, j, k;
   for(i=0; i<SIZE; ++i)
   {
      for(j=0; j<SIZE; ++j)
      {
         for(k=0; k<SIZE; ++k)
         {
            c[i][j] += a[i][k] * b[k][j];
         }
      }
   }
}

int main(void)
{
   clock_t s, e;
   init();
   s=clock();
   mult();
   e=clock();
   printf("mult took %10d clocks\n", (int)(e-s));
   return 0;
}

gcc/ChangeLog:

	* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
	X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and generic.

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 43fa9e8fd6d..74b03cbcc60 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -515,13 +515,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3
-	  | m_YONGFENG)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+	  | m_YONGFENG | m_GENERIC)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
-	  | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+	  | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
    smaller FMA chain.  */
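To make the dependency-chain argument concrete, here is a hand-written sketch of the two shapes of a vectorized reduction loop. This is illustrative AVX intrinsics code of my own, not actual compiler output; it assumes n is a multiple of 8 and leaves the final horizontal reduction of the 8 lanes to the caller.

#include <immintrin.h>

/* FMA variant: every vfmadd consumes the previous accumulator, so
   the loop-carried dependency chain pays the full FMA latency per
   iteration on cores that need all three operands up front.  */
__m256
dot_fma (const float *a, const float *b, int n)
{
  __m256 acc = _mm256_setzero_ps ();
  for (int i = 0; i < n; i += 8)
    acc = _mm256_fmadd_ps (_mm256_loadu_ps (a + i),
                           _mm256_loadu_ps (b + i), acc);
  return acc;
}

/* Split variant: the multiply depends only on the loads and can
   start before the previous accumulator is ready; only the add is
   on the loop-carried chain.  This is the shape that
   avoid_fma_chains/avoid_fma256_chains make the vectorizer prefer.  */
__m256
dot_mul_add (const float *a, const float *b, int n)
{
  __m256 acc = _mm256_setzero_ps ();
  for (int i = 0; i < n; i += 8)
    {
      __m256 prod = _mm256_mul_ps (_mm256_loadu_ps (a + i),
                                   _mm256_loadu_ps (b + i));
      acc = _mm256_add_ps (acc, prod);
    }
  return acc;
}

On Zen the second form trades one extra instruction per vector for a shorter loop-carried latency, which is consistent with the higher instruction count but much lower cycle count in the Zen numbers above; on Core, which starts the multiply part of an FMA as soon as the two products' inputs arrive, the two forms take the same number of cycles.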