Hi,
this patch disables the use of FMA in the matrix multiplication loop for generic
(for x86-64-v3) and zen4.  I tested this on zen4 and a Xeon Gold 6212U.

For Intel this is neutral both on the matrix multiplication microbenchmark
(attached) and spec2k17 where the difference was within noise for Core.

On core the micro-benchmark runs as follows:

With FMA:

       578,500,241      cycles:u                         #    3.645 GHz         
                ( +-  0.12% )
       753,318,477      instructions:u                   #    1.30  insn per 
cycle              ( +-  0.00% )
       125,417,701      branches:u                       #  790.227 M/sec       
                ( +-  0.00% )
          0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )


No FMA:

       577,573,960      cycles:u                         #    3.514 GHz         
                ( +-  0.15% )
       878,318,479      instructions:u                   #    1.52  insn per 
cycle              ( +-  0.00% )
       125,417,702      branches:u                       #  763.035 M/sec       
                ( +-  0.00% )
          0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )

So the cycle count is unchanged and a discrete multiply+add takes the same time
as an FMA.

While on zen:


With FMA:
         484875179      cycles:u                         #    3.599 GHz         
             ( +-  0.05% )  (82.11%)
         752031517      instructions:u                   #    1.55  insn per 
cycle         
         125106525      branches:u                       #  928.712 M/sec       
             ( +-  0.03% )  (85.09%)
            128356      branch-misses:u                  #    0.10% of all 
branches          ( +-  0.06% )  (83.58%)

No FMA:
         375875209      cycles:u                         #    3.592 GHz         
             ( +-  0.08% )  (80.74%)
         875725341      instructions:u                   #    2.33  insn per 
cycle
         124903825      branches:u                       #    1.194 G/sec       
             ( +-  0.04% )  (84.59%)
          0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )

The difference is that Core CPUs understand that fmadd does not need
all three operands to start computation, while Zen cores don't.

Since this seems to be a noticeable win on Zen and not a loss on Core, it seems
like a good default for generic.

I plan to commit the patch next week if there are no complaints.

Honza

#include <stdio.h>
#include <time.h>

/* Dimension of each square matrix used by the benchmark.  */
#define SIZE 1000

/* Operand matrices, filled by init().  */
float a[SIZE][SIZE];
float b[SIZE][SIZE];
/* Result matrix: zeroed by init(), accumulated into by mult().  */
float c[SIZE][SIZE];

/* Fill the operand matrices with deterministic values and clear the result:
   a[i][j] = i + j, b[i][j] = i - j, c[i][j] = 0.  */
void init(void)
{
   int i, j;

   for(i=0; i<SIZE; ++i)
   {
      for(j=0; j<SIZE; ++j)
      {
         /* The cast makes the arithmetic happen in float rather than int.  */
         a[i][j] = (float)i + j;
         b[i][j] = (float)i - j;
         c[i][j] = 0.0f;
      }
   }
}

/* Benchmark kernel: c += a * b using the plain row/column/inner loop nest.
   Each inner-loop step adds one product directly onto c[row][col], so every
   iteration depends on the value produced by the previous one.  */
void mult(void)
{
   for (int row = 0; row < SIZE; ++row) {
      for (int col = 0; col < SIZE; ++col) {
         for (int inner = 0; inner < SIZE; ++inner) {
            c[row][col] += a[row][inner] * b[inner][col];
         }
      }
   }
}

/* Initialise the matrices, time a single mult() call with clock(), and
   print the elapsed value in clock() ticks.  */
int main(void)
{
   init();

   const clock_t started = clock();
   mult();
   const clock_t finished = clock();

   printf("        mult took %10d clocks\n", (int)(finished - started));
   return 0;
}

        * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
        X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and generic.

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 43fa9e8fd6d..74b03cbcc60 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -515,13 +515,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3
-          | m_YONGFENG)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+          | m_YONGFENG | m_GENERIC)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
    smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3
-         | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3 | m_ZNVER4
+         | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
    smaller FMA chain.  */

Reply via email to