On 30 Jul 17:55, Kirill Yukhin wrote:
> On Wed, Jul 24, 2013 at 08:25:14AM -1000, Richard Henderson wrote:
> > On 07/24/2013 05:23 AM, Richard Biener wrote:
> > > "H.J. Lu" <hjl.to...@gmail.com> wrote:
> > > 
> > >> Hi,
> > >>
> > >> Here is a patch to extend x86-64 psABI to support AVX-512:
> > > 
> > > Afaik avx 512 doubles the amount of xmm registers. Can we get them callee 
> > > saved please?

Hello,
I've implemented a tiny patch on top of `avx512' branch.
It makes the low 128-bit parts of the first 8 new AVX-512 registers
callee-saved: xmm16 through xmm23.

Here is performance data. It seems we have a little degradation in GEOMEAN.

Workload: Spec2006
Dataset: test
Options experiment: -m64 -fstrict-aliasing -fno-prefetch-loop-arrays -Ofast 
-funroll-loops -flto -fwhole-program -mavx512f
Options reference : -m64 -fstrict-aliasing -fno-prefetch-loop-arrays -Ofast 
-funroll-loops -flto -fwhole-program

                "8 callee-"     "icount, all"   icount
                "save icount"   "call-clobber"  decrease
--------------------------------------------------------
400.perlbench   1686198567      1682320942      -0.23%
401.bzip2       18983033855     18983033907     0.00%
403.gcc         3999481141      3999095681      -0.01%
410.bwaves      13736672428     13736640026     0.00%
416.gamess      1531782811      1531350122      -0.03%
429.mcf         3079764286      3080957858      0.04%
433.milc        14628097067     14628175244     0.00%
434.zeusmp      21336261982     21359384879     0.11%
435.gromacs     3593653152      3588581849      -0.14%
436.cactusADM   2822346689      2828797842      0.23%
437.leslie3d    15903712760     15975143040     0.45%
444.namd        42446067469     43607637322     2.74%
445.gobmk       35272482208     35268743690     -0.01%
447.dealII      42476324881     42507009849     0.07%
450.soplex      45943150        45652666        -0.63%
453.povray      2314481169      2222157619      -3.99%
454.calculix    131024939       131078501       0.04%
456.hmmer       13853478444     13853306947     0.00%
458.sjeng       14173066874     14173066909     0.00%
459.GemsFDTD    2437559044      2437819638      0.01%
462.libquantum  175827242       175657854       -0.10%
464.h264ref     75718510217     75711714226     -0.01%
465.tonto       2505737844      2511457541      0.23%
470.lbm         4799298802      4812180033      0.27%
473.astar       17435751523     17435498947     0.00%
481.wrf         7144685575      7170593748      0.36%
482.sphinx3     6000198462      5984438416      -0.26%
483.xalancbmk   273958223       273638145       -0.12%
--------------------------------------------------------
GEOMEAN         4678862313      4677012093      -0.04%

A bigger % is better; a negative value means that the icount
increased after the experiment.


It seems to me that LRA is not always optimal, e.g. if you compile the attached 
testcase
with: ./build-x86_64-linux/gcc/xgcc -B./build-x86_64-linux/gcc repro.c -S 
-Ofast -mavx512f

Assembler for main looks like:
main:
.LFB2331:
        vcvtsi2ss       %edi, %xmm1, %xmm1
        subq    $24, %rsp
        vextractf32x4   $0x0, %zmm16, (%rsp)
        vmovaps %zmm1, %zmm16
        call    test
        vfmadd132ss     .LC1(%rip), %xmm16, %xmm16
        vmovaps %zmm16, %zmm2
        movl    $.LC2, %edi
        movl    $1, %eax
        vunpcklps       %xmm2, %xmm2, %xmm2
        vcvtps2pd       %xmm2, %xmm0
        call    printf
        vmovaps %zmm16, %zmm3
        vinsertf32x4    $0x0, (%rsp), %zmm16, %zmm16
        addq    $24, %rsp
        vcvttss2si      %xmm3, %eax
        ret
I have no idea why we are doing the conversion into %xmm1 and then copying it
to %xmm16. However, it may be a non-LRA issue.

Thanks, K


---
 gcc/config/i386/i386.c | 2 +-
 gcc/config/i386/i386.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6b13ac9..d6d8040 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -9125,7 +9125,7 @@ ix86_nsaved_sseregs (void)
   int nregs = 0;
   int regno;
 
-  if (!TARGET_64BIT_MS_ABI)
+  if (!(TARGET_64BIT_MS_ABI || TARGET_AVX512F))
     return 0;
   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d7a934d..9faab8b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1026,9 +1026,9 @@ enum target_cpu_default
 /*xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15*/              \
      6,   6,    6,    6,    6,    6,    6,    6,               \
 /*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/            \
-     6,    6,     6,    6,    6,    6,    6,    6,             \
+     0,    0,     0,    0,    0,    0,    0,    0,             \
 /*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/            \
-     6,    6,     6,    6,    6,    6,    6,    6,             \
+     1,    1,     1,    1,    1,    1,    1,    1,             \
  /* k0,  k1,  k2,  k3,  k4,  k5,  k6,  k7*/                    \
      1,   1,   1,   1,   1,   1,   1,   1 }
 
-- 
1.7.11.7


#include <stdio.h>
#include <immintrin.h>

/* Not referenced anywhere else in this testcase.  */
int *p;
/* Inputs/outputs of test ().  Declared volatile, so every read and write
   in test () is a real memory access that cannot be folded away.  */
volatile float g1 = 100, g2  = 200;

/* Print "Hi" followed by a newline.  Called from test () so that test ()
   contains a function call between its floating-point computations.  */
void foo ()
{
  printf ("Hi\n");
}

/* Read the volatile globals g1 and g2, compute (g1 + g2) + g1 * g2 with a
   call to foo () between the addition and the multiply-add, and store the
   result back into g2.  */
void extern
test (void)
{
  float x, y, z;
  y = g1;
  z = g2;
  x = y + z;
  /* x, y and z are all used again after this call, so their values are
     live across it.  */
  foo ();
  x += y * z;
  g2 = x;
}

/* Driver for the reproducer: accumulates argc into a float three times,
   calling test () between the second and third addition, prints the result
   and returns it converted to int.  argv is unused.  */
int
main (int argc, char **argv)
{
  float a = argc;
  a += argc;
  /* `a' is used again after this call, so its value is live across it.  */
  test ();
  a += argc;

  printf ("==> %f\n", a);

  /* Implicit float -> int conversion of the accumulated value.  */
  return a;
}

Reply via email to