On Mon, Mar 22, 2021 at 6:16 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > Simplify memcpy and memset inline strategies to avoid branches for > Skylake family CPUs: > > 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector > load and store for up to 16 * 16 (256) bytes when the data size is > fixed and known. > 2. Inline only if data size is known to be <= 256. > a. Use "rep movsb/stosb" with simple code sequence if the data size > is a constant. > b. Use loop if data size is not a constant. > 3. Use memcpy/memset library function if data size is unknown or > 256. > > On Cascadelake processor with -march=native -Ofast -flto, > > 1. Performance impacts of SPEC CPU 2017 rate are: > > 500.perlbench_r 0.17% > 502.gcc_r -0.36% > 505.mcf_r 0.00% > 520.omnetpp_r 0.08% > 523.xalancbmk_r -0.62% > 525.x264_r 1.04% > 531.deepsjeng_r 0.11% > 541.leela_r -1.09% > 548.exchange2_r -0.25% > 557.xz_r 0.17% > Geomean -0.08% > > 503.bwaves_r 0.00% > 507.cactuBSSN_r 0.69% > 508.namd_r -0.07% > 510.parest_r 1.12% > 511.povray_r 1.82% > 519.lbm_r 0.00% > 521.wrf_r -1.32% > 526.blender_r -0.47% > 527.cam4_r 0.23% > 538.imagick_r -1.72% > 544.nab_r -0.56% > 549.fotonik3d_r 0.12% > 554.roms_r 0.43% > Geomean 0.02% > > 2. Significant impacts on eembc benchmarks are: > > eembc/idctrn01 9.23% > eembc/nnet_test 29.26% > > gcc/ > > * config/i386/x86-tune-costs.h (skylake_memcpy): Updated. > (skylake_memset): Likewise. > (skylake_cost): Change CLEAR_RATIO to 17. > * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB): > Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER, > m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512. > > gcc/testsuite/ > > * gcc.target/i386/memcpy-strategy-9.c: New test. > * gcc.target/i386/memcpy-strategy-10.c: Likewise. > * gcc.target/i386/memcpy-strategy-11.c: Likewise. > * gcc.target/i386/memset-strategy-7.c: Likewise. > * gcc.target/i386/memset-strategy-8.c: Likewise. > * gcc.target/i386/memset-strategy-9.c: Likewise. 
> --- > gcc/config/i386/x86-tune-costs.h | 27 ++++++++++++------- > gcc/config/i386/x86-tune.def | 3 +-- > .../gcc.target/i386/memcpy-strategy-10.c | 11 ++++++++ > .../gcc.target/i386/memcpy-strategy-11.c | 18 +++++++++++++ > .../gcc.target/i386/memcpy-strategy-9.c | 9 +++++++ > .../gcc.target/i386/memset-strategy-7.c | 11 ++++++++ > .../gcc.target/i386/memset-strategy-8.c | 9 +++++++ > .../gcc.target/i386/memset-strategy-9.c | 17 ++++++++++++ > 8 files changed, 93 insertions(+), 12 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c > create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c > create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c > create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-7.c > create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-8.c > create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-9.c > > diff --git a/gcc/config/i386/x86-tune-costs.h > b/gcc/config/i386/x86-tune-costs.h > index 0e00ff99df3..ffe810f2bcb 100644 > --- a/gcc/config/i386/x86-tune-costs.h > +++ b/gcc/config/i386/x86-tune-costs.h > @@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = { > > /* skylake_cost should produce code tuned for Skylake familly of CPUs. 
*/ > static stringop_algs skylake_memcpy[2] = { > - {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, > - {libcall, {{16, loop, false}, {512, unrolled_loop, false}, > - {-1, libcall, false}}}}; > + {libcall, > + {{256, rep_prefix_1_byte, true}, > + {256, loop, false}, > + {-1, libcall, false}}}, > + {libcall, > + {{256, rep_prefix_1_byte, true}, > + {256, loop, false}, > + {-1, libcall, false}}}}; > > static stringop_algs skylake_memset[2] = { > - {libcall, {{6, loop_1_byte, true}, > - {24, loop, true}, > - {8192, rep_prefix_4_byte, true}, > - {-1, libcall, false}}}, > - {libcall, {{24, loop, true}, {512, unrolled_loop, false}, > - {-1, libcall, false}}}}; > + {libcall, > + {{256, rep_prefix_1_byte, true}, > + {256, loop, false}, > + {-1, libcall, false}}}, > + {libcall, > + {{256, rep_prefix_1_byte, true}, > + {256, loop, false}, > + {-1, libcall, false}}}}; > > static const > struct processor_costs skylake_cost = { > @@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = { > COSTS_N_INSNS (0), /* cost of movzx */ > 8, /* "large" insn */ > 17, /* MOVE_RATIO */ > - 6, /* CLEAR_RATIO */ > + 17, /* CLEAR_RATIO */ > {4, 4, 4}, /* cost of loading integer registers > in QImode, HImode and SImode. > Relative to reg-reg move (2). */ > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index 134916cc972..eb057a67750 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", > m_386 | m_P4_NOCONA) > move/set sequences of bytes with known size. */ > DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, > "prefer_known_rep_movsb_stosb", > - m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE > - | m_ALDERLAKE | m_SAPPHIRERAPIDS) > + m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512) > > /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of > compact prologues and epilogues by issuing a misaligned moves. 
This > diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c > b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c > new file mode 100644 > index 00000000000..970aa741971 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=skylake -mno-sse" } */ > +/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "rep movsb" } } */ > + > +void > +foo (char *dest, char *src) > +{ > + __builtin_memcpy (dest, src, 257); > +} > diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c > b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c > new file mode 100644 > index 00000000000..b6041944630 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=skylake" } */ > +/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "rep movsb" } } */ > + > +typedef unsigned char e_u8; > + > +#define MAXBC 8 > + > +void MixColumn(e_u8 a[4][MAXBC], e_u8 BC) > +{ > + e_u8 b[4][MAXBC]; > + int i, j; > + > + for(i = 0; i < 4; i++) > + for(j = 0; j < BC; j++) a[i][j] = b[i][j]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c > b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c > new file mode 100644 > index 00000000000..b0dc7484d09 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c > @@ -0,0 +1,9 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=skylake -mno-sse" } */ > +/* { dg-final { scan-assembler "rep movsb" } } */ > + > +void > +foo (char *dest, char *src) > +{ > + __builtin_memcpy (dest, src, 256); > +} > diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-7.c > b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c > new file mode 100644 > index 00000000000..07c2816910c > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=skylake -mno-sse" } */ > +/* { dg-final { scan-assembler "jmp\tmemset" { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "rep stosb" } } */ > + > +void > +foo (char *dest) > +{ > + __builtin_memset (dest, 0, 257); > +} > diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-8.c > b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c > new file mode 100644 > index 00000000000..52ea882c814 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c > @@ -0,0 +1,9 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=skylake -mno-sse" } */ > +/* { dg-final { scan-assembler "rep stosb" } } */ > + > +void > +foo (char *dest) > +{ > + __builtin_memset (dest, 0, 256); > +} > diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-9.c > b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c > new file mode 100644 > index 00000000000..d4db031958f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=skylake" } */ > +/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "rep stosb" } } */ > + > +typedef unsigned char e_u8; > + > +#define MAXBC 8 > + > +void MixColumn(e_u8 a[4][MAXBC], e_u8 BC) > +{ > + int i, j; > + > + for(i = 0; i < 4; i++) > + for(j = 0; j < BC; j++) a[i][j] = 1; > +} > -- > 2.30.2 >
If there are no objections, I will check it in on Wednesday. -- H.J.