On Mon, Mar 22, 2021 at 6:16 AM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> Simplify memcpy and memset inline strategies to avoid branches for
> Skylake family CPUs:
>
> 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
>    load and store for up to 16 * 16 (256) bytes when the data size is
>    fixed and known.
> 2. Inline only if data size is known to be <= 256.
>    a. Use "rep movsb/stosb" with simple code sequence if the data size
>       is a constant.
>    b. Use loop if data size is not a constant.
> 3. Use memcpy/memset library function if data size is unknown or > 256.
>
> On Cascadelake processor with -march=native -Ofast -flto,
>
> 1. Performance impacts of SPEC CPU 2017 rate are:
>
> 500.perlbench_r  0.17%
> 502.gcc_r       -0.36%
> 505.mcf_r        0.00%
> 520.omnetpp_r    0.08%
> 523.xalancbmk_r -0.62%
> 525.x264_r       1.04%
> 531.deepsjeng_r  0.11%
> 541.leela_r     -1.09%
> 548.exchange2_r -0.25%
> 557.xz_r         0.17%
> Geomean         -0.08%
>
> 503.bwaves_r     0.00%
> 507.cactuBSSN_r  0.69%
> 508.namd_r      -0.07%
> 510.parest_r     1.12%
> 511.povray_r     1.82%
> 519.lbm_r        0.00%
> 521.wrf_r       -1.32%
> 526.blender_r   -0.47%
> 527.cam4_r       0.23%
> 538.imagick_r   -1.72%
> 544.nab_r       -0.56%
> 549.fotonik3d_r  0.12%
> 554.roms_r       0.43%
> Geomean          0.02%
>
> 2. Significant impacts on eembc benchmarks are:
>
> eembc/idctrn01   9.23%
> eembc/nnet_test  29.26%
>
> gcc/
>
>         * config/i386/x86-tune-costs.h (skylake_memcpy): Updated.
>         (skylake_memset): Likewise.
>         (skylake_cost): Change CLEAR_RATIO to 17.
>         * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
>         Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER,
>         m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512.
>
> gcc/testsuite/
>
>         * gcc.target/i386/memcpy-strategy-9.c: New test.
>         * gcc.target/i386/memcpy-strategy-10.c: Likewise.
>         * gcc.target/i386/memcpy-strategy-11.c: Likewise.
>         * gcc.target/i386/memset-strategy-7.c: Likewise.
>         * gcc.target/i386/memset-strategy-8.c: Likewise.
>         * gcc.target/i386/memset-strategy-9.c: Likewise.
> ---
>  gcc/config/i386/x86-tune-costs.h              | 27 ++++++++++++-------
>  gcc/config/i386/x86-tune.def                  |  3 +--
>  .../gcc.target/i386/memcpy-strategy-10.c      | 11 ++++++++
>  .../gcc.target/i386/memcpy-strategy-11.c      | 18 +++++++++++++
>  .../gcc.target/i386/memcpy-strategy-9.c       |  9 +++++++
>  .../gcc.target/i386/memset-strategy-7.c       | 11 ++++++++
>  .../gcc.target/i386/memset-strategy-8.c       |  9 +++++++
>  .../gcc.target/i386/memset-strategy-9.c       | 17 ++++++++++++
>  8 files changed, 93 insertions(+), 12 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-7.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-8.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-9.c
>
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index 0e00ff99df3..ffe810f2bcb 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = {
>
>  /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
>  static stringop_algs skylake_memcpy[2] =   {
> -  {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
> -  {libcall, {{16, loop, false}, {512, unrolled_loop, false},
> -             {-1, libcall, false}}}};
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}},
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}}};
>
>  static stringop_algs skylake_memset[2] = {
> -  {libcall, {{6, loop_1_byte, true},
> -             {24, loop, true},
> -             {8192, rep_prefix_4_byte, true},
> -             {-1, libcall, false}}},
> -  {libcall, {{24, loop, true}, {512, unrolled_loop, false},
> -             {-1, libcall, false}}}};
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}},
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}}};
>
>  static const
>  struct processor_costs skylake_cost = {
> @@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = {
>    COSTS_N_INSNS (0),                   /* cost of movzx */
>    8,                                   /* "large" insn */
>    17,                                  /* MOVE_RATIO */
> -  6,                                   /* CLEAR_RATIO */
> +  17,                                  /* CLEAR_RATIO */
>    {4, 4, 4},                           /* cost of loading integer registers
>                                            in QImode, HImode and SImode.
>                                            Relative to reg-reg move (2).  */
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 134916cc972..eb057a67750 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
>     move/set sequences of bytes with known size.  */
>  DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
>           "prefer_known_rep_movsb_stosb",
> -         m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE
> -         | m_ALDERLAKE | m_SAPPHIRERAPIDS)
> +         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
>
>  /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
>     compact prologues and epilogues by issuing a misaligned moves.  This
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
> new file mode 100644
> index 00000000000..970aa741971
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=skylake -mno-sse" } */
> +/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "rep movsb" } } */
> +
> +void
> +foo (char *dest, char *src)
> +{
> +  __builtin_memcpy (dest, src, 257);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
> new file mode 100644
> index 00000000000..b6041944630
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=skylake" } */
> +/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "rep movsb" } } */
> +
> +typedef unsigned char e_u8;
> +
> +#define MAXBC 8
> +
> +void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
> +{
> +  e_u8 b[4][MAXBC];
> +  int i, j;
> +
> +  for(i = 0; i < 4; i++)
> +    for(j = 0; j < BC; j++) a[i][j] = b[i][j];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
> new file mode 100644
> index 00000000000..b0dc7484d09
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=skylake -mno-sse" } */
> +/* { dg-final { scan-assembler "rep movsb" } } */
> +
> +void
> +foo (char *dest, char *src)
> +{
> +  __builtin_memcpy (dest, src, 256);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-7.c b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
> new file mode 100644
> index 00000000000..07c2816910c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=skylake -mno-sse" } */
> +/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "rep stosb" } } */
> +
> +void
> +foo (char *dest)
> +{
> +  __builtin_memset (dest, 0, 257);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-8.c b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
> new file mode 100644
> index 00000000000..52ea882c814
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=skylake -mno-sse" } */
> +/* { dg-final { scan-assembler "rep stosb" } } */
> +
> +void
> +foo (char *dest)
> +{
> +  __builtin_memset (dest, 0, 256);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-9.c b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
> new file mode 100644
> index 00000000000..d4db031958f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=skylake" } */
> +/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "rep stosb" } } */
> +
> +typedef unsigned char e_u8;
> +
> +#define MAXBC 8
> +
> +void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
> +{
> +  int i, j;
> +
> +  for(i = 0; i < 4; i++)
> +    for(j = 0; j < BC; j++) a[i][j] = 1;
> +}
> --
> 2.30.2
>

If there are no objections, I will check it in on Wednesday.

-- 
H.J.

Reply via email to