aba3b9d3a48a0703fd565f7c5f0caf604f59970b is the first bad commit commit aba3b9d3a48a0703fd565f7c5f0caf604f59970b Author: H.J. Lu <hjl.to...@gmail.com> Date: Fri May 9 07:17:07 2025 +0800
x86: Extend the remove_redundant_vector pass. That commit, which removed non all 0s/1s redundant vector loads, caused SPEC CPU 2017 519.lbm_r and SPEC CPU 2006 470.lbm performance regressions on AMD znverN processors. Add a tuning option to keep non all 0s/1s redundant vector loads on AMD znverN processors. gcc/ PR target/120941 * config/i386/i386-features.cc (ix86_broadcast_inner): Keep non all 0s/1s redundant vector loads if asked. * config/i386/x86-tune.def (X86_TUNE_KEEP_REDUNDANT_VECTOR_LOAD): New tuning. gcc/testsuite/ PR target/120941 * gcc.target/i386/pr120941-1a.c: New test. * gcc.target/i386/pr120941-1b.c: Likewise. * gcc.target/i386/pr120941-1c.c: Likewise. * gcc.target/i386/pr120941-1d.c: Likewise. OK for master? Thanks. -- H.J.
From 27ca9842b54b9e3f585e23abe5fa6b21aa043c73 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.to...@gmail.com> Date: Sat, 5 Jul 2025 04:12:47 +0800 Subject: [PATCH] x86: Keep non all 0s/1s redundant vector loads on AMD znverN aba3b9d3a48a0703fd565f7c5f0caf604f59970b is the first bad commit commit aba3b9d3a48a0703fd565f7c5f0caf604f59970b Author: H.J. Lu <hjl.to...@gmail.com> Date: Fri May 9 07:17:07 2025 +0800 x86: Extend the remove_redundant_vector pass. That commit, which removed non all 0s/1s redundant vector loads, caused SPEC CPU 2017 519.lbm_r and SPEC CPU 2006 470.lbm performance regressions on AMD znverN processors. Add a tuning option to keep non all 0s/1s redundant vector loads on AMD znverN processors. gcc/ PR target/120941 * config/i386/i386-features.cc (ix86_broadcast_inner): Keep non all 0s/1s redundant vector loads if asked. * config/i386/x86-tune.def (X86_TUNE_KEEP_REDUNDANT_VECTOR_LOAD): New tuning. gcc/testsuite/ PR target/120941 * gcc.target/i386/pr120941-1a.c: New test. * gcc.target/i386/pr120941-1b.c: Likewise. * gcc.target/i386/pr120941-1c.c: Likewise. * gcc.target/i386/pr120941-1d.c: Likewise. Signed-off-by: H.J. 
Lu <hjl.to...@gmail.com> --- gcc/config/i386/i386-features.cc | 4 ++++ gcc/config/i386/x86-tune.def | 4 ++++ gcc/testsuite/gcc.target/i386/pr120941-1a.c | 19 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr120941-1b.c | 5 +++++ gcc/testsuite/gcc.target/i386/pr120941-1c.c | 5 +++++ gcc/testsuite/gcc.target/i386/pr120941-1d.c | 5 +++++ 6 files changed, 42 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr120941-1a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr120941-1b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr120941-1c.c create mode 100644 gcc/testsuite/gcc.target/i386/pr120941-1d.c diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 054f8d5ddc8..574eaf2e4d2 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3552,6 +3552,10 @@ ix86_broadcast_inner (rtx op, machine_mode mode, return constm1_rtx; } + /* Skip if non all 0s/1s redundant vector loads should be kept. */ + if (ix86_tune_features[X86_TUNE_KEEP_REDUNDANT_VECTOR_LOAD]) + return nullptr; + mode = GET_MODE (op); int nunits = GET_MODE_NUNITS (mode); if (nunits < 2) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 91cdca7fbfc..d0b1da007f9 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -639,6 +639,10 @@ DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces", DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues", m_ZNVER4 | m_ZNVER5) +/* X86_TUNE_KEEP_REDUNDANT_VECTOR_LOAD: Keep redundant vector loads. 
*/ +DEF_TUNE (X86_TUNE_KEEP_REDUNDANT_VECTOR_LOAD, "keep_redundant_vector_load", + m_ZNVER) + /*****************************************************************************/ /*****************************************************************************/ /* Historical relics: tuning flags that helps a specific old CPU designs */ diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1a.c b/gcc/testsuite/gcc.target/i386/pr120941-1a.c new file mode 100644 index 00000000000..daced44b4b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120941-1a.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v4 -O2 -mtune=znver5" } */ +/* { dg-final { scan-assembler-times "vpbroadcastb" 2 } } */ + +#include <immintrin.h> + +extern __m512i sinkz; +extern __m256i sinky; +extern char f; + +void +foo(char c, int x) +{ + c += f; + sinkz = _mm512_set1_epi8(c); + if (x == 2) + f += 3; + sinky = _mm256_set1_epi8(c); +} diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1b.c b/gcc/testsuite/gcc.target/i386/pr120941-1b.c new file mode 100644 index 00000000000..a00ba5eb8ac --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120941-1b.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v4 -O2 -mtune=znver4" } */ +/* { dg-final { scan-assembler-times "vpbroadcastb" 2 } } */ + +#include "pr120941-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1c.c b/gcc/testsuite/gcc.target/i386/pr120941-1c.c new file mode 100644 index 00000000000..2b10ecdee82 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120941-1c.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v4 -O2 -mtune=znver3" } */ +/* { dg-final { scan-assembler-times "vpbroadcastb" 2 } } */ + +#include "pr120941-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1d.c b/gcc/testsuite/gcc.target/i386/pr120941-1d.c new file mode 100644 index 00000000000..39a5714d99d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120941-1d.c @@ -0,0 +1,5 @@ +/* { dg-do 
compile } */ +/* { dg-options "-march=x86-64-v4 -O2 -mtune=znver2" } */ +/* { dg-final { scan-assembler-times "vpbroadcastb" 2 } } */ + +#include "pr120941-1a.c" -- 2.50.0