Hello Alexander

On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov <amona...@ispras.ru>
wrote:

> Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD
> routines are invoked much more rarely in normal use when most buffers
> are non-zero. This makes use of AVX512 unprofitable, as it incurs extra
> frequency and voltage transition periods during which the CPU operates
> at reduced performance, as described in
> https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html


I would like to point out that the frequency scaling is not currently an
issue on AMD Zen4 Genoa CPUs, for example.
The microarchitecture is described in this white paper:
https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf
However, the CPU frequency downscaling mentioned in the above document is
discussed only in relation to floating-point operations.
From other online discussions I gather that the data path for the
integer registers in Zen4 is also 256 bits wide, which allows the CPU to
avoid frequency downscaling for FP and heavy instructions.
Looking at the optimizations for AVX2 in your other patch: would
unrolling the loop for AVX512 ops yield a similar speedup, given that the
data path has the same width?
If frequency downscaling is not observed on some CPUs, could AVX512
support be kept and used selectively on those CPUs?

Thank you!


>
>
> Signed-off-by: Mikhail Romanov <mmroma...@ispras.ru>
> Signed-off-by: Alexander Monakov <amona...@ispras.ru>
> ---
>  util/bufferiszero.c | 36 ++----------------------------------
>  1 file changed, 2 insertions(+), 34 deletions(-)
>
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 01050694a6..c037d11d04 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -64,7 +64,7 @@ buffer_is_zero_len_4_plus(const void *buf, size_t len)
>      }
>  }
>
> -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) ||
> defined(__SSE2__)
> +#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
>  #include <immintrin.h>
>
>  /* Note that each of these vectorized functions require len >= 64.  */
> @@ -128,35 +128,6 @@ buffer_zero_avx2(const void *buf, size_t len)
>  }
>  #endif /* CONFIG_AVX2_OPT */
>
> -#ifdef CONFIG_AVX512F_OPT
> -static bool __attribute__((target("avx512f")))
> -buffer_zero_avx512(const void *buf, size_t len)
> -{
> -    /* Begin with an unaligned head of 64 bytes.  */
> -    __m512i t = _mm512_loadu_si512(buf);
> -    __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> -    __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);
> -
> -    /* Loop over 64-byte aligned blocks of 256.  */
> -    while (p <= e) {
> -        __builtin_prefetch(p);
> -        if (unlikely(_mm512_test_epi64_mask(t, t))) {
> -            return false;
> -        }
> -        t = p[-4] | p[-3] | p[-2] | p[-1];
> -        p += 4;
> -    }
> -
> -    t |= _mm512_loadu_si512(buf + len - 4 * 64);
> -    t |= _mm512_loadu_si512(buf + len - 3 * 64);
> -    t |= _mm512_loadu_si512(buf + len - 2 * 64);
> -    t |= _mm512_loadu_si512(buf + len - 1 * 64);
> -
> -    return !_mm512_test_epi64_mask(t, t);
> -
> -}
> -#endif /* CONFIG_AVX512F_OPT */
> -
>  static unsigned __attribute__((noinline))
>  select_accel_cpuinfo(unsigned info)
>  {
> @@ -165,9 +136,6 @@ select_accel_cpuinfo(unsigned info)
>          unsigned bit;
>          bool (*fn)(const void *, size_t);
>      } all[] = {
> -#ifdef CONFIG_AVX512F_OPT
> -        { CPUINFO_AVX512F, buffer_zero_avx512 },
> -#endif
>  #ifdef CONFIG_AVX2_OPT
>          { CPUINFO_AVX2,    buffer_zero_avx2 },
>  #endif
> @@ -191,7 +159,7 @@ static unsigned used_accel
>      = 0;
>  #endif
>
> -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
> +#if defined(CONFIG_AVX2_OPT)
>  static void __attribute__((constructor)) init_accel(void)
>  {
>      used_accel = select_accel_cpuinfo(cpuinfo_init());
> --
> 2.32.0
>
>
>

-- 
Elena

Reply via email to