Hello Alexander, On Tue, Feb 6, 2024 at 12:50 PM Alexander Monakov <amona...@ispras.ru> wrote:
> Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD > routines are invoked much more rarely in normal use when most buffers > are non-zero. This makes use of AVX512 unprofitable, as it incurs extra > frequency and voltage transition periods during which the CPU operates > at reduced performance, as described in > https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html I would like to point out that the frequency scaling is not currently an issue on AMD Zen4 Genoa CPUs, for example. The microarchitecture description is here: https://www.amd.com/system/files/documents/4th-gen-epyc-processor-architecture-white-paper.pdf However, the CPU frequency downscaling mentioned in the above document is only discussed in relation to floating-point operations. But from other online discussions I gather that the data path for the integer registers in Zen4 is also 256 bits, which makes it possible to avoid frequency downscaling for FP and heavy instructions. And looking at the optimizations for AVX2 in your other patch, would unrolling the loop for AVX512 ops benefit from the same speedup, given that the data path has the same width? If the frequency downscaling is not observed on some of the CPUs, could AVX512 be maintained and used selectively on those CPUs? Thank you! > > > Signed-off-by: Mikhail Romanov <mmroma...@ispras.ru> > Signed-off-by: Alexander Monakov <amona...@ispras.ru> > --- > util/bufferiszero.c | 36 ++---------------------------------- > 1 file changed, 2 insertions(+), 34 deletions(-) > > diff --git a/util/bufferiszero.c b/util/bufferiszero.c > index 01050694a6..c037d11d04 100644 > --- a/util/bufferiszero.c > +++ b/util/bufferiszero.c > @@ -64,7 +64,7 @@ buffer_is_zero_len_4_plus(const void *buf, size_t len) > } > } > > -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || > defined(__SSE2__) > +#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) > #include <immintrin.h> > > /* Note that each of these vectorized functions require len >= 64. 
*/ > @@ -128,35 +128,6 @@ buffer_zero_avx2(const void *buf, size_t len) > } > #endif /* CONFIG_AVX2_OPT */ > > -#ifdef CONFIG_AVX512F_OPT > -static bool __attribute__((target("avx512f"))) > -buffer_zero_avx512(const void *buf, size_t len) > -{ > - /* Begin with an unaligned head of 64 bytes. */ > - __m512i t = _mm512_loadu_si512(buf); > - __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64); > - __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64); > - > - /* Loop over 64-byte aligned blocks of 256. */ > - while (p <= e) { > - __builtin_prefetch(p); > - if (unlikely(_mm512_test_epi64_mask(t, t))) { > - return false; > - } > - t = p[-4] | p[-3] | p[-2] | p[-1]; > - p += 4; > - } > - > - t |= _mm512_loadu_si512(buf + len - 4 * 64); > - t |= _mm512_loadu_si512(buf + len - 3 * 64); > - t |= _mm512_loadu_si512(buf + len - 2 * 64); > - t |= _mm512_loadu_si512(buf + len - 1 * 64); > - > - return !_mm512_test_epi64_mask(t, t); > - > -} > -#endif /* CONFIG_AVX512F_OPT */ > - > static unsigned __attribute__((noinline)) > select_accel_cpuinfo(unsigned info) > { > @@ -165,9 +136,6 @@ select_accel_cpuinfo(unsigned info) > unsigned bit; > bool (*fn)(const void *, size_t); > } all[] = { > -#ifdef CONFIG_AVX512F_OPT > - { CPUINFO_AVX512F, buffer_zero_avx512 }, > -#endif > #ifdef CONFIG_AVX2_OPT > { CPUINFO_AVX2, buffer_zero_avx2 }, > #endif > @@ -191,7 +159,7 @@ static unsigned used_accel > = 0; > #endif > > -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) > +#if defined(CONFIG_AVX2_OPT) > static void __attribute__((constructor)) init_accel(void) > { > used_accel = select_accel_cpuinfo(cpuinfo_init()); > -- > 2.32.0 > > > -- Elena