[PATCH 2/2] util: add util function buffer_zero_avx512()

2020-02-12 Thread Robert Hoo
And initialize buffer_is_zero() with it when Intel AVX512F is
available on the host.

This function utilizes the Intel AVX512 Foundation instructions, which
outperform the previous AVX2 instructions.

Signed-off-by: Robert Hoo 
---
 include/qemu/cpuid.h |  3 +++
 util/bufferiszero.c  | 56 +---
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
index 6930170..09fc245 100644
--- a/include/qemu/cpuid.h
+++ b/include/qemu/cpuid.h
@@ -45,6 +45,9 @@
 #ifndef bit_AVX2
 #define bit_AVX2        (1 << 5)
 #endif
+#ifndef bit_AVX512F
+#define bit_AVX512F     (1 << 16)
+#endif
 #ifndef bit_BMI2
 #define bit_BMI2        (1 << 8)
 #endif
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index bfb2605..cbb854a 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -187,12 +187,54 @@ buffer_zero_avx2(const void *buf, size_t len)
 #pragma GCC pop_options
 #endif /* CONFIG_AVX2_OPT */
 
+#ifdef CONFIG_AVX512F_OPT
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#include <immintrin.h>
+
+static bool
+buffer_zero_avx512(const void *buf, size_t len)
+{
+    __m512i t;
+    __m512i *p, *e;
+
+    if (unlikely(len < 64)) { /* buff less than 512 bits, unlikely */
+        return buffer_zero_int(buf, len);
+    }
+    /* Begin with an unaligned head of 64 bytes.  */
+    t = _mm512_loadu_si512(buf);
+    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
+    e = (__m512i *)(((uintptr_t)buf + len) & -64);
+
+    /* Loop over 64-byte aligned blocks of 256.  */
+    while (p < e) {
+        __builtin_prefetch(p);
+        if (unlikely(_mm512_test_epi64_mask(t, t))) {
+            return false;
+        }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    }
+
+    t |= _mm512_loadu_si512(buf + len - 4 * 64);
+    t |= _mm512_loadu_si512(buf + len - 3 * 64);
+    t |= _mm512_loadu_si512(buf + len - 2 * 64);
+    t |= _mm512_loadu_si512(buf + len - 1 * 64);
+
+    return !_mm512_test_epi64_mask(t, t);
+
+}
+#pragma GCC pop_options
+#endif
+
+
 /* Note that for test_buffer_is_zero_next_accel, the most preferred
  * ISA must have the least significant bit.
  */
-#define CACHE_AVX2    1
-#define CACHE_SSE4    2
-#define CACHE_SSE2    4
+#define CACHE_AVX512F 1
+#define CACHE_AVX2    2
+#define CACHE_SSE4    4
+#define CACHE_SSE2    6
 
 /* Make sure that these variables are appropriately initialized when
  * SSE2 is enabled on the compiler command-line, but the compiler is
@@ -226,6 +268,11 @@ static void init_accel(unsigned cache)
         fn = buffer_zero_avx2;
     }
 #endif
+#ifdef CONFIG_AVX512F_OPT
+    if (cache & CACHE_AVX512F) {
+        fn = buffer_zero_avx512;
+    }
+#endif
     buffer_accel = fn;
 }
 
@@ -255,6 +302,9 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
             if ((bv & 6) == 6 && (b & bit_AVX2)) {
                 cache |= CACHE_AVX2;
             }
+            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
+                cache |= CACHE_AVX512F;
+            }
         }
     }
     cpuid_cache = cache;
-- 
1.8.3.1




Re: [PATCH 2/2] util: add util function buffer_zero_avx512()

2020-02-13 Thread Paolo Bonzini
On 13/02/20 08:52, Robert Hoo wrote:
> +
> +}
> +#pragma GCC pop_options
> +#endif
> +
> +
>  /* Note that for test_buffer_is_zero_next_accel, the most preferred
>   * ISA must have the least significant bit.
>   */
> -#define CACHE_AVX2    1
> -#define CACHE_SSE4    2
> -#define CACHE_SSE2    4
> +#define CACHE_AVX512F 1
> +#define CACHE_AVX2    2
> +#define CACHE_SSE4    4
> +#define CACHE_SSE2    6

This should be 8, not 6.

Paolo

>  
>  /* Make sure that these variables are appropriately initialized when
>   * SSE2 is enabled on the compiler command-line, but the compiler is
> @@ -226,6 +268,11 @@ static void init_accel(unsigned cache)
>          fn = buffer_zero_avx2;
>      }
>  #endif
> +#ifdef CONFIG_AVX512F_OPT
> +    if (cache & CACHE_AVX512F) {
> +        fn = buffer_zero_avx512;
> +    }
> +#endif
>      buffer_accel = fn;
>  }
>  
> @@ -255,6 +302,9 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
>              if ((bv & 6) == 6 && (b & bit_AVX2)) {
>                  cache |= CACHE_AVX2;
>              }
> +            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
> +                cache |= CACHE_AVX512F;
> +            }
>          }




Re: [PATCH 2/2] util: add util function buffer_zero_avx512()

2020-02-13 Thread Robert Hoo
On Thu, 2020-02-13 at 11:30 +0100, Paolo Bonzini wrote:
> On 13/02/20 08:52, Robert Hoo wrote:
> > +
> > +}
> > +#pragma GCC pop_options
> > +#endif
> > +
> > +
> >  /* Note that for test_buffer_is_zero_next_accel, the most
> > preferred
> >   * ISA must have the least significant bit.
> >   */
> > -#define CACHE_AVX2    1
> > -#define CACHE_SSE4    2
> > -#define CACHE_SSE2    4
> > +#define CACHE_AVX512F 1
> > +#define CACHE_AVX2    2
> > +#define CACHE_SSE4    4
> > +#define CACHE_SSE2    6
> 
> This should be 8, not 6.
> 
> Paolo

Thanks Paolo, going to fix it in v2.
> 
> >  
> >  /* Make sure that these variables are appropriately initialized when
> >   * SSE2 is enabled on the compiler command-line, but the compiler is
> > @@ -226,6 +268,11 @@ static void init_accel(unsigned cache)
> >          fn = buffer_zero_avx2;
> >      }
> >  #endif
> > +#ifdef CONFIG_AVX512F_OPT
> > +    if (cache & CACHE_AVX512F) {
> > +        fn = buffer_zero_avx512;
> > +    }
> > +#endif
> >      buffer_accel = fn;
> >  }
> >
> > @@ -255,6 +302,9 @@ static void __attribute__((constructor)) init_cpuid_cache(void)
> >              if ((bv & 6) == 6 && (b & bit_AVX2)) {
> >                  cache |= CACHE_AVX2;
> >              }
> > +            if ((bv & 6) == 6 && (b & bit_AVX512F)) {
> > +                cache |= CACHE_AVX512F;
> > +            }
> >          }
> 
> 




Re: [PATCH 2/2] util: add util function buffer_zero_avx512()

2020-02-13 Thread Richard Henderson
On 2/12/20 11:52 PM, Robert Hoo wrote:
> And initialize buffer_is_zero() with it when Intel AVX512F is
> available on the host.
> 
> This function utilizes the Intel AVX512 Foundation instructions, which
> outperform the previous AVX2 instructions.

Is it not still true that any AVX512 insn will cause the entire cpu package,
not just the current core, to drop frequency by 20%?

As far as I know one should only use the 512-bit instructions when you can
overcome that frequency drop, which seems unlikely in this case.  That said...


> +    if (unlikely(len < 64)) { /* buff less than 512 bits, unlikely */
> +        return buffer_zero_int(buf, len);
> +    }

First, len < 64 has been eliminated already in select_accel_fn.
Second, len < 256 is not handled properly by the code below...


> +    /* Begin with an unaligned head of 64 bytes.  */
> +    t = _mm512_loadu_si512(buf);
> +    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> +    e = (__m512i *)(((uintptr_t)buf + len) & -64);
> +
> +    /* Loop over 64-byte aligned blocks of 256.  */
> +    while (p < e) {
> +        __builtin_prefetch(p);
> +        if (unlikely(_mm512_test_epi64_mask(t, t))) {
> +            return false;
> +        }
> +        t = p[-4] | p[-3] | p[-2] | p[-1];
> +        p += 4;
> +    }
> +
> +    t |= _mm512_loadu_si512(buf + len - 4 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 3 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 2 * 64);
> +    t |= _mm512_loadu_si512(buf + len - 1 * 64);

... because this final sequence loads 256 bytes.

Rather than make a second test vs 256 in buffer_zero_avx512, I wonder if it
would be better to have select_accel_fn do the job.  Have a global variable
buffer_accel_size alongside buffer_accel so there's only one branch
(mis)predict to worry about.
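
For reference, here is a minimal sketch of that companion-variable idea.
buffer_accel, buffer_zero_int and select_accel_fn already exist in
util/bufferiszero.c; buffer_accel_size and its handling are illustrative,
not code from the patch:

static bool (*buffer_accel)(const void *, size_t) = buffer_zero_int;
static size_t buffer_accel_size;  /* minimum length buffer_accel can handle */

static bool select_accel_fn(const void *buf, size_t len)
{
    if (likely(len >= buffer_accel_size)) {
        return buffer_accel(buf, len);
    }
    /* Shorter buffers fall back to the plain integer loop. */
    return buffer_zero_int(buf, len);
}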

FWIW, something that the compiler should do, but doesn't currently, is use
vpternlogq to perform a 3-input OR.  Something like

/* 0xfe -> orABC */
t = _mm512_ternarylogic_epi64(t, p[-4], p[-3], 0xfe);
t = _mm512_ternarylogic_epi64(t, p[-2], p[-1], 0xfe);
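
(Why 0xfe works: the immediate is the truth table of the three-input boolean
function, and 0xfe = 1111 1110b maps only the all-zeros input to 0, which is
exactly a 3-input OR.  A self-contained illustration, not part of the patch:)

#include <immintrin.h>

__attribute__((target("avx512f")))
static __m512i or3(__m512i a, __m512i b, __m512i c)
{
    /* imm8 is the truth table; 0xfe is 0 only when a = b = c = 0. */
    return _mm512_ternarylogic_epi64(a, b, c, 0xfe);
}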


r~



Re: [PATCH 2/2] util: add util function buffer_zero_avx512()

2020-02-24 Thread Richard Henderson
On 2/23/20 11:07 PM, Robert Hoo wrote:
> Inspired by your suggestion, I'm thinking of going further: use an immediate
> rather than a global variable, which saves one memory(/cache) access.
> 
> #ifdef CONFIG_AVX512F_OPT
> #define OPTIMIZE_LEN    256
> #else
> #define OPTIMIZE_LEN    64
> #endif

With that, the testing in tests/test-bufferiszero.c, looping through the
implementations, is invalidated.  Because once you start compiling for avx512,
you're no longer testing sse2 et al with the same inputs.
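
(Roughly how that loop over implementations works; a sketch rather than the
actual test code.  buffer_is_zero() and test_buffer_is_zero_next_accel() are
the existing helpers from qemu/cutils.h, while check_all_accels() and its
checks are illustrative:)

#include <assert.h>
#include <string.h>
#include "qemu/cutils.h"

static void check_all_accels(void)
{
    char buf[1024];

    do {
        /* The same inputs are run against every compiled-in accelerator. */
        memset(buf, 0, sizeof(buf));
        assert(buffer_is_zero(buf, sizeof(buf)));
        buf[sizeof(buf) - 1] = 1;
        assert(!buffer_is_zero(buf, sizeof(buf)));
    } while (test_buffer_is_zero_next_accel());
}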

IF we want to change the length to suit avx512, we would want to change it
unconditionally.  And then you could also tidy up avx2 to avoid the extra
comparisons there.


r~



Re: [PATCH 2/2] util: add util function buffer_zero_avx512()

2020-02-24 Thread Robert Hoo
On Mon, 2020-02-24 at 08:13 -0800, Richard Henderson wrote:
> On 2/23/20 11:07 PM, Robert Hoo wrote:
> > Inspired by your suggestion, I'm thinking of going further: use an immediate
> > rather than a global variable, which saves one memory(/cache) access.
> > 
> > #ifdef CONFIG_AVX512F_OPT
> > #define OPTIMIZE_LEN    256
> > #else
> > #define OPTIMIZE_LEN    64
> > #endif
> 
> With that, the testing in tests/test-bufferiszero.c, looping through the
> implementations, is invalidated.  Because once you start compiling for avx512,
> you're no longer testing sse2 et al with the same inputs.
> 
Right, thanks for pointing that out; I hadn't noticed.
More precisely, it would mean sse2 et al. are no longer tested with
lengths < 256.

> IF we want to change the length to suit avx512, we would want to change it
> unconditionally.  And then you could also tidy up avx2 to avoid the extra
> comparisons there.
Considering how the minimum length depends on the sse2/sse4/avx2/avx512
algorithms, as well as possible future changes and additions, I'd rather
roll back to your original suggestion and use a companion variable with
each accel_fn(). How do you like it?

> 
> 
> r~




Re: [PATCH 2/2] util: add util function buffer_zero_avx512()

2020-02-25 Thread Richard Henderson
On 2/24/20 11:34 PM, Robert Hoo wrote:
> Considering how the minimum length depends on the sse2/sse4/avx2/avx512
> algorithms, as well as possible future changes and additions, I'd rather
> roll back to your original suggestion and use a companion variable with
> each accel_fn(). How do you like it?

How do I like it?

With a modification to init_accel() so that the function and the minimum length
are selected at the same time.
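
One possible shape for that combined selection, sketched under the assumption
of the buffer_accel_size companion variable discussed earlier in the thread;
the per-ISA minimum lengths are illustrative, only the 256 for AVX512 follows
from the discussion above:

static void init_accel(unsigned cache)
{
    bool (*fn)(const void *, size_t) = buffer_zero_int;
    size_t len = 0;

    if (cache & CACHE_SSE2) {
        fn = buffer_zero_sse2;
        len = 64;
    }
#ifdef CONFIG_AVX2_OPT
    if (cache & CACHE_SSE4) {
        fn = buffer_zero_sse4;
        len = 64;
    }
    if (cache & CACHE_AVX2) {
        fn = buffer_zero_avx2;
        len = 128;
    }
#endif
#ifdef CONFIG_AVX512F_OPT
    if (cache & CACHE_AVX512F) {
        fn = buffer_zero_avx512;
        len = 256;   /* the avx512 tail reads 4 * 64 bytes */
    }
#endif
    buffer_accel = fn;
    buffer_accel_size = len;
}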


r~



Re: [PATCH 2/2] util: add util function buffer_zero_avx512()

2020-02-23 Thread Robert Hoo
Thanks, Richard :-)
Sorry for the late reply.
On Thu, 2020-02-13 at 10:20 -0800, Richard Henderson wrote:
> On 2/12/20 11:52 PM, Robert Hoo wrote:
> > And initialize buffer_is_zero() with it when Intel AVX512F is
> > available on the host.
> > 
> > This function utilizes the Intel AVX512 Foundation instructions, which
> > outperform the previous AVX2 instructions.
> 
> Is it not still true that any AVX512 insn will cause the entire cpu package,
> not just the current core, to drop frequency by 20%?
> 
> As far as I know one should only use the 512-bit instructions when you can
> overcome that frequency drop, which seems unlikely in this case.  That said...

I don't think so. AVX512 has been applied in various places.

> > +    if (unlikely(len < 64)) { /* buff less than 512 bits, unlikely */
> > +        return buffer_zero_int(buf, len);
> > +    }
> 
> First, len < 64 has been eliminated already in select_accel_fn.
> Second, len < 256 is not handled properly by the code below...
> 
Right. I'm going to fix this in v2.
> 
> > +    /* Begin with an unaligned head of 64 bytes.  */
> > +    t = _mm512_loadu_si512(buf);
> > +    p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
> > +    e = (__m512i *)(((uintptr_t)buf + len) & -64);
> > +
> > +    /* Loop over 64-byte aligned blocks of 256.  */
> > +    while (p < e) {
> > +        __builtin_prefetch(p);
> > +        if (unlikely(_mm512_test_epi64_mask(t, t))) {
> > +            return false;
> > +        }
> > +        t = p[-4] | p[-3] | p[-2] | p[-1];
> > +        p += 4;
> > +    }
> > +
> > +    t |= _mm512_loadu_si512(buf + len - 4 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 3 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 2 * 64);
> > +    t |= _mm512_loadu_si512(buf + len - 1 * 64);
> 
> ... because this final sequence loads 256 bytes.
> 
> Rather than make a second test vs 256 in buffer_zero_avx512, I wonder if it
> would be better to have select_accel_fn do the job.  Have a global variable
> buffer_accel_size alongside buffer_accel so there's only one branch
> (mis)predict to worry about.
> 
Thanks Richard, very enlightening!
Inspired by your suggestion, I'm thinking of going further: use an immediate
rather than a global variable, which saves one memory(/cache) access.

#ifdef CONFIG_AVX512F_OPT
#define OPTIMIZE_LEN    256
#else
#define OPTIMIZE_LEN    64
#endif
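
(A sketch of how the proposed constant would plug into select_accel_fn(),
reusing the shape discussed earlier in the thread; the threshold becomes a
compile-time immediate instead of a loaded variable:)

static bool select_accel_fn(const void *buf, size_t len)
{
    if (likely(len >= OPTIMIZE_LEN)) {
        return buffer_accel(buf, len);
    }
    return buffer_zero_int(buf, len);
}
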
> FWIW, something that the compiler should do, but doesn't currently, is use
> vpternlogq to perform a 3-input OR.  Something like
> 
> /* 0xfe -> orABC */
> t = _mm512_ternarylogic_epi64(t, p[-4], p[-3], 0xfe);
> t = _mm512_ternarylogic_epi64(t, p[-2], p[-1], 0xfe);
> 
Very enlightening. Yes, it seems the compiler doesn't do this.
I tried using it explicitly; however, it looks like more instructions are
generated, and the unit test shows it performs worse than the conventional
code.
Let me keep the conventional code for the moment; I'll ask around and dig
further outside this patch.

> 
> r~