On Thu, Jul 22, 2021 at 3:14 PM Dr. David Alan Gilbert <dgilb...@redhat.com>
wrote:

> * Richard Henderson (richard.hender...@linaro.org) wrote:
> > On 7/22/21 12:02 AM, Dr. David Alan Gilbert wrote:
> > > Hi Richard,
> > >    I think you were the last person to fiddle with the prefetching
> > > in buffer_zero_avx2 and friends; Joe (cc'd) wondered if explicit
> > > prefetching still makes sense on modern CPUs, whose hardware
> > > generally figures out simple sequential access patterns on its own.
> > >
> > >    What was your thinking on this, and did you actually measure
> > > any improvement?
> >
> > Ah, well, that was 5 years ago so I have no particular memory of this.
> > It wouldn't surprise me if you can't measure any improvement on modern
> > hardware.
> >
> > Do you now measure an improvement with the prefetches gone?
>
> Not tried, it just came from Joe's suggestion that it was generally a
> bad idea these days; I do remember that the behaviour of those functions
> is quite tricky because their performance is VERY data dependent - many
> VMs actually have pages that are quite dirty so you never iterate the
> loop, but then you hit others with big zero pages and you spend your
> entire life in the loop.
>
>
Dave, Richard:
My curiosity got the best of me, so I created a small test program that
uses the buffer_zero_avx2() routine from qemu's bufferiszero.c.

When I run it on an Intel Cascade Lake processor, the cost of calling
"__builtin_prefetch(p)" is in the noise range: it's always "just
slightly" slower.  I doubt the difference could ever be measured in qemu.

Ironically, when I disabled the hardware prefetchers, the program slowed
down by over 33%, and in that configuration the call to
"__builtin_prefetch(p)" actually hurt performance by over 3%.
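
(For anyone who wants to reproduce the prefetcher-off runs: on recent Intel
parts the hardware prefetchers can typically be toggled through MSR 0x1A4
with msr-tools; the low four bits control the L2 and DCU prefetchers,
though the exact bit layout is model dependent.  Something like:

# wrmsr -a 0x1a4 0xf
# wrmsr -a 0x1a4 0x0

to disable and re-enable them, respectively.)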

My results are below (with the hardware prefetchers enabled).  The
program is attached.
Joe

# gcc -mavx buffer_zero_avx.c -O -DDO_PREFETCH ; for i in {1..5}; do ./a.out; done
TSC 356144 Kcycles.
TSC 356714 Kcycles.
TSC 356707 Kcycles.
TSC 356565 Kcycles.
TSC 356853 Kcycles.
# gcc -mavx buffer_zero_avx.c -O ; for i in {1..5}; do ./a.out; done
TSC 355520 Kcycles.
TSC 355961 Kcycles.
TSC 355872 Kcycles.
TSC 355948 Kcycles.
TSC 355918 Kcycles.
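
(For scale: the buffer is 2 GiB = 2,147,483,648 bytes, so ~356,000 Kcycles
works out to roughly 6 bytes scanned per cycle, and the DO_PREFETCH build
is only about 0.2% slower than the plain build -- well within run-to-run
noise.)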

Dave
> >
> > r~
> >
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
>
>
/*
 * Simple program to test if a prefetch helps or hurts buffer_zero_avx2.
 *
 * Compile with either:
 *  gcc -mavx buffer_zero_avx.c -O 
 * or
 *  gcc -mavx buffer_zero_avx.c -O -DDO_PREFETCH 
 */

#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>     /* exit() */
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>  /* u_int32_t / u_int64_t */
#include <sys/mman.h>
#include <string.h>

#define likely(x)       __builtin_expect((x),1)
#define unlikely(x)     __builtin_expect((x),0)

static __inline__ u_int64_t start_clock(void);
static __inline__ u_int64_t stop_clock(void);
static int buffer_zero_avx2(const void *buf, size_t len);

/*
 * Allocate a large chunk of anon memory, touch/zero it,
 * and then time the call to buffer_zero_avx2().
 */
int main() 
{
   size_t mmap_len = 2UL*1024*1024*1024;
   char *ptr = mmap(NULL, mmap_len,
       PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0L);

   if (ptr == MAP_FAILED) {
       perror("mmap");
       exit(1);
   }

   // Touch the pages (they're already cleared)
   memset(ptr,0x0,mmap_len);

   u_int64_t start_rdtsc = start_clock();

   buffer_zero_avx2(ptr, mmap_len);

   u_int64_t stop_rdtsc = stop_clock();
   u_int64_t diff = stop_rdtsc - start_rdtsc;

   printf("TSC %ld Kcycles. \n", diff/1000);

   return 0;
}

static int 
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 32 bytes.  */
    __m256i t = _mm256_loadu_si256(buf);
    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
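    /* Each loop iteration tests the vectors accumulated in t so far, then
       ORs in the next aligned 128-byte block; the final (possibly
       unaligned) 128 bytes are folded in after the loop. */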

    if (likely(p <= e)) {
        /* Loop over 32-byte aligned blocks of 128.  */
        do {
#ifdef DO_PREFETCH
             __builtin_prefetch(p);
#endif
            if (unlikely(!_mm256_testz_si256(t, t))) {
                printf("In unlikely buffer_zero, p:%lx \n",p);
                return 0;
            }
            t = p[-4] | p[-3] | p[-2] | p[-1];
            p += 4;
        } while (p <= e);
    } else {
        t |= _mm256_loadu_si256(buf + 32);
        if (len <= 128) {
            goto last2;
        }
    }

    /* Finish the last block of 128 unaligned.  */
    t |= _mm256_loadu_si256(buf + len - 4 * 32);
    t |= _mm256_loadu_si256(buf + len - 3 * 32);
last2:
    t |= _mm256_loadu_si256(buf + len - 2 * 32);
    t |= _mm256_loadu_si256(buf + len - 1 * 32);
  
    // printf("End of buffer_zero_avx2\n");
    return _mm256_testz_si256(t, t);
}

static __inline__ u_int64_t 
start_clock(void) {
    // See: Intel Doc #324264, "How to Benchmark Code Execution Times on Intel...",
    u_int32_t hi, lo;
    __asm__ __volatile__ (
        "CPUID\n\t"
        "RDTSC\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t": "=r" (hi), "=r" (lo)::
        "%rax", "%rbx", "%rcx", "%rdx");
    return ( (u_int64_t)lo) | ( ((u_int64_t)hi) << 32);
}

static __inline__ u_int64_t 
stop_clock(void) {
    // See: Intel Doc #324264, "How to Benchmark Code Execution Times on Intel...",
    u_int32_t hi, lo;
    __asm__ __volatile__(
        "RDTSCP\n\t"
        "mov %%edx, %0\n\t"
        "mov %%eax, %1\n\t"
        "CPUID\n\t": "=r" (hi), "=r" (lo)::
        "%rax", "%rbx", "%rcx", "%rdx");
    return ( (u_int64_t)lo) | ( ((u_int64_t)hi) << 32);
}

