On 10/11/2015 03:51, Liang Li wrote: > buffer_find_nonzero_offset() is a hot function during live migration. > Now it use SSE2 intructions for optimization. For platform supports > AVX2 instructions, use the AVX2 instructions for optimization can help > to improve the performance about 30% comparing to SSE2. > Zero page check can be faster with this optimization, the test result > shows that for an 8GB RAM idle guest, this patch can help to shorten > the total live migration time about 6%. > > This patch use the ifunc mechanism to select the proper function when > running, for platform supports AVX2, excute the AVX2 instructions, > else, excute the original code. > > Signed-off-by: Liang Li <liang.z...@intel.com> > --- > include/qemu-common.h | 28 +++++++++++++++------ > util/Makefile.objs | 2 ++ > util/avx2.c | 69 > +++++++++++++++++++++++++++++++++++++++++++++++++++ > util/cutils.c | 53 +++++++++++++++++++++++++++++++++++++-- > 4 files changed, 143 insertions(+), 9 deletions(-) > create mode 100644 util/avx2.c > > diff --git a/include/qemu-common.h b/include/qemu-common.h > index 2f74540..9fa7501 100644 > --- a/include/qemu-common.h > +++ b/include/qemu-common.h > @@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp, const char > *prefix, size_t size); > #endif > > #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 > -static inline bool > -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) > -{ > - return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > - * sizeof(VECTYPE)) == 0 > - && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); > -} > +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); > + > size_t buffer_find_nonzero_offset(const void *buf, size_t len); > > +extern bool > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len); > + > +extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len); > + > +extern bool > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len); > 
+ > +extern size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len); > + > +__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function"); > +__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function"); > + > + > +void *can_use_buffer_find_nonzero_offset_ifunc(void) \ > + __asm__("can_use_buffer_find_nonzero_offset"); > + > +void *buffer_find_nonzero_offset_ifunc(void) \ > + __asm__("buffer_find_nonzero_offset"); > /* > * helper to parse debug environment variables > */ > diff --git a/util/Makefile.objs b/util/Makefile.objs > index d7cc399..6aacad7 100644 > --- a/util/Makefile.objs > +++ b/util/Makefile.objs > @@ -1,4 +1,5 @@ > util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o > +util-obj-y += avx2.o > util-obj-$(CONFIG_POSIX) += compatfd.o > util-obj-$(CONFIG_POSIX) += event_notifier-posix.o > util-obj-$(CONFIG_POSIX) += mmap-alloc.o > @@ -29,3 +30,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o > qemu-coroutine-io.o > util-obj-y += qemu-coroutine-sleep.o > util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o > util-obj-y += buffer.o > +avx2.o-cflags := $(AVX2_CFLAGS) > diff --git a/util/avx2.c b/util/avx2.c > new file mode 100644 > index 0000000..0e6915a > --- /dev/null > +++ b/util/avx2.c > @@ -0,0 +1,69 @@ > +#include "qemu-common.h" > + > +#ifdef __AVX2__ > +#include <immintrin.h> > +#define AVX2_VECTYPE __m256i > +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) > +#define AVX2_ALL_EQ(v1, v2) \ > + (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF) > +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) > + > +inline bool > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) > +{ > + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > + * sizeof(AVX2_VECTYPE)) == 0 > + && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); > +} > + > +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) > +{ > + const AVX2_VECTYPE *p = buf; > + const AVX2_VECTYPE zero = 
(AVX2_VECTYPE){0}; > + size_t i; > + > + assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); > + > + if (!len) { > + return 0; > + } > + > + for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { > + if (!AVX2_ALL_EQ(p[i], zero)) { > + return i * sizeof(AVX2_VECTYPE); > + } > + } > + > + for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; > + i < len / sizeof(AVX2_VECTYPE); > + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { > + AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); > + AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); > + AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); > + AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); > + AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); > + AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); > + if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { > + break; > + } > + } > + > + return i * sizeof(AVX2_VECTYPE); > +} > + > +#else > +/* use the original functions if avx2 is not enabled when buiding*/ > + > +inline bool > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) > +{ > + return can_use_buffer_find_nonzero_offset_inner(buf, len); > +} > + > +inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) > +{ > + return buffer_find_nonzero_offset_inner(buf, len); > +} > + > +#endif > + > diff --git a/util/cutils.c b/util/cutils.c > index cfeb848..cd478ce 100644 > --- a/util/cutils.c > +++ b/util/cutils.c > @@ -26,6 +26,7 @@ > #include <math.h> > #include <limits.h> > #include <errno.h> > +#include <cpuid.h> > > #include "qemu/sockets.h" > #include "qemu/iov.h" > @@ -161,6 +162,54 @@ int qemu_fdatasync(int fd) > #endif > } > > +/* old compiler maynot define bit_AVX2 */ > +#ifndef bit_AVX2 > +#define bit_AVX2 (1 << 5) > +#endif > + > +static inline bool avx2_support(void) > +{ > + int a, b, c, d; > + > + if (__get_cpuid_max(0, NULL) < 7) { > + printf("max cpuid < 7\n"); > + return false; > + } > + > + __cpuid_count(7, 0, a, b, c, d); > + printf("b = %x\n", b); 
> + return b & bit_AVX2; > +} > + > +void *buffer_find_nonzero_offset_ifunc(void) > +{ > + printf("deciding %s\n", __func__); > + > + typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ? > + buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner; > + > + return func; > +} > + > +void *can_use_buffer_find_nonzero_offset_ifunc(void) > +{ > + printf("deciding %s\n", __func__); > + > + typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ? > + can_use_buffer_find_nonzero_offset_avx2 : > + can_use_buffer_find_nonzero_offset_inner; > + > + return func; > +} > + > +inline bool > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) > +{ > + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > + * sizeof(VECTYPE)) == 0 > + && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); > +} > + > /* > * Searches for an area with non-zero content in a buffer > * > @@ -181,13 +230,13 @@ int qemu_fdatasync(int fd) > * If the buffer is all zero the return value is equal to len. > */ > > -size_t buffer_find_nonzero_offset(const void *buf, size_t len) > +size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) > { > const VECTYPE *p = buf; > const VECTYPE zero = (VECTYPE){0}; > size_t i; > > - assert(can_use_buffer_find_nonzero_offset(buf, len)); > + assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); > > if (!len) { > return 0; >
The main issue here is that you are not testing whether the compiler supports gnu_indirect_function.

I suggest that you start by moving the functions to util/buffer-zero.c. Then the structure should be something like:

    #ifdef CONFIG_HAVE_AVX2
    #include <immintrin.h>
    #endif

    ... define buffer_find_nonzero_offset_inner ...
    ... define can_use_buffer_find_nonzero_offset_inner ...

    #if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2
    ... define buffer_find_nonzero_offset_avx2 ...
    ... define can_use_buffer_find_nonzero_offset_avx2 ...
    ... define the indirect functions ...
    #else
    ... define buffer_find_nonzero_offset that just calls buffer_find_nonzero_offset_inner ...
    ... define can_use_buffer_find_nonzero_offset that just calls can_use_buffer_find_nonzero_offset_inner ...
    #endif

Thanks,

Paolo