Re: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer
ling xu wrote: > This commit updates code of avx512 support for xbzrle_encode_buffer > function to accelerate xbzrle encoding speed. Runtime check of avx512 > support and benchmark for this feature are added. Compared with C > version of xbzrle_encode_buffer function, avx512 version can achieve > 50%-70% performance improvement on benchmarking. In addition, if dirty > data is randomly located in 4K page, the avx512 version can achieve > almost 140% performance gain. > > Signed-off-by: ling xu > Co-authored-by: Zhou Zhao > Co-authored-by: Jun Jin Reviewed-by: Juan Quintela queued.
RE: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer
Hi all, this is a "ping" email. It seems that the newest version of our patch has been overlooked, so I am pinging this patchset again. All comments and suggestions have been addressed in this v6 patch, and the link to the patch is below: https://lore.kernel.org/qemu-devel/20220826095719.2887535-2-ling1...@intel.com/ Looking forward to hearing from you! Best Regards Ling -----Original Message----- From: Xu, Ling1 Sent: Friday, August 26, 2022 5:57 PM To: qemu-devel@nongnu.org Cc: quint...@redhat.com; dgilb...@redhat.com; Xu, Ling1 ; Zhao, Zhou ; Jin, Jun I Subject: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer This commit updates code of avx512 support for xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Runtime check of avx512 support and benchmark for this feature are added. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve 50%-70% performance improvement on benchmarking. In addition, if dirty data is randomly located in 4K page, the avx512 version can achieve almost 140% performance gain. 
Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 ++ meson_options.txt | 2 + migration/ram.c| 34 +++-- migration/xbzrle.c | 124 + migration/xbzrle.h | 4 ++ 5 files changed, 177 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 20fddbd707..5d4b82d7f3 100644 --- a/meson.build +++ b/meson.build @@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot +enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i *x = a; + __m512i res= _mm512_abs_epi8(*x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } '''), + error_message: 'AVX512BW not available').allowed()) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') +option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..ff4c15c9c3 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,34 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +int 
(*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, + uint8_t *, int) = xbzrle_encode_buffer; #if +defined(CONFIG_AVX512BW_OPT) #include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) { +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; +} +} +} +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +830,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ -encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, - TARGET_PAGE_SIZE, XBZRLE.encoded_buf, - TARGET_PAGE_SIZE); +encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, +
RE: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer
Hi, All, This is a "ping" email~. It seems that my patch has been ignored. So I "ping" this patchset. Link for the patch: https://lore.kernel.org/qemu-devel/20220826095719.2887535-2-ling1...@intel.com/ Best Regards Ling -Original Message- From: Xu, Ling1 Sent: Friday, August 26, 2022 5:57 PM To: qemu-devel@nongnu.org Cc: quint...@redhat.com; dgilb...@redhat.com; Xu, Ling1 ; Zhao, Zhou ; Jin, Jun I Subject: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer This commit updates code of avx512 support for xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Runtime check of avx512 support and benchmark for this feature are added. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve 50%-70% performance improvement on benchmarking. In addition, if dirty data is randomly located in 4K page, the avx512 version can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 ++ meson_options.txt | 2 + migration/ram.c| 34 +++-- migration/xbzrle.c | 124 + migration/xbzrle.h | 4 ++ 5 files changed, 177 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 20fddbd707..5d4b82d7f3 100644 --- a/meson.build +++ b/meson.build @@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot +enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i *x = a; + __m512i res= _mm512_abs_epi8(*x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } '''), + error_message: 'AVX512BW not available').allowed()) + have_pvrdma = 
get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') +option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..ff4c15c9c3 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,34 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, + uint8_t *, int) = xbzrle_encode_buffer; #if +defined(CONFIG_AVX512BW_OPT) #include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) { +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. 
*/ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; +} +} +} +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +830,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ -encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, - TARGET_PAGE_SIZE, XBZRLE.encoded_buf, - TARGET_PAGE_SIZE); +encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, +TARGET_PAGE_SIZE, XBZRLE.encoded_buf, +TARGET_PAGE_SIZE); /* * Update the cache contents, so that it corresponds