Re: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer

2022-11-14 Thread Juan Quintela
ling xu  wrote:
> This commit updates the AVX512 support code for the xbzrle_encode_buffer
> function to accelerate XBZRLE encoding. A runtime check for AVX512
> support and a benchmark for this feature are added. Compared with the C
> version of the xbzrle_encode_buffer function, the AVX512 version achieves
> a 50%-70% performance improvement in benchmarking. In addition, if dirty
> data is randomly located within a 4K page, the AVX512 version can achieve
> almost a 140% performance gain.
>
> Signed-off-by: ling xu 
> Co-authored-by: Zhou Zhao 
> Co-authored-by: Jun Jin 

Reviewed-by: Juan Quintela 

queued.




RE: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer

2022-10-26 Thread Xu, Ling1
Hi all,
 This is a "ping" email.
 It seems that the newest version of our patch has been overlooked, so I am
pinging this patchset again.
 All review comments and suggestions have been addressed in this v6
patch, and the link to the patch is below:
 
https://lore.kernel.org/qemu-devel/20220826095719.2887535-2-ling1...@intel.com/
 Looking forward to hearing from you!

Best Regards
Ling

-----Original Message-----
From: Xu, Ling1  
Sent: Friday, August 26, 2022 5:57 PM
To: qemu-devel@nongnu.org
Cc: quint...@redhat.com; dgilb...@redhat.com; Xu, Ling1 ; 
Zhao, Zhou ; Jin, Jun I 
Subject: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer

This commit updates the AVX512 support code for the xbzrle_encode_buffer
function to accelerate XBZRLE encoding. A runtime check for AVX512 support
and a benchmark for this feature are added. Compared with the C version of
the xbzrle_encode_buffer function, the AVX512 version achieves a 50%-70%
performance improvement in benchmarking. In addition, if dirty data is
randomly located within a 4K page, the AVX512 version can achieve almost a
140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build|  16 ++
 meson_options.txt  |   2 +
 migration/ram.c|  34 +++--
 migration/xbzrle.c | 124 +
 migration/xbzrle.h |   4 ++
 5 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index 20fddbd707..5d4b82d7f3 100644
--- a/meson.build
+++ b/meson.build
@@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot 
+enable AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i *x = a;
+  __m512i res= _mm512_abs_epi8(*x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }  '''), 
+ error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 
100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')  option('avx512f', type: 'feature', 
value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..ff4c15c9c3 
100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,34 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+ uint8_t *, int) = xbzrle_encode_buffer; #if 
+defined(CONFIG_AVX512BW_OPT) #include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void) {
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+}
+}
+}
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page @@ -802,9 +830,9 @@ static 
int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
 /* XBZRLE encoding (if there is no overflow) */
-encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
-   TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
-   TARGET_PAGE_SIZE);
+encoded_len = xbzrle_encode_buffer_func(prev_cached_page, 
XBZRLE.current_buf,
+

RE: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer

2022-09-19 Thread Xu, Ling1
Hi all,
 This is a "ping" email.
 It seems that my patch has been overlooked, so I am pinging this patchset.
 Link to the patch:
https://lore.kernel.org/qemu-devel/20220826095719.2887535-2-ling1...@intel.com/

Best Regards
Ling

-----Original Message-----
From: Xu, Ling1  
Sent: Friday, August 26, 2022 5:57 PM
To: qemu-devel@nongnu.org
Cc: quint...@redhat.com; dgilb...@redhat.com; Xu, Ling1 ; 
Zhao, Zhou ; Jin, Jun I 
Subject: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer

This commit updates the AVX512 support code for the xbzrle_encode_buffer
function to accelerate XBZRLE encoding. A runtime check for AVX512 support
and a benchmark for this feature are added. Compared with the C version of
the xbzrle_encode_buffer function, the AVX512 version achieves a 50%-70%
performance improvement in benchmarking. In addition, if dirty data is
randomly located within a 4K page, the AVX512 version can achieve almost a
140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build|  16 ++
 meson_options.txt  |   2 +
 migration/ram.c|  34 +++--
 migration/xbzrle.c | 124 +
 migration/xbzrle.h |   4 ++
 5 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index 20fddbd707..5d4b82d7f3 100644
--- a/meson.build
+++ b/meson.build
@@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot 
+enable AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i *x = a;
+  __m512i res= _mm512_abs_epi8(*x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }  '''), 
+ error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 
100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')  option('avx512f', type: 'feature', 
value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..ff4c15c9c3 
100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,34 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+ uint8_t *, int) = xbzrle_encode_buffer; #if 
+defined(CONFIG_AVX512BW_OPT) #include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void) {
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+}
+}
+}
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page @@ -802,9 +830,9 @@ static 
int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
 /* XBZRLE encoding (if there is no overflow) */
-encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
-   TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
-   TARGET_PAGE_SIZE);
+encoded_len = xbzrle_encode_buffer_func(prev_cached_page, 
XBZRLE.current_buf,
+TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+TARGET_PAGE_SIZE);
 
 /*
  * Update the cache contents, so that it corresponds