RE: [PATCH] migration/multifd: Fix build for qatzip

2024-09-10 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Wednesday, September 11, 2024 5:05 AM
> To: qemu-devel@nongnu.org
> Cc: pet...@redhat.com; Fabiano Rosas ; Prasad Pandit
> ; Wang, Yichen ; Bryan
> Zhang ; Hao Xiang ; Liu,
> Yuan1 
> Subject: [PATCH] migration/multifd: Fix build for qatzip
> 
> The qatzip series was based on an older commit; it applied cleanly even
> though it conflicted with changes that landed in the meantime.  Neither CI
> nor I noticed that the build would break, because the qatzip code is
> skipped by default when the qatzip library is missing.
> 
> Fix the build issues.  No need to copy stable, as the series only just
> landed in 9.2.
> 
> Cc: Yichen Wang 
> Cc: Bryan Zhang 
> Cc: Hao Xiang 
> Cc: Yuan Liu 
> Fixes: 80484f9459 ("migration: Introduce 'qatzip' compression method")
> Signed-off-by: Peter Xu 
> ---
> 
> Qatzip developers: would you help me to double check whether this is the
> right fix?

Looks good to me, thanks
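For anyone hitting the same break: the renames in this fix track the multifd
payload refactoring that landed between the qatzip series' base commit and
master. A rough sketch of the accessors involved (the MultiFDSendData layout
below is paraphrased from that refactoring, not quoted verbatim):

    /* per-channel page state moved into a tagged payload union */
    typedef struct {
        MultiFDPayloadType type;
        union {
            MultiFDPages_t ram;
        } u;
    } MultiFDSendData;

    /* old (pre-refactor)    ->  new (this fix)
     * p->pages              ->  &p->data->u.ram
     * p->page_size          ->  multifd_ram_page_size()
     */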

>  migration/multifd-qatzip.c | 18 +-
>  1 file changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c
> index 3c787ed879..7b68397625 100644
> --- a/migration/multifd-qatzip.c
> +++ b/migration/multifd-qatzip.c
> @@ -160,7 +160,8 @@ static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp)
>   */
>  static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp)
>  {
> -    MultiFDPages_t *pages = p->pages;
> +    uint32_t page_size = multifd_ram_page_size();
> +    MultiFDPages_t *pages = &p->data->u.ram;
>      QatzipData *q = p->compress_data;
>      int ret;
>      unsigned int in_len, out_len;
> @@ -179,12 +180,12 @@ static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp)
>       * implementation.
>       */
>      for (int i = 0; i < pages->normal_num; i++) {
> -        memcpy(q->in_buf + (i * p->page_size),
> +        memcpy(q->in_buf + (i * page_size),
>                 pages->block->host + pages->offset[i],
> -               p->page_size);
> +               page_size);
>      }
> 
> -    in_len = pages->normal_num * p->page_size;
> +    in_len = pages->normal_num * page_size;
>      if (in_len > q->in_len) {
>          error_setg(errp, "multifd %u: unexpectedly large input", p->id);
>          return -1;
> @@ -197,7 +198,7 @@ static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp)
>                     p->id, ret);
>          return -1;
>      }
> -    if (in_len != pages->normal_num * p->page_size) {
> +    if (in_len != pages->normal_num * page_size) {
>          error_setg(errp, "multifd %u: QATzip failed to compress all input",
>                     p->id);
>          return -1;
> @@ -329,7 +330,8 @@ static int qatzip_recv(MultiFDRecvParams *p, Error **errp)
>      int ret;
>      unsigned int in_len, out_len;
>      uint32_t in_size = p->next_packet_size;
> -    uint32_t expected_size = p->normal_num * p->page_size;
> +    uint32_t page_size = multifd_ram_page_size();
> +    uint32_t expected_size = p->normal_num * page_size;
>      uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
> 
>      if (in_size > q->in_len) {
> @@ -370,9 +372,7 @@ static int qatzip_recv(MultiFDRecvParams *p, Error **errp)
> 
>      /* Copy each page to its appropriate location. */
>      for (int i = 0; i < p->normal_num; i++) {
> -        memcpy(p->host + p->normal[i],
> -               q->out_buf + p->page_size * i,
> -               p->page_size);
> +        memcpy(p->host + p->normal[i], q->out_buf + page_size * i,
> +               page_size);
>      }
>      return 0;
>  }
> --
> 2.45.0




RE: [PATCH v6 5/5] tests/migration: Add integration test for 'qatzip' compression method

2024-07-16 Thread Liu, Yuan1
> -Original Message-
> From: Yichen Wang 
> Sent: Tuesday, July 16, 2024 6:13 AM
> To: Peter Xu ; Fabiano Rosas ; Paolo
> Bonzini ; Daniel P. Berrangé ;
> Eduardo Habkost ; Marc-André Lureau
> ; Thomas Huth ; Philippe
> Mathieu-Daudé ; Eric Blake ; Markus
> Armbruster ; Laurent Vivier ; qemu-
> de...@nongnu.org
> Cc: Hao Xiang ; Liu, Yuan1 ;
> Zou, Nanhai ; Ho-Ren (Jack) Chuang
> ; Wang, Yichen ;
> Bryan Zhang 
> Subject: [PATCH v6 5/5] tests/migration: Add integration test for 'qatzip'
> compression method
> 
> From: Bryan Zhang 
> 
> Adds an integration test for 'qatzip'.
> 
> Signed-off-by: Bryan Zhang 
> Signed-off-by: Hao Xiang 
> Signed-off-by: Yichen Wang 
> Reviewed-by: Fabiano Rosas 
> ---
>  tests/qtest/migration-test.c | 31 +++
>  1 file changed, 31 insertions(+)
> 
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index 70b606b888..98f60d58a7 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -32,6 +32,10 @@
>  # endif /* CONFIG_TASN1 */
>  #endif /* CONFIG_GNUTLS */
> 
> +#ifdef CONFIG_QATZIP
> +#include <qatzip.h>
> +#endif /* CONFIG_QATZIP */

It looks like the <qatzip.h> include is not needed here (the test code does
not call any QATzip APIs directly); maybe it can be removed.

>  /* For dirty ring test; so far only x86_64 is supported */
>  #if defined(__linux__) && defined(HOST_X86_64)
>  #include "linux/kvm.h"
> @@ -2992,6 +2996,18 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from,
>  }
>  #endif /* CONFIG_ZSTD */
> 
> +#ifdef CONFIG_QATZIP
> +static void *
> +test_migrate_precopy_tcp_multifd_qatzip_start(QTestState *from,
> +                                              QTestState *to)
> +{
> +    migrate_set_parameter_int(from, "multifd-qatzip-level", 2);
> +    migrate_set_parameter_int(to, "multifd-qatzip-level", 2);
> +
> +    return test_migrate_precopy_tcp_multifd_start_common(from, to,
> +                                                         "qatzip");
> +}
> +#endif
> +
>  #ifdef CONFIG_QPL
>  static void *
>  test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from,
> @@ -3089,6 +3105,17 @@ static void test_multifd_tcp_zstd(void)
>  }
>  #endif
> 
> +#ifdef CONFIG_QATZIP
> +static void test_multifd_tcp_qatzip(void)
> +{
> +    MigrateCommon args = {
> +        .listen_uri = "defer",
> +        .start_hook = test_migrate_precopy_tcp_multifd_qatzip_start,
> +    };
> +    test_precopy_common(&args);
> +}
> +#endif
> +
>  #ifdef CONFIG_QPL
>  static void test_multifd_tcp_qpl(void)
>  {
> @@ -3992,6 +4019,10 @@ int main(int argc, char **argv)
>      migration_test_add("/migration/multifd/tcp/plain/zstd",
>                         test_multifd_tcp_zstd);
>  #endif
> +#ifdef CONFIG_QATZIP
> +    migration_test_add("/migration/multifd/tcp/plain/qatzip",
> +                       test_multifd_tcp_qatzip);
> +#endif
>  #ifdef CONFIG_QPL
>      migration_test_add("/migration/multifd/tcp/plain/qpl",
>                         test_multifd_tcp_qpl);
> --
> Yichen Wang




RE: [PATCH v6 4/5] migration: Introduce 'qatzip' compression method

2024-07-16 Thread Liu, Yuan1
> -Original Message-
> From: Yichen Wang 
> Sent: Tuesday, July 16, 2024 6:13 AM
> To: Peter Xu ; Fabiano Rosas ; Paolo
> Bonzini ; Daniel P. Berrangé ;
> Eduardo Habkost ; Marc-André Lureau
> ; Thomas Huth ; Philippe
> Mathieu-Daudé ; Eric Blake ; Markus
> Armbruster ; Laurent Vivier ; qemu-
> de...@nongnu.org
> Cc: Hao Xiang ; Liu, Yuan1 ;
> Zou, Nanhai ; Ho-Ren (Jack) Chuang
> ; Wang, Yichen ;
> Bryan Zhang 
> Subject: [PATCH v6 4/5] migration: Introduce 'qatzip' compression method
> 
> From: Bryan Zhang 
> 
> Adds support for 'qatzip' as an option for the multifd compression
> method parameter, and implements using QAT for 'qatzip' compression and
> decompression.
> 
> Signed-off-by: Bryan Zhang 
> Signed-off-by: Hao Xiang 
> Signed-off-by: Yichen Wang 
> ---
>  hw/core/qdev-properties-system.c |   2 +-
>  migration/meson.build|   1 +
>  migration/multifd-qatzip.c   | 382 +++
>  migration/multifd.h  |   5 +-
>  qapi/migration.json  |   3 +
>  tests/qtest/meson.build  |   4 +
>  6 files changed, 394 insertions(+), 3 deletions(-)
>  create mode 100644 migration/multifd-qatzip.c
> 
> diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c
> index f13350b4fb..a56fbf728d 100644
> --- a/hw/core/qdev-properties-system.c
> +++ b/hw/core/qdev-properties-system.c
> @@ -659,7 +659,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = {
>  const PropertyInfo qdev_prop_multifd_compression = {
>      .name = "MultiFDCompression",
>      .description = "multifd_compression values, "
> -                   "none/zlib/zstd/qpl/uadk",
> +                   "none/zlib/zstd/qpl/uadk/qatzip",
>      .enum_table = &MultiFDCompression_lookup,
>      .get = qdev_propinfo_get_enum,
>      .set = qdev_propinfo_set_enum,
> diff --git a/migration/meson.build b/migration/meson.build
> index 5ce2acb41e..c9454c26ae 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -41,6 +41,7 @@ system_ss.add(when: rdma, if_true: files('rdma.c'))
>  system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
>  system_ss.add(when: qpl, if_true: files('multifd-qpl.c'))
>  system_ss.add(when: uadk, if_true: files('multifd-uadk.c'))
> +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c'))
> 
>  specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
>  if_true: files('ram.c',
> diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c
> new file mode 100644
> index 00..b74cda503a
> --- /dev/null
> +++ b/migration/multifd-qatzip.c
> @@ -0,0 +1,382 @@
> +/*
> + * Multifd QATzip compression implementation
> + *
> + * Copyright (c) Bytedance
> + *
> + * Authors:
> + *  Bryan Zhang 
> + *  Hao Xiang 
> + *  Yichen Wang 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "exec/ramblock.h"
> +#include "qapi/error.h"
> +#include "qemu/error-report.h"
> +#include "qapi/qapi-types-migration.h"
> +#include "options.h"
> +#include "multifd.h"
> +#include <qatzip.h>
> +
> +typedef struct {
> +    /*
> +     * Unique session for use with QATzip API
> +     */
> +    QzSession_T sess;
> +
> +    /*
> +     * For compression: Buffer for pages to compress
> +     * For decompression: Buffer for data to decompress
> +     */
> +    uint8_t *in_buf;
> +    uint32_t in_len;
> +
> +    /*
> +     * For compression: Output buffer of compressed data
> +     * For decompression: Output buffer of decompressed data
> +     */
> +    uint8_t *out_buf;
> +    uint32_t out_len;
> +} QatzipData;
> +
> +/**
> + * qatzip_send_setup: Set up QATzip session and private buffers.
> + *
> + * @param p    Multifd channel params
> + * @param errp Pointer to error, which will be set in case of error
> + * @return 0 on success, -1 on error (and *errp will be set)
> + */
> +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp)
> +{
> +    QatzipData *q;
> +    QzSessionParamsDeflate_T params;
> +    const char *err_msg;
> +    int ret;
> +
> +    q = g_new0(QatzipData, 1);
> +    p->compress_data = q;
> +    /* We need one extra place for the packet header */
> +    p->iov = g_new0(struct iovec, 2);
> +
> +    /*
> +     * Prefer without sw_fallback because of bad performance with

RE: [PATCH v6 2/5] meson: Introduce 'qatzip' feature to the build system

2024-07-15 Thread Liu, Yuan1
> -Original Message-
> From: Yichen Wang 
> Sent: Tuesday, July 16, 2024 6:13 AM
> To: Peter Xu ; Fabiano Rosas ; Paolo
> Bonzini ; Daniel P. Berrangé ;
> Eduardo Habkost ; Marc-André Lureau
> ; Thomas Huth ; Philippe
> Mathieu-Daudé ; Eric Blake ; Markus
> Armbruster ; Laurent Vivier ; qemu-
> de...@nongnu.org
> Cc: Hao Xiang ; Liu, Yuan1 ;
> Zou, Nanhai ; Ho-Ren (Jack) Chuang
> ; Wang, Yichen ;
> Bryan Zhang 
> Subject: [PATCH v6 2/5] meson: Introduce 'qatzip' feature to the build
> system
> 
> From: Bryan Zhang 
> 
> Add a 'qatzip' feature, which is automatically disabled, and which
> depends on the QATzip library if enabled.
> 
> Signed-off-by: Bryan Zhang 
> Signed-off-by: Hao Xiang 
> Signed-off-by: Yichen Wang 
> ---
>  meson.build   | 10 ++
>  meson_options.txt |  2 ++
>  scripts/meson-buildoptions.sh |  3 +++
>  3 files changed, 15 insertions(+)
> 
> diff --git a/meson.build b/meson.build
> index 6a93da48e1..ea977c6cbf 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -1244,6 +1244,14 @@ if not get_option('uadk').auto() or have_system
>   uadk = declare_dependency(dependencies: [libwd, libwd_comp])
>  endif
>  endif
> +
> +qatzip = not_found
> +if get_option('qatzip').enabled()
> +  qatzip = dependency('qatzip', version: '>=1.1.2',
> +  required: get_option('qatzip'),
> +  method: 'pkg-config')
> +endif
> +

How about changing it to the following code:

if not get_option('qatzip').auto() or have_system
  qatzip = dependency('qatzip', version: '>=1.1.2',
                      required: get_option('qatzip'),
                      method: 'pkg-config')
endif

This means that on all QEMU system-emulation targets the dependency is
probed automatically, while --enable-qatzip and --disable-qatzip still
explicitly determine whether qatzip is required.

Alternatively, your previous code can simply drop the
`if get_option('qatzip').enabled()` guard and it will also work, since
`required: get_option('qatzip')` already handles the disabled case.

>  virgl = not_found
> 
>  have_vhost_user_gpu = have_tools and host_os == 'linux' and
> pixman.found()
> @@ -2378,6 +2386,7 @@ config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id)
>  config_host_data.set('CONFIG_ZSTD', zstd.found())
>  config_host_data.set('CONFIG_QPL', qpl.found())
>  config_host_data.set('CONFIG_UADK', uadk.found())
> +config_host_data.set('CONFIG_QATZIP', qatzip.found())
>  config_host_data.set('CONFIG_FUSE', fuse.found())
>  config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found())
>  config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found())
> @@ -4484,6 +4493,7 @@ summary_info += {'lzfse support': liblzfse}
>  summary_info += {'zstd support':  zstd}
>  summary_info += {'Query Processing Library support': qpl}
>  summary_info += {'UADK Library support': uadk}
> +summary_info += {'qatzip support':qatzip}
>  summary_info += {'NUMA host support': numa}
>  summary_info += {'capstone':  capstone}
>  summary_info += {'libpmem support':   libpmem}
> diff --git a/meson_options.txt b/meson_options.txt
> index 0269fa0f16..35a69f6697 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -261,6 +261,8 @@ option('qpl', type : 'feature', value : 'auto',
> description: 'Query Processing Library support')
>  option('uadk', type : 'feature', value : 'auto',
> description: 'UADK Library support')
> +option('qatzip', type: 'feature', value: 'disabled',

If you agree with the above changes, set the qatzip value to 'auto':
option('qatzip', type: 'feature', value: 'auto',

> +   description: 'QATzip compression support')
>  option('fuse', type: 'feature', value: 'auto',
> description: 'FUSE block device export')
>  option('fuse_lseek', type : 'feature', value : 'auto',
> diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
> index cfadb5ea86..1ce467e9cc 100644
> --- a/scripts/meson-buildoptions.sh
> +++ b/scripts/meson-buildoptions.sh
> @@ -163,6 +163,7 @@ meson_options_help() {
>    printf "%s\n" '  pixman          pixman support'
>    printf "%s\n" '  plugins         TCG plugins via shared library loading'
>    printf "%s\n" '  png             PNG support with libpng'
> +  printf "%s\n" '  qatzip          QATzip compression support'
>    printf "%s\n" '  qcow1           qcow1 image format support'
>    printf "%s\n" '  qed             qed image format support'
>    printf "%s\n" '  qga-vss         build QGA VSS support (broken with MinGW)'
> @@ -427,6 +428,8 @@ _meson_option_parse() {
>  --enable-png) printf "%s" -Dpng=enabled ;;
>  --disable-png) printf "%s" -Dpng=disabled ;;
>  --prefix=*) quote_sh "-Dprefix=$2" ;;
> +--enable-qatzip) printf "%s" -Dqatzip=enabled ;;
> +--disable-qatzip) printf "%s" -Dqatzip=disabled ;;
>  --enable-qcow1) printf "%s" -Dqcow1=enabled ;;
>  --disable-qcow1) printf "%s" -Dqcow1=disabled ;;
>  --enable-qed) printf "%s" -Dqed=enabled ;;
> --
> Yichen Wang




RE: [PATCH v6 1/5] docs/migration: add qatzip compression feature

2024-07-15 Thread Liu, Yuan1
> -Original Message-
> From: Yichen Wang 
> Sent: Tuesday, July 16, 2024 6:13 AM
> To: Peter Xu ; Fabiano Rosas ; Paolo
> Bonzini ; Daniel P. Berrangé ;
> Eduardo Habkost ; Marc-André Lureau
> ; Thomas Huth ; Philippe
> Mathieu-Daudé ; Eric Blake ; Markus
> Armbruster ; Laurent Vivier ; qemu-
> de...@nongnu.org
> Cc: Hao Xiang ; Liu, Yuan1 ;
> Zou, Nanhai ; Ho-Ren (Jack) Chuang
> ; Wang, Yichen 
> Subject: [PATCH v6 1/5] docs/migration: add qatzip compression feature
> 
> From: Yuan Liu 
> 
> add Intel QATzip compression method introduction
> 
> Signed-off-by: Yuan Liu 
> Reviewed-by: Nanhai Zou 
> Reviewed-by: Peter Xu 
> Reviewed-by: Yichen Wang 
> ---
>  docs/devel/migration/features.rst   |   1 +
>  docs/devel/migration/qatzip-compression.rst | 251 
>  2 files changed, 252 insertions(+)
>  create mode 100644 docs/devel/migration/qatzip-compression.rst
> 
> diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst
> index 58f8fd9e16..8f431d52f9 100644
> --- a/docs/devel/migration/features.rst
> +++ b/docs/devel/migration/features.rst
> @@ -14,3 +14,4 @@ Migration has plenty of features to support different use cases.
>     CPR
>     qpl-compression
>     uadk-compression
> +   qatzip-compression
> diff --git a/docs/devel/migration/qatzip-compression.rst b/docs/devel/migration/qatzip-compression.rst
> new file mode 100644
> index 00..72fa3e2826
> --- /dev/null
> +++ b/docs/devel/migration/qatzip-compression.rst
> @@ -0,0 +1,251 @@
> +==================
> +QATzip Compression
> +==================
> +
> +In scenarios with limited network bandwidth, the ``QATzip`` solution can
> +help users save a lot of host CPU resources by accelerating compression
> +and decompression through the Intel QuickAssist Technology (``QAT``)
> +hardware.

Hi Yichen,

Thanks for adding the "Performance Testing with QATzip" part. I wonder if we
can remove that part and directly add the following content instead.

Here, we use a typical limited-bandwidth example to illustrate the advantages
of QATzip. Users interested in qatzip still need to verify the performance
on their own setup.

+The following test was conducted using 8 multifd channels and 10Gbps network
+bandwidth. The results show that, compared to zstd, ``QATzip`` significantly
+saves CPU resources on the sender and reduces migration time. Compared to the
+uncompressed solution, ``QATzip`` greatly improves the dirty page processing
+capability, indicated by the Pages per Second metric, and also reduces the
+total migration time.
+
+::
+
+   VM Configuration: 16 vCPU and 64G memory
+   VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data.
+   QAT Devices: 4
+   |-----------|--------|---------|----------|----------|------|------|
+   |8 Channels |Total   |down     |throughput|pages per |send  |recv  |
+   |           |time(ms)|time(ms) |(mbps)    |second    |cpu % |cpu % |
+   |-----------|--------|---------|----------|----------|------|------|
+   |qatzip     |   16630|       28|     10467|   2940235|   160|   360|
+   |-----------|--------|---------|----------|----------|------|------|
+   |zstd       |   20165|       24|      8579|   2391465|   810|   340|
+   |-----------|--------|---------|----------|----------|------|------|
+   |none       |   46063|       40|     10848|    330240|    45|    85|
+   |-----------|--------|---------|----------|----------|------|------|


> +``QATzip`` is a user space library which builds on top of the Intel
> +QuickAssist Technology user space library, to provide extended accelerated
> +compression and decompression services.
> +
> +For more ``QATzip`` introduction, please refer to `QATzip Introduction
> +<https://github.com/intel/QATzip?tab=readme-ov-file#introductionl>`_
> +
> +QATzip Compression Framework
> +----------------------------
> +
> +::
> +
> +  +----------------+
> +  | MultiFd Thread |
> +  +-------+--------+
> +          |
> +          | compress/decompress
> +  +-------+--------+
> +  | QATzip library |
> +  +-------+--------+
> +          |
> +  +-------+--------+
> +  |  QAT library   |
> +  +-------+--------+
> +          | user space
> +  --------+---------
> +          | kernel space
> +   +------+-------+
> +   |  QAT  Driver |
> +   +------+-------+
> +          |
> +   +------+-------+
> +   | QAT Devices  |
> +   +--------------+
> +
> +
> +QATzip Installation
> +---
> +
> +The ``QATzip`` installation package has been integrated into some Linux
> +distributions and can be installed directly. For example, the Ubuntu
> Server
> +24.04 LTS s

RE: [PATCH v5 00/13] WIP: Use Intel DSA accelerator to offload zero page checking in multifd live migration.

2024-07-15 Thread Liu, Yuan1
> -Original Message-
> From: Michael S. Tsirkin 
> Sent: Tuesday, July 16, 2024 12:24 AM
> To: Liu, Yuan1 
> Cc: Wang, Yichen ; Paolo Bonzini
> ; Marc-André Lureau ;
> Daniel P. Berrangé ; Thomas Huth ;
> Philippe Mathieu-Daudé ; Peter Xu ;
> Fabiano Rosas ; Eric Blake ; Markus
> Armbruster ; Cornelia Huck ; qemu-
> de...@nongnu.org; Hao Xiang ; Kumar, Shivam
> ; Ho-Ren (Jack) Chuang
> 
> Subject: Re: [PATCH v5 00/13] WIP: Use Intel DSA accelerator to offload
> zero page checking in multifd live migration.
> 
> On Mon, Jul 15, 2024 at 03:57:42PM +, Liu, Yuan1 wrote:
> > > > > > > > > that is 23% total CPU usage savings.
> > > > > > > >
> > > > > > > > Here the DSA was mostly idle.
> > > > > > > >
> > > > > > > > Sounds good but a question: what if several qemu instances are
> > > > > > > > migrated in parallel?
> > > > > > > >
> > > > > > > > Some accelerators tend to basically stall if several tasks
> > > > > > > > are trying to use them at the same time.
> > > > > > > >
> > > > > > > > Where is the boundary here?
> >
> > If I understand correctly, you are concerned that in some scenarios the
> > accelerator itself is the migration bottleneck, causing migration
> > performance to degrade.
> >
> > My understanding is to make full use of the accelerator bandwidth, and
> > once the accelerator is the bottleneck, fall back to zero-page detection
> > by the CPU.
> >
> > For example, when the enqcmd instruction returns an error, which means
> > the work queue is full, we can add some retry mechanism or fall back
> > directly to CPU detection.
> 
> 
> How is it handled in your patch? If you just abort migration unless
> enqcmd succeeds then would that not be a bug, where loading the system
> leads to migraton failures?

Sorry about that; I have only just started reviewing this patch. What we
discussed before relates only to the DSA device itself and may not reflect
this patch's implementation. I will review the issue you mentioned
carefully. Thank you for the reminder.
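
For reference, the retry-then-fallback idea described above could look
roughly like this (a sketch only: dsa_wait_compare_result() is a
hypothetical completion helper, enqcmd() is the wrapper shown later in this
digest, and buffer_is_zero() is QEMU's existing CPU zero-page check):

    #define DSA_SUBMIT_RETRIES 3

    static bool page_is_zero(volatile void *wq_portal,
                             struct dsa_hw_desc *desc,
                             const void *page, size_t page_size)
    {
        for (int i = 0; i < DSA_SUBMIT_RETRIES; i++) {
            /* enqcmd() returns 0 when the work queue accepts the descriptor */
            if (!enqcmd(wq_portal, desc)) {
                return dsa_wait_compare_result(desc);
            }
        }
        /* work queue stayed full: fall back to CPU detection */
        return buffer_is_zero(page, page_size);
    }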

> --
> MST




RE: [PATCH v5 01/13] meson: Introduce new instruction set enqcmd to the build system.

2024-07-15 Thread Liu, Yuan1
> -Original Message-
> From: Yichen Wang 
> Sent: Friday, July 12, 2024 5:53 AM
> To: Paolo Bonzini ; Marc-André Lureau
> ; Daniel P. Berrangé ;
> Thomas Huth ; Philippe Mathieu-Daudé
> ; Peter Xu ; Fabiano Rosas
> ; Eric Blake ; Markus Armbruster
> ; Michael S. Tsirkin ; Cornelia Huck
> ; qemu-devel@nongnu.org
> Cc: Hao Xiang ; Liu, Yuan1 ;
> Kumar, Shivam ; Ho-Ren (Jack) Chuang
> ; Wang, Yichen 
> Subject: [PATCH v5 01/13] meson: Introduce new instruction set enqcmd to
> the build system.
> 
> From: Hao Xiang 
> 
> Enable instruction set enqcmd in build.
> 
> Signed-off-by: Hao Xiang 
> Signed-off-by: Yichen Wang 
> ---
>  meson.build   | 14 ++
>  meson_options.txt |  2 ++
>  scripts/meson-buildoptions.sh |  3 +++
>  3 files changed, 19 insertions(+)
> 
> diff --git a/meson.build b/meson.build
> index 6a93da48e1..af650cfabf 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2893,6 +2893,20 @@ config_host_data.set('CONFIG_AVX512BW_OPT',
> get_option('avx512bw') \
>  int main(int argc, char *argv[]) { return bar(argv[0]); }
>'''), error_message: 'AVX512BW not available').allowed())
> 
> +config_host_data.set('CONFIG_DSA_OPT', get_option('enqcmd') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable ENQCMD') \
> +  .require(cc.links('''
> +#include <cpuid.h>
> +#include <stdint.h>
> +#include <immintrin.h>
> +static int __attribute__((target("enqcmd"))) bar(void *a) {
> +  uint64_t dst[8] = { 0 };
> +  uint64_t src[8] = { 0 };
> +  return _enqcmd(dst, src);
> +}
> +int main(int argc, char *argv[]) { return bar(argv[argc - 1]); }
> +  '''), error_message: 'ENQCMD not available').allowed())
> +

How about using the cpuid instruction to dynamically detect the enqcmd and
movdir64b instructions?

My reasons are as follows:
1. enqcmd/movdir64b and DSA devices are used together. DSA devices are
   detected dynamically, so enqcmd can also be detected dynamically.

   Simple code to dynamically detect movdir64b and enqcmd:

   #include <cpuid.h>
   #include <stdbool.h>
   #include <stdint.h>

   bool check_dsa_instructions(void)
   {
       uint32_t eax, ebx, ecx, edx;

       /* CPUID leaf 7, sub-leaf 0: ECX bit 28 = MOVDIR64B, bit 29 = ENQCMD */
       __cpuid_count(0x07, 0x0, eax, ebx, ecx, edx);
       if (!((ecx >> 28) & 0x1)) {    /* MOVDIR64B not supported */
           return false;
       }
       if (!((ecx >> 29) & 0x1)) {    /* ENQCMD not supported */
           return false;
       }
       return true;
   }

https://cdrdv2-public.intel.com/819680/architecture-instruction-set-extensions-programming-reference.pdf

2. The enqcmd/movdir64b are new instructions; I checked and they were
   integrated into GCC 10. However, users do not need GCC 10 or newer to
   use the two instructions.

   Simple code to implement enqcmd (ENQCMD sets ZF when the work queue did
   not accept the descriptor, so retry == 1 means "resubmit or fall back"):

   static inline int enqcmd(volatile void *reg, struct dsa_hw_desc *desc)
   {
       uint8_t retry;
       asm volatile (".byte 0xf2, 0x0f, 0x38, 0xf8, 0x02\t\n"
                     "setz %0\t\n" : "=r"(retry) : "a"(reg), "d"(desc));
       return (int)retry;
   }
   
file:///C:/Users/yliu80/Downloads/353216-data-streaming-accelerator-user-guide-002.pdf

>  # For both AArch64 and AArch32, detect if builtins are available.
>  config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
>  #include 
> diff --git a/meson_options.txt b/meson_options.txt
> index 0269fa0f16..4ed820bb8d 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -121,6 +121,8 @@ option('avx2', type: 'feature', value: 'auto',
> description: 'AVX2 optimizations')
>  option('avx512bw', type: 'feature', value: 'auto',
> description: 'AVX512BW optimizations')
> +option('enqcmd', type: 'feature', value: 'disabled',
> +   description: 'ENQCMD optimizations')
>  option('keyring', type: 'feature', value: 'auto',
> description: 'Linux keyring support')
>  option('libkeyutils', type: 'feature', value: 'auto',
> diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
> index cfadb5ea86..280e117687 100644
> --- a/scripts/meson-buildoptions.sh
> +++ b/scripts/meson-buildoptions.sh
> @@ -95,6 +95,7 @@ meson_options_help() {
>    printf "%s\n" '  auth-pam        PAM access control'
>    printf "%s\n" '  avx2            AVX2 optimizations'
>    printf "%s\n" '  avx512bw        AVX51


RE: [PATCH v5 00/13] WIP: Use Intel DSA accelerator to offload zero page checking in multifd live migration.

2024-07-15 Thread Liu, Yuan1
> -Original Message-
> From: Michael S. Tsirkin 
> Sent: Friday, July 12, 2024 6:49 AM
> To: Wang, Yichen 
> Cc: Paolo Bonzini ; Marc-André Lureau
> ; Daniel P. Berrangé ;
> Thomas Huth ; Philippe Mathieu-Daudé
> ; Peter Xu ; Fabiano Rosas
> ; Eric Blake ; Markus Armbruster
> ; Cornelia Huck ; qemu-
> de...@nongnu.org; Hao Xiang ; Liu, Yuan1
> ; Kumar, Shivam ; Ho-Ren
> (Jack) Chuang 
> Subject: Re: [PATCH v5 00/13] WIP: Use Intel DSA accelerator to offload
> zero page checking in multifd live migration.
> 
> On Thu, Jul 11, 2024 at 02:52:35PM -0700, Yichen Wang wrote:
> > * Performance:
> >
> > We use two Intel 4th generation Xeon servers for testing.
> >
> > Architecture:x86_64
> > CPU(s):  192
> > Thread(s) per core:  2
> > Core(s) per socket:  48
> > Socket(s):   2
> > NUMA node(s):2
> > Vendor ID:   GenuineIntel
> > CPU family:  6
> > Model:   143
> > Model name:  Intel(R) Xeon(R) Platinum 8457C
> > Stepping:8
> > CPU MHz: 2538.624
> > CPU max MHz: 3800.
> > CPU min MHz: 800.
> >
> > We perform multifd live migration with below setup:
> > 1. VM has 100GB memory.
> > 2. Use the new migration option multifd-set-normal-page-ratio to control
> the total
> > size of the payload sent over the network.
> > 3. Use 8 multifd channels.
> > 4. Use tcp for live migration.
> > 4. Use CPU to perform zero page checking as the baseline.
> > 5. Use one DSA device to offload zero page checking to compare with the
> baseline.
> > 6. Use "perf sched record" and "perf sched timehist" to analyze CPU
> usage.
> >
> > A) Scenario 1: 50% (50GB) normal pages on an 100GB vm.
> >
> > CPU usage
> >
> > |-----------|---------------|--------------|---------------|
> > |           |comm           |runtime(msec) |totaltime(msec)|
> > |-----------|---------------|--------------|---------------|
> > |Baseline   |live_migration |5657.58       |               |
> > |           |multifdsend_0  |3931.563      |               |
> > |           |multifdsend_1  |4405.273      |               |
> > |           |multifdsend_2  |3941.968      |               |
> > |           |multifdsend_3  |5032.975      |               |
> > |           |multifdsend_4  |4533.865      |               |
> > |           |multifdsend_5  |4530.461      |               |
> > |           |multifdsend_6  |5171.916      |               |
> > |           |multifdsend_7  |4722.769      |41922          |
> > |-----------|---------------|--------------|---------------|
> > |DSA        |live_migration |6129.168      |               |
> > |           |multifdsend_0  |2954.717      |               |
> > |           |multifdsend_1  |2766.359      |               |
> > |           |multifdsend_2  |2853.519      |               |
> > |           |multifdsend_3  |2740.717      |               |
> > |           |multifdsend_4  |2824.169      |               |
> > |           |multifdsend_5  |2966.908      |               |
> > |           |multifdsend_6  |2611.137      |               |
> > |           |multifdsend_7  |3114.732      |               |
> > |           |dsa_completion |3612.564      |32568          |
> > |-----------|---------------|--------------|---------------|
> >
> > Baseline total runtime is calculated by adding up all multifdsend_X
> > and live_migration threads runtime. DSA offloading total runtime is
> > calculated by adding up all multifdsend_X, live_migration and
> > dsa_completion threads runtime. 41922 msec VS 32568 msec runtime and
> > that is 23% total CPU usage savings.
> 
> 
> Here the DSA was mostly idle.
> 
> Sounds good but a question: what if several qemu instances are
> migrated in parallel?
> 
> Some accelerators tend to basically stall if several tasks
> are trying to use them at the same time.
> 
> Where is the boundary here?

A DSA device can be assigned to multiple QEMU instances.
The DSA resource used by each process is called a work queue. Each DSA
device can support up to 8 work queues, and work queues are classified
into dedicated queues and shared queues.

A dedicated queue can only serve one process. For a shared queue there is
theoretically no limit on the number of processes; shared queues are based
on ENQCMD + SVM technology (see the sketch below).

https://www.kernel.org/doc/html/v5.17/x86/sva.html
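
For readers unfamiliar with the flow, a minimal sketch of how a process
attaches to a shared work queue through the kernel idxd driver (the device
path is an example; actual WQ names depend on how the device was configured,
e.g. with accel-config):

    #include <fcntl.h>
    #include <stddef.h>
    #include <sys/mman.h>

    /* Open a shared WQ and map its submission portal; descriptors are then
     * submitted to the portal with ENQCMD, which reports failure (ZF set)
     * when the queue is full instead of stalling the submitter. */
    static void *dsa_open_shared_wq(const char *path)
    {
        int fd = open(path, O_RDWR);    /* e.g. "/dev/dsa/wq0.0" */
        void *portal;

        if (fd < 0) {
            return NULL;
        }
        portal = mmap(NULL, 0x1000, PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE, fd, 0);
        return portal == MAP_FAILED ? NULL : portal;
    }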

> --
> MST




RE: [PATCH v4 0/4] Implement using Intel QAT to offload ZLIB

2024-07-10 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Wednesday, July 10, 2024 11:19 PM
> To: Liu, Yuan1 
> Cc: Wang, Yichen ; Paolo Bonzini
> ; Daniel P. Berrangé ; Eduardo
> Habkost ; Marc-André Lureau
> ; Thomas Huth ; Philippe
> Mathieu-Daudé ; Fabiano Rosas ; Eric
> Blake ; Markus Armbruster ; Laurent
> Vivier ; qemu-devel@nongnu.org; Hao Xiang
> ; Zou, Nanhai ; Ho-Ren (Jack)
> Chuang 
> Subject: Re: [PATCH v4 0/4] Implement using Intel QAT to offload ZLIB
> 
> On Wed, Jul 10, 2024 at 01:55:23PM +, Liu, Yuan1 wrote:
> 
> [...]
> 
> > migrate_set_parameter max-bandwidth 1250M
> > |-----------|--------|---------|----------|----------|------|------|
> > |8 Channels |Total   |down     |throughput|pages per |send  |recv  |
> > |           |time(ms)|time(ms) |(mbps)    |second    |cpu % |cpu % |
> > |-----------|--------|---------|----------|----------|------|------|
> > |qatzip     |   16630|       28|     10467|   2940235|   160|   360|
> > |-----------|--------|---------|----------|----------|------|------|
> > |zstd       |   20165|       24|      8579|   2391465|   810|   340|
> > |-----------|--------|---------|----------|----------|------|------|
> > |none       |   46063|       40|     10848|    330240|    45|    85|
> > |-----------|--------|---------|----------|----------|------|------|
> >
> > QATzip's dirty page processing throughput is much higher than with no
> > compression. In this test, the vCPUs are in an idle state, so the
> > migration can succeed even without compression.
> 
> Thanks!  Maybe good material to be put into the docs/ too, if Yichen's
> going to pick up your doc patch when repost.

Sure, Yichen will add my doc patch; if he doesn't add this part in the
next version, I will add it later.

> [...]
> 
> > I don't have much experience with postcopy; here are some of my thoughts:
> > 1. For write-intensive VMs, this solution can improve the migration
> >    success rate, because in a limited-bandwidth network scenario the
> >    dirty-page processing throughput is significantly reduced without
> >    compression. The previous data (pages_per_second) shows this: in the
> >    no-compression precopy, the dirty pages generated by the workload
> >    exceed what the migration can process, resulting in migration failure.
> 
> Yes.
> 
> > 2. If the VM is read-intensive or has low vCPU utilization (for example,
> >    in my current test scenario the vCPUs are all idle), I think no
> >    compression + precopy + postcopy also cannot improve migration
> >    performance, and may also cause timeout failure due to the long
> >    migration time, the same as no-compression precopy.
> 
> I don't think postcopy will trigger timeout failures - postcopy should use
> constant time to complete a migration, that is guest memsize / bw.

Yes, the total migration time is predictable; "failure due to timeout" was
incorrect, and "migration taking a long time" is more accurate.

> The challenge is normally the delay of page requests being higher than in
> precopy, but in this case it might not be a big deal. And I wonder if on
> 100G*2 cards it can also perform pretty well, as the delay might be
> minimal even if bandwidth is throttled.

I get your point, though I don't have much experience in this area.
So you mean to reserve a small amount of bandwidth on a NIC for postcopy
migration, and compare the migration performance with and without traffic
on the NIC? Will data-plane traffic affect page request delays in postcopy?

> >
> > 3. In my opinion, postcopy is a good solution in this scenario (low
> >    network bandwidth, VM is not critical), because even if compression
> >    is turned on, the migration may still fail (pages_per_second may
> >    still be less than the new dirty pages), and it is hard to predict
> >    whether VM memory is compression-friendly.
> 
> Yes.
> 
> Thanks,
> 
> --
> Peter Xu



RE: [PATCH v4 3/4] migration: Introduce 'qatzip' compression method

2024-07-10 Thread Liu, Yuan1
> -Original Message-
> From: Yichen Wang 
> Sent: Saturday, July 6, 2024 2:29 AM
> To: Paolo Bonzini ; Daniel P. Berrangé
> ; Eduardo Habkost ; Marc-André
> Lureau ; Thomas Huth ;
> Philippe Mathieu-Daudé ; Peter Xu ;
> Fabiano Rosas ; Eric Blake ; Markus
> Armbruster ; Laurent Vivier ; qemu-
> de...@nongnu.org
> Cc: Hao Xiang ; Liu, Yuan1 ;
> Zou, Nanhai ; Ho-Ren (Jack) Chuang
> ; Wang, Yichen ;
> Bryan Zhang 
> Subject: [PATCH v4 3/4] migration: Introduce 'qatzip' compression method
> 
> From: Bryan Zhang 
> 
> Adds support for 'qatzip' as an option for the multifd compression
> method parameter, and implements using QAT for 'qatzip' compression and
> decompression.
> 
> Signed-off-by: Bryan Zhang 
> Signed-off-by: Hao Xiang 
> Signed-off-by: Yichen Wang 
> ---
>  hw/core/qdev-properties-system.c |   6 +-
>  migration/meson.build|   1 +
>  migration/multifd-qatzip.c   | 391 +++
>  migration/multifd.h  |   5 +-
>  qapi/migration.json  |   3 +
>  tests/qtest/meson.build  |   4 +
>  6 files changed, 407 insertions(+), 3 deletions(-)
>  create mode 100644 migration/multifd-qatzip.c
> 
> diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c
> index f13350b4fb..eb50d6ec5b 100644
> --- a/hw/core/qdev-properties-system.c
> +++ b/hw/core/qdev-properties-system.c
> @@ -659,7 +659,11 @@ const PropertyInfo qdev_prop_fdc_drive_type = {
>  const PropertyInfo qdev_prop_multifd_compression = {
>      .name = "MultiFDCompression",
>      .description = "multifd_compression values, "
> -                   "none/zlib/zstd/qpl/uadk",
> +                   "none/zlib/zstd/qpl/uadk"
> +#ifdef CONFIG_QATZIP
> +                   "/qatzip"
> +#endif
> +                   ,
>      .enum_table = &MultiFDCompression_lookup,
>      .get = qdev_propinfo_get_enum,
>      .set = qdev_propinfo_set_enum,
> diff --git a/migration/meson.build b/migration/meson.build
> index 5ce2acb41e..c9454c26ae 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -41,6 +41,7 @@ system_ss.add(when: rdma, if_true: files('rdma.c'))
>  system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
>  system_ss.add(when: qpl, if_true: files('multifd-qpl.c'))
>  system_ss.add(when: uadk, if_true: files('multifd-uadk.c'))
> +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c'))
> 
>  specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
>  if_true: files('ram.c',
> diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c
> new file mode 100644
> index 00..a1502a5589
> --- /dev/null
> +++ b/migration/multifd-qatzip.c
> @@ -0,0 +1,391 @@
> +/*
> + * Multifd QATzip compression implementation
> + *
> + * Copyright (c) Bytedance
> + *
> + * Authors:
> + *  Bryan Zhang 
> + *  Hao Xiang 
> + *  Yichen Wang 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "exec/ramblock.h"
> +#include "exec/target_page.h"
> +#include "qapi/error.h"
> +#include "migration.h"
> +#include "options.h"
> +#include "multifd.h"
> +#include <qatzip.h>

"exec/target_page.h" may not be required.

Use "qapi/qapi-types-migration.h" instead of "migration.h".

> +struct qatzip_data {
> +    /*
> +     * Unique session for use with QATzip API
> +     */
> +    QzSession_T sess;
> +
> +    /*
> +     * For compression: Buffer for pages to compress
> +     * For decompression: Buffer for data to decompress
> +     */
> +    uint8_t *in_buf;
> +    uint32_t in_len;
> +
> +    /*
> +     * For compression: Output buffer of compressed data
> +     * For decompression: Output buffer of decompressed data
> +     */
> +    uint8_t *out_buf;
> +    uint32_t out_len;
> +};

Please add a typedef and a CamelCase name, e.g. "typedef struct QatzipData".
https://www.qemu.org/docs/master/devel/style.html#comment-style

Quoting the style guide: "Typedefs are used to eliminate the redundant
'struct' keyword, since type names have a different style than other
identifiers ('CamelCase' versus 'snake_case'). Each named struct type
should have a CamelCase name and a corresponding typedef."
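
Concretely, the declaration above would become something like this (this is
the shape that later appeared in v6 of the series):

    typedef struct {
        QzSession_T sess;   /* unique session for use with QATzip API */

        uint8_t *in_buf;    /* compression: pages in; decompression: data in */
        uint32_t in_len;

        uint8_t *out_buf;   /* compressed or decompressed output */
        uint32_t out_len;
    } QatzipData;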

> +/**
> + * qatzip_send_setup: Set up QATzip session and private buffers.
> + *
> + * @param p    Multifd channel params
> + * @param errp Pointer to erro

RE: [PATCH v4 0/4] Implement using Intel QAT to offload ZLIB

2024-07-09 Thread Liu, Yuan1
> -Original Message-
> From: Yichen Wang 
> Sent: Saturday, July 6, 2024 2:29 AM
> To: Paolo Bonzini ; Daniel P. Berrangé
> ; Eduardo Habkost ; Marc-André
> Lureau ; Thomas Huth ;
> Philippe Mathieu-Daudé ; Peter Xu ;
> Fabiano Rosas ; Eric Blake ; Markus
> Armbruster ; Laurent Vivier ; qemu-
> de...@nongnu.org
> Cc: Hao Xiang ; Liu, Yuan1 ;
> Zou, Nanhai ; Ho-Ren (Jack) Chuang
> ; Wang, Yichen 
> Subject: [PATCH v4 0/4] Implement using Intel QAT to offload ZLIB
> 
> v4:
> - Rebase changes on top of 1a2d52c7fcaeaaf4f2fe8d4d5183dccaeab67768
> - Move the IOV initialization to qatzip implementation
> - Only use qatzip to compress normal pages
> 
> v3:
> - Rebase changes on top of master
> - Merge two patches per Fabiano Rosas's comment
> - Add versions into comments and documentations
> 
> v2:
> - Rebase changes on top of recent multifd code changes.
> - Use QATzip API 'qzMalloc' and 'qzFree' to allocate QAT buffers.
> - Remove parameter tuning and use QATzip's defaults for better
>   performance.
> - Add parameter to enable QAT software fallback.
> 
> v1:
> https://lists.nongnu.org/archive/html/qemu-devel/2023-12/msg03761.html
> 
> * Performance
> 
> We present updated performance results. For circumstantial reasons, v1
> presented performance on a low-bandwidth (1Gbps) network.
> 
> Here, we present updated results with a similar setup as before but with
> two main differences:
> 
> 1. Our machines have a ~50Gbps connection, tested using 'iperf3'.
> 2. We had a bug in our memory allocation causing us to only use ~1/2 of
> the VM's RAM. Now we properly allocate and fill nearly all of the VM's
> RAM.
> 
> Thus, the test setup is as follows:
> 
> We perform multifd live migration over TCP using a VM with 64GB memory.
> We prepare the machine's memory by powering it on, allocating a large
> amount of memory (60GB) as a single buffer, and filling the buffer with
> the repeated contents of the Silesia corpus[0]. This is in lieu of a more
> realistic memory snapshot, which proved troublesome to acquire.
> 
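As an aside, one way such a buffer could be prepared inside the guest is
sketched below (illustrative only: the corpus file path, the plain malloc()
allocation, and the tiling loop are assumptions, not the authors' actual
tooling):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Tile a large allocation with the contents of a corpus file. */
static void *alloc_and_fill(const char *corpus_path, size_t total_size)
{
    FILE *f = fopen(corpus_path, "rb");
    char *corpus, *buf;
    size_t corpus_size;

    if (!f) {
        return NULL;
    }
    fseek(f, 0, SEEK_END);
    corpus_size = (size_t)ftell(f);
    rewind(f);
    if (corpus_size == 0) {
        fclose(f);
        return NULL;
    }

    corpus = malloc(corpus_size);
    if (!corpus || fread(corpus, 1, corpus_size, f) != corpus_size) {
        fclose(f);
        free(corpus);
        return NULL;
    }
    fclose(f);

    buf = malloc(total_size);
    if (buf) {
        for (size_t off = 0; off < total_size; off += corpus_size) {
            size_t n = total_size - off < corpus_size ? total_size - off
                                                      : corpus_size;
            memcpy(buf + off, corpus, n); /* repeat corpus until full */
        }
    }
    free(corpus);
    return buf;
}
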
> We analyze CPU usage by averaging the output of 'top' every second
> during migration. This is admittedly imprecise, but we feel that it
> accurately portrays the different degrees of CPU usage of varying
> compression methods.
> 
> We present the latency, throughput, and CPU usage results for all of the
> compression methods, with varying numbers of multifd threads (4, 8, and
> 16).
> 
> [0] The Silesia corpus can be accessed here:
> https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia
> 
> ** Results
> 
> 4 multifd threads:
> 
> |-------|-----------|----------------|---------|---------|
> |method |time(sec)  |throughput(mbps)|send cpu%|recv cpu%|
> |-------|-----------|----------------|---------|---------|
> |qatzip | 23.13     | 8749.94        |117.50   |186.49   |
> |-------|-----------|----------------|---------|---------|
> |zlib   |254.35     |  771.87        |388.20   |144.40   |
> |-------|-----------|----------------|---------|---------|
> |zstd   | 54.52     | 3442.59        |414.59   |149.77   |
> |-------|-----------|----------------|---------|---------|
> |none   | 12.45     |43739.60        |159.71   |204.96   |
> |-------|-----------|----------------|---------|---------|
> 
> 8 multifd threads:
> 
> |-------|-----------|----------------|---------|---------|
> |method |time(sec)  |throughput(mbps)|send cpu%|recv cpu%|
> |-------|-----------|----------------|---------|---------|
> |qatzip | 16.91     |12306.52        |186.37   |391.84   |
> |-------|-----------|----------------|---------|---------|
> |zlib   |130.11     | 1508.89        |753.86   |289.35   |
> |-------|-----------|----------------|---------|---------|
> |zstd   | 27.57     | 6823.23        |786.83   |303.80   |
> |-------|-----------|----------------|---------|---------|
> |none   | 11.82     |46072.63        |163.74   |238.56   |
> |-------|-----------|----------------|---------|---------|
> 
> 16 multifd threads:
> 
> |-------|-----------|----------------|---------|---------|
> |method |time(sec)  |throughput(mbps)|send cpu%|recv cpu%|
> |-------|-----------|----------------|---------|---------|
>  

RE: [PATCH v3 0/4] Implement using Intel QAT to offload ZLIB

2024-07-05 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Thursday, July 4, 2024 11:36 PM
> To: Liu, Yuan1 
> Cc: Wang, Yichen ; Paolo Bonzini
> ; Daniel P. Berrangé ; Eduardo
> Habkost ; Marc-André Lureau
> ; Thomas Huth ; Philippe
> Mathieu-Daudé ; Fabiano Rosas ; Eric
> Blake ; Markus Armbruster ; Laurent
> Vivier ; qemu-devel@nongnu.org; Hao Xiang
> ; Zou, Nanhai ; Ho-Ren (Jack)
> Chuang 
> Subject: Re: [PATCH v3 0/4] Implement using Intel QAT to offload ZLIB
> 
> On Thu, Jul 04, 2024 at 03:15:51AM +, Liu, Yuan1 wrote:
> > > -Original Message-
> > > From: Peter Xu 
> > > Sent: Wednesday, July 3, 2024 3:16 AM
> > > To: Wang, Yichen 
> > > Cc: Paolo Bonzini ; Daniel P. Berrangé
> > > ; Eduardo Habkost ; Marc-
> André
> > > Lureau ; Thomas Huth ;
> > > Philippe Mathieu-Daudé ; Fabiano Rosas
> > > ; Eric Blake ; Markus Armbruster
> > > ; Laurent Vivier ; qemu-
> > > de...@nongnu.org; Hao Xiang ; Liu, Yuan1
> > > ; Zou, Nanhai ; Ho-Ren
> (Jack)
> > > Chuang 
> > > Subject: Re: [PATCH v3 0/4] Implement using Intel QAT to offload ZLIB
> > >
> > > On Thu, Jun 27, 2024 at 03:34:41PM -0700, Yichen Wang wrote:
> > > > v3:
> > > > - Rebase changes on top of master
> > > > - Merge two patches per Fabiano Rosas's comment
> > > > - Add versions into comments and documentations
> > > >
> > > > v2:
> > > > - Rebase changes on top of recent multifd code changes.
> > > > - Use QATzip API 'qzMalloc' and 'qzFree' to allocate QAT buffers.
> > > > - Remove parameter tuning and use QATzip's defaults for better
> > > >   performance.
> > > > - Add parameter to enable QAT software fallback.
> > > >
> > > > v1:
> > > > https://lists.nongnu.org/archive/html/qemu-devel/2023-
> 12/msg03761.html
> > > >
> > > > * Performance
> > > >
> > > > We present updated performance results. For circumstantial reasons,
> > > > v1 presented performance on a low-bandwidth (1Gbps) network.
> > > >
> > > > Here, we present updated results with a similar setup as before but
> > > > with two main differences:
> > > >
> > > > 1. Our machines have a ~50Gbps connection, tested using 'iperf3'.
> > > > 2. We had a bug in our memory allocation causing us to only use ~1/2
> > > > of the VM's RAM. Now we properly allocate and fill nearly all of the
> > > > VM's RAM.
> > > >
> > > > Thus, the test setup is as follows:
> > > >
> > > > We perform multifd live migration over TCP using a VM with 64GB
> > > > memory. We prepare the machine's memory by powering it on,
> > > > allocating a large amount of memory (60GB) as a single buffer, and
> > > > filling the buffer with the repeated contents of the Silesia
> > > > corpus[0]. This is in lieu of a more realistic memory snapshot,
> > > > which proved troublesome to acquire.
> > > >
> > > > We analyze CPU usage by averaging the output of 'top' every second
> > > > during migration. This is admittedly imprecise, but we feel that it
> > > > accurately portrays the different degrees of CPU usage of varying
> > > > compression methods.
> > > >
> > > > We present the latency, throughput, and CPU usage results for all
> > > > of the compression methods, with varying numbers of multifd threads
> > > > (4, 8, and 16).
> > > >
> > > > [0] The Silesia corpus can be accessed here:
> > > > https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia
> > > >
> > > > ** Results
> > > >
> > > > 4 multifd threads:
> > > >
> > > > |-------|-----------|----------------|---------|---------|
> > > > |method |time(sec)  |throughput(mbps)|send cpu%|recv cpu%|
> > > > |-------|-----------|----------------|---------|---------|
> > > > |qatzip | 23.13     | 8749.94        |117.50   |186.49   |
> > > > |-------|-----------|----------------|---------|---------|
> > > > |zlib   |254.35     |  771.87        |388.20   |144.40   |
> > > > |-------|-----------|----------------|---------|---------|

RE: [PATCH v3 0/4] Implement using Intel QAT to offload ZLIB

2024-07-03 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Wednesday, July 3, 2024 3:16 AM
> To: Wang, Yichen 
> Cc: Paolo Bonzini ; Daniel P. Berrangé
> ; Eduardo Habkost ; Marc-André
> Lureau ; Thomas Huth ;
> Philippe Mathieu-Daudé ; Fabiano Rosas
> ; Eric Blake ; Markus Armbruster
> ; Laurent Vivier ; qemu-
> de...@nongnu.org; Hao Xiang ; Liu, Yuan1
> ; Zou, Nanhai ; Ho-Ren (Jack)
> Chuang 
> Subject: Re: [PATCH v3 0/4] Implement using Intel QAT to offload ZLIB
> 
> On Thu, Jun 27, 2024 at 03:34:41PM -0700, Yichen Wang wrote:
> > v3:
> > - Rebase changes on top of master
> > - Merge two patches per Fabiano Rosas's comment
> > - Add versions into comments and documentations
> >
> > v2:
> > - Rebase changes on top of recent multifd code changes.
> > - Use QATzip API 'qzMalloc' and 'qzFree' to allocate QAT buffers.
> > - Remove parameter tuning and use QATzip's defaults for better
> >   performance.
> > - Add parameter to enable QAT software fallback.
> >
> > v1:
> > https://lists.nongnu.org/archive/html/qemu-devel/2023-12/msg03761.html
> >
> > * Performance
> >
> > We present updated performance results. For circumstantial reasons, v1
> > presented performance on a low-bandwidth (1Gbps) network.
> >
> > Here, we present updated results with a similar setup as before but with
> > two main differences:
> >
> > 1. Our machines have a ~50Gbps connection, tested using 'iperf3'.
> > 2. We had a bug in our memory allocation causing us to only use ~1/2 of
> > the VM's RAM. Now we properly allocate and fill nearly all of the VM's
> > RAM.
> >
> > Thus, the test setup is as follows:
> >
> > We perform multifd live migration over TCP using a VM with 64GB memory.
> > We prepare the machine's memory by powering it on, allocating a large
> > amount of memory (60GB) as a single buffer, and filling the buffer with
> > the repeated contents of the Silesia corpus[0]. This is in lieu of a
> > more realistic memory snapshot, which proved troublesome to acquire.
> >
> > We analyze CPU usage by averaging the output of 'top' every second
> > during migration. This is admittedly imprecise, but we feel that it
> > accurately portrays the different degrees of CPU usage of varying
> > compression methods.
> >
> > We present the latency, throughput, and CPU usage results for all of the
> > compression methods, with varying numbers of multifd threads (4, 8, and
> > 16).
> >
> > [0] The Silesia corpus can be accessed here:
> > https://sun.aei.polsl.pl//~sdeor/index.php?page=silesia
> >
> > ** Results
> >
> > 4 multifd threads:
> >
> > |-------|-----------|----------------|---------|---------|
> > |method |time(sec)  |throughput(mbps)|send cpu%|recv cpu%|
> > |-------|-----------|----------------|---------|---------|
> > |qatzip | 23.13     | 8749.94        |117.50   |186.49   |
> > |-------|-----------|----------------|---------|---------|
> > |zlib   |254.35     |  771.87        |388.20   |144.40   |
> > |-------|-----------|----------------|---------|---------|
> > |zstd   | 54.52     | 3442.59        |414.59   |149.77   |
> > |-------|-----------|----------------|---------|---------|
> > |none   | 12.45     |43739.60        |159.71   |204.96   |
> > |-------|-----------|----------------|---------|---------|
> >
> > 8 multifd threads:
> >
> > |-------|-----------|----------------|---------|---------|
> > |method |time(sec)  |throughput(mbps)|send cpu%|recv cpu%|
> > |-------|-----------|----------------|---------|---------|
> > |qatzip | 16.91     |12306.52        |186.37   |391.84   |
> > |-------|-----------|----------------|---------|---------|
> > |zlib   |130.11     | 1508.89        |753.86   |289.35   |
> > |-------|-----------|----------------|---------|---------|
> > |zstd   | 27.57     | 6823.23        |786.83   |303.80   |
> > |-------|-----------|----------------|---------|---------|
> > |none   | 11.82     |46072.63        |163.74   |238.56   |
> > |-------|-----------|----------------|---------|---------|

RE: [PATCH v2 2/5] migration: Add migration parameters for QATzip

2024-06-27 Thread Liu, Yuan1
> -Original Message-
> From: Yichen Wang 
> Sent: Thursday, June 27, 2024 8:17 AM
> To: Liu, Yuan1 
> Cc: Bryan Zhang ; qemu-devel@nongnu.org;
> pet...@redhat.com; faro...@suse.de; berra...@redhat.com; Zou, Nanhai
> ; hao.xi...@linux.dev
> Subject: Re: [PATCH v2 2/5] migration: Add migration parameters for QATzip
> 
> 
> 
> > On Mar 28, 2024, at 12:23 AM, Liu, Yuan1  wrote:
> >
> >> -Original Message-
> >> From: Bryan Zhang 
> >> Sent: Wednesday, March 27, 2024 6:42 AM
> >> To: qemu-devel@nongnu.org
> >> Cc: pet...@redhat.com; faro...@suse.de; Liu, Yuan1
> ;
> >> berra...@redhat.com; Zou, Nanhai ;
> >> hao.xi...@linux.dev; Bryan Zhang 
> >> Subject: [PATCH v2 2/5] migration: Add migration parameters for QATzip
> >>
> >> Adds support for migration parameters to control QATzip compression
> >> level and to enable/disable software fallback when QAT hardware is
> >> unavailable. This is a preparatory commit for a subsequent commit that
> >> will actually use QATzip compression.
> >>
> >> Signed-off-by: Bryan Zhang 
> >> Signed-off-by: Hao Xiang 
> >> ---
> >> Revision: This commit now includes a parameter for controlling software
> >> fallback. Fallback is generally intended to be disabled, but having
> this
> >> option available enables using software fallback for testing.
> >>
> >> This commit also now has some glue code to properly set parameters.
> >>
> >> migration/migration-hmp-cmds.c |  8 +
> >> migration/options.c| 57 ++
> >> migration/options.h|  2 ++
> >> qapi/migration.json| 35 +
> >> 4 files changed, 102 insertions(+)
> >>
> >> diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
> >> index 99b49df5dd..4bd23ba14d 100644
> >> --- a/migration/migration-hmp-cmds.c
> >> +++ b/migration/migration-hmp-cmds.c
> >> @@ -630,6 +630,14 @@ void hmp_migrate_set_parameter(Monitor *mon, const
> >> QDict *qdict)
> >> p->has_multifd_zlib_level = true;
> >> visit_type_uint8(v, param, &p->multifd_zlib_level, &err);
> >> break;
> >> +case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL:
> >> +p->has_multifd_qatzip_level = true;
> >> +visit_type_uint8(v, param, &p->multifd_qatzip_level, &err);
> >> +break;
> >> +case MIGRATION_PARAMETER_MULTIFD_QATZIP_SW_FALLBACK:
> >> +p->has_multifd_qatzip_sw_fallback = true;
> >> +visit_type_bool(v, param, &p->multifd_qatzip_sw_fallback, &err);
> >> +break;
> >> case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL:
> >> p->has_multifd_zstd_level = true;
> >> visit_type_uint8(v, param, &p->multifd_zstd_level, &err);
> >> diff --git a/migration/options.c b/migration/options.c
> >> index 3e3e0b93b4..1316ea605a 100644
> >> --- a/migration/options.c
> >> +++ b/migration/options.c
> >> @@ -62,6 +62,15 @@
> >> #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
> >> /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */
> >> #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
> >> +/*
> >> + * 1: best speed, ... 9: best compress ratio
> >> + * There is some nuance here. Refer to QATzip documentation to
> understand
> >> + * the mapping of QATzip levels to standard deflate levels.
> >> + */
> >> +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1
> >> +/* QATzip's SW fallback implementation is extremely slow, so avoid
> >> fallback */
> >> +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_SW_FALLBACK false
> >> +
> >> /* 0: means nocompress, 1: best speed, ... 20: best compress ratio */
> >> #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1
> >
> > Hi Bryan
> >
> > The default compression level may be set higher, such as 6. I checked
> > QAT throughput: if the data size is less than or equal to 64K, level 1
> > has much better throughput performance than level 6 and level 9. But
> > if the data size is greater than 128K, there is little change in
> > throughput, and the default MULTIFD_PACKET_SIZE is 512K, so you can
> > try a higher compression level to get better compression performance
> > without affecting throughput.
> >
> > In addition
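
For reference, once this patch is applied the new knobs can be set like any
other migration parameter. A hypothetical HMP sequence (the parameter names
follow the QAPI names added in the patch above; the level value and boolean
syntax are illustrative):

(qemu) migrate_set_parameter multifd-compression qatzip
(qemu) migrate_set_parameter multifd-qatzip-level 6
(qemu) migrate_set_parameter multifd-qatzip-sw-fallback off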

RE: [PATCH v8 4/7] migration/multifd: add qpl compression method

2024-06-12 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Wednesday, June 12, 2024 10:31 PM
> To: Liu, Yuan1 ; pet...@redhat.com;
> pbonz...@redhat.com; marcandre.lur...@redhat.com; berra...@redhat.com;
> th...@redhat.com; phi...@linaro.org
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> ; shameerali.kolothum.th...@huawei.com
> Subject: Re: [PATCH v8 4/7] migration/multifd: add qpl compression method
> 
> Fabiano Rosas  writes:
> 
> > Yuan Liu  writes:
> >
> >> add the Query Processing Library (QPL) compression method
> >>
> >> Introduce qpl as a new multifd migration compression method; it can
> >> use In-Memory Analytics Accelerator(IAA) to accelerate compression and
> >> decompression, which can not only reduce network bandwidth requirement
> >> but also reduce host compression and decompression CPU overhead.
> >>
> >> How to enable qpl compression during migration:
> >> migrate_set_parameter multifd-compression qpl
> >>
> >> There is no qpl compression level parameter added since it only
> supports
> >> level one, users do not need to specify the qpl compression level.
> >>
> >> Signed-off-by: Yuan Liu 
> >> Reviewed-by: Nanhai Zou 
> >> Reviewed-by: Peter Xu 
> >> Reviewed-by: Fabiano Rosas 
> >
> > I don't think I ever reviewed this patch. Please drop this when you
> > resubmit.
> 
> Actually, just leave it. I thought you'd need to fix the output size on
> 6/7, but I saw you just moved it elsewhere. So no need to respin. I'll
> queue this version shortly unless anyone else has comments.

Got it and thank you for your review.



RE: [PATCH v8 4/7] migration/multifd: add qpl compression method

2024-06-12 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Wednesday, June 12, 2024 10:27 PM
> To: Liu, Yuan1 ; pet...@redhat.com;
> pbonz...@redhat.com; marcandre.lur...@redhat.com; berra...@redhat.com;
> th...@redhat.com; phi...@linaro.org
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> ; shameerali.kolothum.th...@huawei.com
> Subject: Re: [PATCH v8 4/7] migration/multifd: add qpl compression method
> 
> Yuan Liu  writes:
> 
> > add the Query Processing Library (QPL) compression method
> >
> > Introduce qpl as a new multifd migration compression method; it can
> > use In-Memory Analytics Accelerator(IAA) to accelerate compression and
> > decompression, which can not only reduce network bandwidth requirement
> > but also reduce host compression and decompression CPU overhead.
> >
> > How to enable qpl compression during migration:
> > migrate_set_parameter multifd-compression qpl
> >
> > There is no qpl compression level parameter added since it only supports
> > level one, users do not need to specify the qpl compression level.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > Reviewed-by: Peter Xu 
> > Reviewed-by: Fabiano Rosas 
> 
> I don't think I ever reviewed this patch. Please drop this when you
> resubmit.

You are right, this is my mistake, I am very sorry.



RE: [PATCH v7 6/7] migration/multifd: implement qpl compression and decompression

2024-06-06 Thread Liu, Yuan1


> -Original Message-
> From: Fabiano Rosas 
> Sent: Thursday, June 6, 2024 9:52 PM
> To: Liu, Yuan1 ; pet...@redhat.com;
> pbonz...@redhat.com; marcandre.lur...@redhat.com; berra...@redhat.com;
> th...@redhat.com; phi...@linaro.org
> Cc: qemu-devel@nongnu.org; Zou, Nanhai ;
> shameerali.kolothum.th...@huawei.com
> Subject: RE: [PATCH v7 6/7] migration/multifd: implement qpl compression
> and decompression
> 
> "Liu, Yuan1"  writes:
> 
> >> -Original Message-
> >> From: Fabiano Rosas 
> >> Sent: Thursday, June 6, 2024 6:26 AM
> >> To: Liu, Yuan1 ; pet...@redhat.com;
> >> pbonz...@redhat.com; marcandre.lur...@redhat.com; berra...@redhat.com;
> >> th...@redhat.com; phi...@linaro.org
> >> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou,
> Nanhai
> >> ; shameerali.kolothum.th...@huawei.com
> >> Subject: Re: [PATCH v7 6/7] migration/multifd: implement qpl
> compression
> >> and decompression
> >>
> >> Yuan Liu  writes:
> >>
> >> > QPL compression and decompression will use IAA hardware first.
> >> > If IAA hardware is not available, it will automatically fall
> >> > back to the QPL software path; if the software job also fails,
> >> > the uncompressed page is sent directly.
> >> >
> >> > Signed-off-by: Yuan Liu 
> >> > Reviewed-by: Nanhai Zou 
> >> > ---
> >> >  migration/multifd-qpl.c | 412
> +++-
> >> >  1 file changed, 408 insertions(+), 4 deletions(-)
> >> >
> >> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> >> > index 6791a204d5..18b3384bd5 100644
> >> > --- a/migration/multifd-qpl.c
> >> > +++ b/migration/multifd-qpl.c
> >> > @@ -13,9 +13,14 @@
> >> >  #include "qemu/osdep.h"
> >> >  #include "qemu/module.h"
> >> >  #include "qapi/error.h"
> >> > +#include "qapi/qapi-types-migration.h"
> >> > +#include "exec/ramblock.h"
> >> >  #include "multifd.h"
> >> >  #include "qpl/qpl.h"
> >> >
> >> > +/* Maximum number of retries to resubmit a job if IAA work queues
> are
> >> full */
> >> > +#define MAX_SUBMIT_RETRY_NUM (3)
> >> > +
> >> >  typedef struct {
> >> >  /* the QPL hardware path job */
> >> >  qpl_job *job;
> >> > @@ -260,6 +265,219 @@ static void
> >> multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp)
> >> >  p->iov = NULL;
> >> >  }
> >> >
> >> > +/**
> >> > + * multifd_qpl_prepare_job: prepare the job
> >> > + *
> >> > + * Set the QPL job parameters and properties.
> >> > + *
> >> > + * @job: pointer to the qpl_job structure
> >> > + * @is_compression: indicates compression and decompression
> >> > + * @input: pointer to the input data buffer
> >> > + * @input_len: the length of the input data
> >> > + * @output: pointer to the output data buffer
> >> > + * @output_len: the length of the output data
> >> > + */
> >> > +static void multifd_qpl_prepare_job(qpl_job *job, bool
> is_compression,
> >> > +uint8_t *input, uint32_t
> input_len,
> >> > +uint8_t *output, uint32_t
> >> output_len)
> >> > +{
> >> > +job->op = is_compression ? qpl_op_compress : qpl_op_decompress;
> >> > +job->next_in_ptr = input;
> >> > +job->next_out_ptr = output;
> >> > +job->available_in = input_len;
> >> > +job->available_out = output_len;
> >> > +job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST |
> QPL_FLAG_OMIT_VERIFY;
> >> > +/* only supports compression level 1 */
> >> > +job->level = 1;
> >> > +}
> >> > +
> >> > +/**
> >> > + * multifd_qpl_prepare_job: prepare the compression job
> >>
> >> function name is wrong
> >
> > Thanks, I will fix this next version.
> >
> >> > + *
> >> > + * Set the compression job parameters and properties.
> >> > + *
> >> > + * @job: pointer to the qpl_job structure
> >> > + * @input: pointer to the input data buffer
> >> > + * @input_len: the length of the 

RE: [PATCH v7 1/7] docs/migration: add qpl compression feature

2024-06-06 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Thursday, June 6, 2024 4:00 AM
> To: Liu, Yuan1 ; pet...@redhat.com;
> pbonz...@redhat.com; marcandre.lur...@redhat.com; berra...@redhat.com;
> th...@redhat.com; phi...@linaro.org
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> ; shameerali.kolothum.th...@huawei.com
> Subject: Re: [PATCH v7 1/7] docs/migration: add qpl compression feature
> 
> Yuan Liu  writes:
> 
> > add Intel Query Processing Library (QPL) compression method
> > introduction
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> 
> Just some nits if you need to respin. Otherwise I can touch up in the
> migration tree.
> 
> Reviewed-by: Fabiano Rosas 

Thank you very much; there is nothing I need to change for this patch.
If this set of patches needs a next version, I will fix the nits
according to your suggestions.

> > ---
> >  docs/devel/migration/features.rst|   1 +
> >  docs/devel/migration/qpl-compression.rst | 262 +++
> >  2 files changed, 263 insertions(+)
> >  create mode 100644 docs/devel/migration/qpl-compression.rst
> >
> > diff --git a/docs/devel/migration/features.rst
> b/docs/devel/migration/features.rst
> > index d5ca7b86d5..bc98b65075 100644
> > --- a/docs/devel/migration/features.rst
> > +++ b/docs/devel/migration/features.rst
> > @@ -12,3 +12,4 @@ Migration has plenty of features to support different
> use cases.
> > virtio
> > mapped-ram
> > CPR
> > +   qpl-compression
> > diff --git a/docs/devel/migration/qpl-compression.rst
> b/docs/devel/migration/qpl-compression.rst
> > new file mode 100644
> > index 00..13fb7a67b1
> > --- /dev/null
> > +++ b/docs/devel/migration/qpl-compression.rst
> > @@ -0,0 +1,262 @@
> > +===
> > +QPL Compression
> > +===
> > +The Intel Query Processing Library (Intel ``QPL``) is an open-source
> library to
> > +provide compression and decompression features and it is based on
> deflate
> > +compression algorithm (RFC 1951).
> > +
> > +The ``QPL`` compression relies on Intel In-Memory Analytics
> Accelerator(``IAA``)
> > +and Shared Virtual Memory(``SVM``) technology, they are new features
> supported
> > +from Intel 4th Gen Intel Xeon Scalable processors, codenamed Sapphire
> Rapids
> > +processor(``SPR``).
> > +
> > +For more ``QPL`` introduction, please refer to `QPL Introduction
> >
> +<https://intel.github.io/qpl/documentation/introduction_docs/introduction
> .html>`_
> > +
> > +QPL Compression Framework
> > +=
> > +
> > +::
> > +
> > +  ++   +--+
> > +  | MultiFD Thread |   |accel-config tool |
> > +  +---++   ++-+
> > +  | |
> > +  | |
> > +  |compress/decompress  |
> > +  +---++| Setup IAA
> > +  |  QPL library   || Resources
> > +  +---+---++|
> > +  |   | |
> > +  |   +-+---+
> > +  |   Open IAA  |
> > +  |   Devices +-+-+
> > +  |   |idxd driver|
> > +  |   +-+-+
> > +  | |
> > +  | |
> > +  |   +-+-+
> > +  +---+IAA Devices|
> > +  Submit jobs +---+
> > +  via enqcmd
> > +
> > +
> > +QPL Build And Installation
> > +--
> > +
> > +.. code-block:: shell
> > +
> > +  $git clone --recursive https://github.com/intel/qpl.git qpl
> > +  $mkdir qpl/build
> > +  $cd qpl/build
> > +  $cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -
> DQPL_LIBRARY_TYPE=SHARED ..
> > +  $sudo cmake --build . --target install
> > +
> > +For more details about ``QPL`` installation, please refer to `QPL
> Installation
> >
> +<https://intel.github.io/qpl/documentation/get_started_docs/installation.
> html>`_
> > +
> > +IAA Device Management
> > +-
> > +
> > +The number of ``IAA`` devices will vary depending on the Xeon product
> model.
> > +On a ``SPR`` server, there can be a maximum of 8 ``IAA`` devices, with
> up to
> > +4 devices per socket.
> > +
> > +By default, all ``IAA`` devices are disabled and 

RE: [PATCH v7 6/7] migration/multifd: implement qpl compression and decompression

2024-06-05 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Thursday, June 6, 2024 6:26 AM
> To: Liu, Yuan1 ; pet...@redhat.com;
> pbonz...@redhat.com; marcandre.lur...@redhat.com; berra...@redhat.com;
> th...@redhat.com; phi...@linaro.org
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> ; shameerali.kolothum.th...@huawei.com
> Subject: Re: [PATCH v7 6/7] migration/multifd: implement qpl compression
> and decompression
> 
> Yuan Liu  writes:
> 
> > QPL compression and decompression will use IAA hardware first.
> > If IAA hardware is not available, it will automatically fall
> > back to the QPL software path; if the software job also fails,
> > the uncompressed page is sent directly.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  migration/multifd-qpl.c | 412 +++-
> >  1 file changed, 408 insertions(+), 4 deletions(-)
> >
> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > index 6791a204d5..18b3384bd5 100644
> > --- a/migration/multifd-qpl.c
> > +++ b/migration/multifd-qpl.c
> > @@ -13,9 +13,14 @@
> >  #include "qemu/osdep.h"
> >  #include "qemu/module.h"
> >  #include "qapi/error.h"
> > +#include "qapi/qapi-types-migration.h"
> > +#include "exec/ramblock.h"
> >  #include "multifd.h"
> >  #include "qpl/qpl.h"
> >
> > +/* Maximum number of retries to resubmit a job if IAA work queues are
> full */
> > +#define MAX_SUBMIT_RETRY_NUM (3)
> > +
> >  typedef struct {
> >  /* the QPL hardware path job */
> >  qpl_job *job;
> > @@ -260,6 +265,219 @@ static void
> multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp)
> >  p->iov = NULL;
> >  }
> >
> > +/**
> > + * multifd_qpl_prepare_job: prepare the job
> > + *
> > + * Set the QPL job parameters and properties.
> > + *
> > + * @job: pointer to the qpl_job structure
> > + * @is_compression: indicates compression and decompression
> > + * @input: pointer to the input data buffer
> > + * @input_len: the length of the input data
> > + * @output: pointer to the output data buffer
> > + * @output_len: the length of the output data
> > + */
> > +static void multifd_qpl_prepare_job(qpl_job *job, bool is_compression,
> > +uint8_t *input, uint32_t input_len,
> > +uint8_t *output, uint32_t
> output_len)
> > +{
> > +job->op = is_compression ? qpl_op_compress : qpl_op_decompress;
> > +job->next_in_ptr = input;
> > +job->next_out_ptr = output;
> > +job->available_in = input_len;
> > +job->available_out = output_len;
> > +job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST | QPL_FLAG_OMIT_VERIFY;
> > +/* only supports compression level 1 */
> > +job->level = 1;
> > +}
> > +
> > +/**
> > + * multifd_qpl_prepare_job: prepare the compression job
> 
> function name is wrong

Thanks, I will fix this next version.
 
> > + *
> > + * Set the compression job parameters and properties.
> > + *
> > + * @job: pointer to the qpl_job structure
> > + * @input: pointer to the input data buffer
> > + * @input_len: the length of the input data
> > + * @output: pointer to the output data buffer
> > + * @output_len: the length of the output data
> > + */
> > +static void multifd_qpl_prepare_comp_job(qpl_job *job, uint8_t *input,
> > + uint32_t input_len, uint8_t
> *output,
> > + uint32_t output_len)
> > +{
> > +multifd_qpl_prepare_job(job, true, input, input_len, output,
> output_len);
> > +}
> > +
> > +/**
> > + * multifd_qpl_prepare_job: prepare the decompression job

Thanks, I will fix this next version.
 
> > + *
> > + * Set the decompression job parameters and properties.
> > + *
> > + * @job: pointer to the qpl_job structure
> > + * @input: pointer to the input data buffer
> > + * @input_len: the length of the input data
> > + * @output: pointer to the output data buffer
> > + * @output_len: the length of the output data
> > + */
> > +static void multifd_qpl_prepare_decomp_job(qpl_job *job, uint8_t
> *input,
> > +   uint32_t input_len, uint8_t
> *output,
> > +   uint32_t output_len)
> > +{
> > +multifd_qpl_prepar

RE: [PATCH 1/7] docs/migration: add uadk compression feature

2024-05-30 Thread Liu, Yuan1
> -Original Message-
> From: Shameerali Kolothum Thodi 
> Sent: Thursday, May 30, 2024 10:01 PM
> To: Liu, Yuan1 ; pet...@redhat.com; faro...@suse.de
> Cc: qemu-devel@nongnu.org; Linuxarm ; linwenkai (C)
> ; zhangfei@linaro.org; huangchenghai
> 
> Subject: RE: [PATCH 1/7] docs/migration: add uadk compression feature
> 
> 
> 
> > -----Original Message-
> > From: Liu, Yuan1 
> > Sent: Thursday, May 30, 2024 2:25 PM
> > To: Shameerali Kolothum Thodi ;
> > pet...@redhat.com; faro...@suse.de
> > Cc: qemu-devel@nongnu.org; Linuxarm ; linwenkai (C)
> > ; zhangfei@linaro.org; huangchenghai
> > 
> > Subject: RE: [PATCH 1/7] docs/migration: add uadk compression feature
> >
> > > -Original Message-
> > > From: Shameer Kolothum 
> > > Sent: Wednesday, May 29, 2024 5:44 PM
> > > To: pet...@redhat.com; faro...@suse.de; Liu, Yuan1
> 
> > > Cc: qemu-devel@nongnu.org; linux...@huawei.com;
> > linwenk...@hisilicon.com;
> > > zhangfei@linaro.org; huangchengh...@huawei.com
> > > Subject: [PATCH 1/7] docs/migration: add uadk compression feature
> 
> [...]
> 
> > > +Since UADK uses Shared Virtual Addressing (SVA) and devices access
> > > +virtual memory directly, it is possible that SMMUv3 may encounter
> > > +page faults while walking the IO page tables. This may impact
> > > +performance. In order to mitigate this, please make sure to specify
> > > +the ``-mem-prealloc`` parameter in the destination VM boot parameters.
> >
> > Thank you so much for putting the IAA solution at the top and CCing me.
> >
> > I think migration performance will be better with the '-mem-prealloc'
> > option, but I am considering whether '-mem-prealloc' is a mandatory
> > option. From my experience, SVA performance drops are mainly caused by
> > IOTLB flushes and IO page faults. I had some discussions with Peter Xu
> > about the IOTLB flush issue, and it has been improved.
> > https://patchew.org/QEMU/PH7PR11MB5941F04FBFB964CB2C968866A33E2@
> > PH7PR11MB5941.namprd11.prod.outlook.com/
> 
> Thanks for the link. Yes, I have seen that discussion and this series is
> on top of that patch for avoiding the zero page read fault.
> 
> >
> > For IO page fault, the QPL (IAA userspace library) can process the page
> > fault request instead of the IOMMU,
> 
> Sorry, I didn't get this part completely. So if the page fault happens,
> how can the library handle it without the IOMMU? Or did you mean the
> library will do memory prefetching beforehand to avoid the page fault?

Yes, when an I/O page fault happens, the hardware returns the fault address
to QPL; QPL populates the faulting memory as shown below and then resubmits
the job to the hardware.

if (AD_STATUS_READ_PAGE_FAULT == completion_record_ptr->status) {
    /* touch the faulting page with a read to populate it */
    volatile char *read_fault_address = (volatile char *)fault_address;
    *read_fault_address;
} else { /* AD_STATUS_WRITE_PAGE_FAULT */
    /* touch the faulting page with a write to populate it writable */
    volatile char *write_fault_address = (volatile char *)fault_address;
    *write_fault_address = *write_fault_address;
}
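
To sketch how this touch-and-retry typically wraps the job submission path
(illustrative only: the job structure, submit_and_wait(), and the status
values below are placeholders for the real driver interfaces; the retry
bound mirrors the MAX_SUBMIT_RETRY_NUM constant used in the QPL patches):

#include <stdint.h>

#define MAX_SUBMIT_RETRY_NUM 3

enum job_status {
    STATUS_OK,
    AD_STATUS_READ_PAGE_FAULT,
    AD_STATUS_WRITE_PAGE_FAULT,
    STATUS_ERROR,
};

struct job {
    enum job_status status;   /* completion status reported by the device */
    uint64_t fault_address;   /* faulting address, valid on page fault */
};

/* Placeholder: enqueue the job and poll its completion record. */
extern void submit_and_wait(struct job *job);

static int submit_with_fault_retry(struct job *job)
{
    for (int retry = 0; retry < MAX_SUBMIT_RETRY_NUM; retry++) {
        submit_and_wait(job);
        if (job->status == STATUS_OK) {
            return 0;
        }
        if (job->status == AD_STATUS_READ_PAGE_FAULT) {
            /* populate the page with a read, then resubmit */
            *(volatile char *)(uintptr_t)job->fault_address;
        } else if (job->status == AD_STATUS_WRITE_PAGE_FAULT) {
            /* populate the page with a write, then resubmit */
            volatile char *p = (volatile char *)(uintptr_t)job->fault_address;
            *p = *p;
        } else {
            return -1; /* unrecoverable error */
        }
    }
    return -1; /* still faulting after all retries */
}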

> > it means we can disable the I/O page fault feature on the IAA device,
> > and let the device still use SVA technology to avoid memory copies.
> >
> > I will provide the test results in my next version. Do you have any
> > ideas or suggestions about this? Thanks.
> 
> I think our UADK test tool had an option to prefetch the memory (write
> some random data to memory) to avoid the page fault penalty. I am not
> sure whether that is exposed through the API or not.
> I will check with our UADK team.
> 
> Please do CC me when you post your next revision.

Sure

> Thanks,
> Shameer


RE: [PATCH 1/7] docs/migration: add uadk compression feature

2024-05-30 Thread Liu, Yuan1
> -Original Message-
> From: Shameer Kolothum 
> Sent: Wednesday, May 29, 2024 5:44 PM
> To: pet...@redhat.com; faro...@suse.de; Liu, Yuan1 
> Cc: qemu-devel@nongnu.org; linux...@huawei.com; linwenk...@hisilicon.com;
> zhangfei@linaro.org; huangchengh...@huawei.com
> Subject: [PATCH 1/7] docs/migration: add uadk compression feature
> 
> Document UADK(User Space Accelerator Development Kit) library details
> and how to use that for migration.
> 
> Signed-off-by: Shameer Kolothum 
> ---
>  docs/devel/migration/uadk-compression.rst | 144 ++
>  1 file changed, 144 insertions(+)
>  create mode 100644 docs/devel/migration/uadk-compression.rst
> 
> diff --git a/docs/devel/migration/uadk-compression.rst
> b/docs/devel/migration/uadk-compression.rst
> new file mode 100644
> index 00..988b92631e
> --- /dev/null
> +++ b/docs/devel/migration/uadk-compression.rst
> @@ -0,0 +1,144 @@
> +=
> +User Space Accelerator Development Kit (UADK) Compression
> +=
> +UADK is a general-purpose user space accelerator framework that uses
> shared
> +virtual addressing (SVA) to provide a unified programming interface for
> +hardware acceleration of cryptographic and compression algorithms.
> +
> +UADK includes Unified/User-space-access-intended Accelerator Framework
> (UACCE),
> +which enables hardware accelerators from different vendors that support
> SVA to
> +adapt to UADK.
> +
> +Currently, HiSilicon Kunpeng hardware accelerators have been registered
> with
> +UACCE. Through the UADK framework, users can run cryptographic and
> compression
> +algorithms using hardware accelerators instead of CPUs, freeing up CPU
> +computing power and improving computing performance.
> +
> +https://github.com/Linaro/uadk/tree/master/docs
> +
> +UADK Framework
> +==
> +UADK consists of UACCE, vendors' drivers, and an algorithm layer. UADK
> requires
> +the hardware accelerator to support SVA, and the operating system to
> support
> +IOMMU and SVA. Hardware accelerators from different vendors are
> registered as
> +different character devices with UACCE by using kernel-mode drivers of
> the
> +vendors. A user can access the hardware accelerators by performing user-
> mode
> +operations on the character devices.
> +
> +::
> +
> +  +--+
> +  |apps  |
> +  ++++
> +   ||
> +   ||
> +   +---++   +---+---+
> +   |   scheduler|   | alg libraries |
> +   +---++   +---+---+
> +   | |
> +   | |
> +   | |
> +   |++--+
> +   || vendor drivers|
> +   |+-+-+
> +   |  |
> +   |  |
> ++--+--+--+
> +| libwd  |
> +User++-+-+
> +--
> +Kernel+--+-+   +--+
> +  | uacce  |   | smmu |
> +  +---++   +--+
> +  |
> +  +---+--+
> +  | vendor kernel driver |
> +  +--+
> +--
> + +--+
> + |   HW Accelerators|
> + +--+
> +
> +UADK Installation
> +-
> +Build UADK
> +^^
> +
> +.. code-block:: shell
> +
> +git clone https://github.com/Linaro/uadk.git
> +cd uadk
> +mkdir build
> +./autogen.sh
> +./configure --prefix=$PWD/build
> +make
> +make install
> +
> +Without --prefix, UADK will be installed to /usr/local/lib by default.
> +If get error:"cannot find -lnuma", please install the libnuma-dev
> +
> +Run pkg-config libwd to ensure env is setup correctly
> +^^
> +
> +* export PKG_CONFIG_PATH=$PWD/build/lib/pkgconfig
> +* pkg-config libwd --cflags --libs
> +  -I/usr/local/include -L/usr/local/lib -lwd
> +
> +* export PKG_CONFIG_PATH is required on demand.
> +  Not required if UADK is installed to /usr/local/lib
> +
> +UADK 

RE: [PATCH v6 2/7] migration/multifd: put IOV initialization into compression method

2024-05-28 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Tuesday, May 28, 2024 4:51 AM
> To: Liu, Yuan1 
> Cc: faro...@suse.de; qemu-devel@nongnu.org; Zou, Nanhai
> 
> Subject: Re: [PATCH v6 2/7] migration/multifd: put IOV initialization into
> compression method
> 
> On Mon, May 06, 2024 at 12:57:46AM +0800, Yuan Liu wrote:
> > Different compression methods may require different numbers of IOVs.
> > Based on streaming compression of zlib and zstd, all pages will be
> > compressed to a data block, so two IOVs are needed for packet header
> > and compressed data block.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  migration/multifd-zlib.c |  7 +++
> >  migration/multifd-zstd.c |  8 +++-
> >  migration/multifd.c  | 22 --
> >  3 files changed, 26 insertions(+), 11 deletions(-)
> >
> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > index 737a9645d2..2ced69487e 100644
> > --- a/migration/multifd-zlib.c
> > +++ b/migration/multifd-zlib.c
> > @@ -70,6 +70,10 @@ static int zlib_send_setup(MultiFDSendParams *p,
> Error **errp)
> >  goto err_free_zbuff;
> >  }
> >  p->compress_data = z;
> > +
> > +/* Needs 2 IOVs, one for packet header and one for compressed data
> */
> > +p->iov = g_new0(struct iovec, 2);
> > +
> >  return 0;
> >
> >  err_free_zbuff:
> > @@ -101,6 +105,9 @@ static void zlib_send_cleanup(MultiFDSendParams *p,
> Error **errp)
> >  z->buf = NULL;
> >  g_free(p->compress_data);
> >  p->compress_data = NULL;
> > +
> > +g_free(p->iov);
> > +p->iov = NULL;
> >  }
> >
> >  /**
> > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> > index 256858df0a..ca17b7e310 100644
> > --- a/migration/multifd-zstd.c
> > +++ b/migration/multifd-zstd.c
> > @@ -52,7 +52,6 @@ static int zstd_send_setup(MultiFDSendParams *p, Error
> **errp)
> >  struct zstd_data *z = g_new0(struct zstd_data, 1);
> >  int res;
> >
> > -p->compress_data = z;
> >  z->zcs = ZSTD_createCStream();
> >  if (!z->zcs) {
> >  g_free(z);
> > @@ -77,6 +76,10 @@ static int zstd_send_setup(MultiFDSendParams *p,
> Error **errp)
> >  error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
> >  return -1;
> >  }
> > +p->compress_data = z;
> > +
> > +/* Needs 2 IOVs, one for packet header and one for compressed data
> */
> > +p->iov = g_new0(struct iovec, 2);
> >  return 0;
> >  }
> >
> > @@ -98,6 +101,9 @@ static void zstd_send_cleanup(MultiFDSendParams *p,
> Error **errp)
> >  z->zbuff = NULL;
> >  g_free(p->compress_data);
> >  p->compress_data = NULL;
> > +
> > +g_free(p->iov);
> > +p->iov = NULL;
> >  }
> >
> >  /**
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index f317bff077..d82885fdbb 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -137,6 +137,13 @@ static int nocomp_send_setup(MultiFDSendParams *p,
> Error **errp)
> >  p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
> >  }
> >
> > +if (multifd_use_packets()) {
> > +/* We need one extra place for the packet header */
> > +p->iov = g_new0(struct iovec, p->page_count + 1);
> > +} else {
> > +p->iov = g_new0(struct iovec, p->page_count);
> > +}
> > +
> >  return 0;
> >  }
> >
> > @@ -150,6 +157,8 @@ static int nocomp_send_setup(MultiFDSendParams *p,
> Error **errp)
> >   */
> >  static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
> >  {
> > +g_free(p->iov);
> > +p->iov = NULL;
> >  return;
> >  }
> >
> > @@ -228,6 +237,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p,
> Error **errp)
> >   */
> >  static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp)
> >  {
> > +p->iov = g_new0(struct iovec, p->page_count);
> >  return 0;
> >  }
> >
> > @@ -240,6 +250,8 @@ static int nocomp_recv_setup(MultiFDRecvParams *p,
> Error **errp)
> >   */
> >  static void nocomp_recv_cleanup(MultiFDRecvParams *p)
> >  {
> > +g_free(p->iov);
> > +p->iov = NULL;
> >  }
> 
> Are recv_

RE: [PATCH v6 6/7] migration/multifd: implement qpl compression and decompression

2024-05-14 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Tuesday, May 14, 2024 10:08 PM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; Zou, Nanhai 
> Subject: RE: [PATCH v6 6/7] migration/multifd: implement qpl compression
> and decompression
> 
> "Liu, Yuan1"  writes:
> 
> >> -Original Message-
> >> From: Fabiano Rosas 
> >> Sent: Monday, May 13, 2024 11:14 PM
> >> To: Liu, Yuan1 ; pet...@redhat.com
> >> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou,
> Nanhai
> >> 
> >> Subject: Re: [PATCH v6 6/7] migration/multifd: implement qpl
> compression
> >> and decompression
> >>
> >> Yuan Liu  writes:
> >>
> >> > each qpl job is used to (de)compress a normal page and it can
> >> > be processed independently by the IAA hardware. All qpl jobs
> >> > are submitted to the hardware at once, and we wait for all jobs
> >> > to complete. If the hardware path (IAA) is not available, software
> >> > is used for compression and decompression.
> >> >
> >> > Signed-off-by: Yuan Liu 
> >> > Reviewed-by: Nanhai Zou 
> >> > ---
> >> >  migration/multifd-qpl.c | 284
> +++-
> >> >  1 file changed, 280 insertions(+), 4 deletions(-)
> >> >
> >> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> >> > index 89fa51091a..9a1fddbdd0 100644
> >> > --- a/migration/multifd-qpl.c
> >> > +++ b/migration/multifd-qpl.c
> >> > @@ -13,6 +13,7 @@
> >> >  #include "qemu/osdep.h"
> >> >  #include "qemu/module.h"
> >> >  #include "qapi/error.h"
> >> > +#include "exec/ramblock.h"
> >> >  #include "migration.h"
> >> >  #include "multifd.h"
> >> >  #include "qpl/qpl.h"
> >> > @@ -204,6 +205,139 @@ static void
> >> multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp)
> >> >  p->iov = NULL;
> >> >  }
> >> >
> >> > +/**
> >> > + * multifd_qpl_prepare_job: prepare a compression or decompression
> job
> >> > + *
> >> > + * Prepare a compression or decompression job and configure job
> >> attributes
> >> > + * including job compression level and flags.
> >> > + *
> >> > + * @job: pointer to the QplData structure
> >>
> >> qpl_job structure
> >
> > Thanks for the comment, I will fix this next version.
> >
> >> > + * @is_compression: compression or decompression indication
> >> > + * @input: pointer to the input data buffer
> >> > + * @input_len: the length of the input data
> >> > + * @output: pointer to the output data buffer
> >> > + * @output_len: the size of the output data buffer
> >> > + */
> >> > +static void multifd_qpl_prepare_job(qpl_job *job, bool
> is_compression,
> >> > +uint8_t *input, uint32_t
> input_len,
> >> > +uint8_t *output, uint32_t
> >> output_len)
> >> > +{
> >> > +job->op = is_compression ? qpl_op_compress : qpl_op_decompress;
> >> > +job->next_in_ptr = input;
> >> > +job->next_out_ptr = output;
> >> > +job->available_in = input_len;
> >> > +job->available_out = output_len;
> >> > +job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST |
> QPL_FLAG_OMIT_VERIFY;
> >> > +/* only supports one compression level */
> >> > +job->level = 1;
> >> > +}
> >> > +
> >> > +/**
> >> > + * multifd_qpl_build_packet: build a qpl compressed data packet
> >> > + *
> >> > + * The qpl compressed data packet consists of two parts, one part
> >> stores
> >> > + * the compressed length of each page, and the other part is the
> >> compressed
> >> > + * data of each page. The zbuf_hdr stores the compressed length of
> all
> >> pages,
> >> > + * and use a separate IOV to store the compressed data of each page.
> >> > + *
> >> > + * @qpl: pointer to the QplData structure
> >> > + * @p: Params for the channel that we are using
> >> > + * @idx: The index of the compressed length array
> >> > + * @addr: pointer to the compressed data
> >> > + * @len: 

RE: [PATCH v6 6/7] migration/multifd: implement qpl compression and decompression

2024-05-13 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Monday, May 13, 2024 11:14 PM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v6 6/7] migration/multifd: implement qpl compression
> and decompression
> 
> Yuan Liu  writes:
> 
> > each qpl job is used to (de)compress a normal page and it can
> > be processed independently by the IAA hardware. All qpl jobs
> > are submitted to the hardware at once, and we wait for all jobs
> > to complete. If the hardware path (IAA) is not available, software
> > is used for compression and decompression.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  migration/multifd-qpl.c | 284 +++-
> >  1 file changed, 280 insertions(+), 4 deletions(-)
> >
> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > index 89fa51091a..9a1fddbdd0 100644
> > --- a/migration/multifd-qpl.c
> > +++ b/migration/multifd-qpl.c
> > @@ -13,6 +13,7 @@
> >  #include "qemu/osdep.h"
> >  #include "qemu/module.h"
> >  #include "qapi/error.h"
> > +#include "exec/ramblock.h"
> >  #include "migration.h"
> >  #include "multifd.h"
> >  #include "qpl/qpl.h"
> > @@ -204,6 +205,139 @@ static void
> multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp)
> >  p->iov = NULL;
> >  }
> >
> > +/**
> > + * multifd_qpl_prepare_job: prepare a compression or decompression job
> > + *
> > + * Prepare a compression or decompression job and configure job
> attributes
> > + * including job compression level and flags.
> > + *
> > + * @job: pointer to the QplData structure
> 
> qpl_job structure

Thanks for the comment, I will fix this next version.

> > + * @is_compression: compression or decompression indication
> > + * @input: pointer to the input data buffer
> > + * @input_len: the length of the input data
> > + * @output: pointer to the output data buffer
> > + * @output_len: the size of the output data buffer
> > + */
> > +static void multifd_qpl_prepare_job(qpl_job *job, bool is_compression,
> > +uint8_t *input, uint32_t input_len,
> > +uint8_t *output, uint32_t
> output_len)
> > +{
> > +job->op = is_compression ? qpl_op_compress : qpl_op_decompress;
> > +job->next_in_ptr = input;
> > +job->next_out_ptr = output;
> > +job->available_in = input_len;
> > +job->available_out = output_len;
> > +job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST | QPL_FLAG_OMIT_VERIFY;
> > +/* only supports one compression level */
> > +job->level = 1;
> > +}
> > +
> > +/**
> > + * multifd_qpl_build_packet: build a qpl compressed data packet
> > + *
> > + * The qpl compressed data packet consists of two parts, one part
> stores
> > + * the compressed length of each page, and the other part is the
> compressed
> > + * data of each page. The zbuf_hdr stores the compressed length of all
> pages,
> > + * and use a separate IOV to store the compressed data of each page.
> > + *
> > + * @qpl: pointer to the QplData structure
> > + * @p: Params for the channel that we are using
> > + * @idx: The index of the compressed length array
> > + * @addr: pointer to the compressed data
> > + * @len: The length of the compressed data
> > + */
> > +static void multifd_qpl_build_packet(QplData *qpl, MultiFDSendParams
> *p,
> > + uint32_t idx, uint8_t *addr,
> uint32_t len)
> > +{
> > +qpl->zbuf_hdr[idx] = cpu_to_be32(len);
> > +p->iov[p->iovs_num].iov_base = addr;
> > +p->iov[p->iovs_num].iov_len = len;
> > +p->iovs_num++;
> > +p->next_packet_size += len;
> > +}
> > +
> > +/**
> > + * multifd_qpl_compress_pages: compress normal pages
> > + *
> > + * Each normal page will be compressed independently, and the
> compression jobs
> > + * will be submitted to the IAA hardware in non-blocking mode, waiting
> for all
> > + * jobs to be completed and filling the compressed length and data into
> the
> > + * sending IOVs. If IAA device is not available, the software path is
> used.
> > + *
> > + * Returns 0 for success or -1 for error
> > + *
> > + * @p: Params for the channel that we are using
> > + * @errp: pointer to an error
> > + */
>

RE: [PATCH v6 5/7] migration/multifd: implement initialization of qpl compression

2024-05-11 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Saturday, May 11, 2024 4:45 AM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v6 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> Yuan Liu  writes:
> 
> > the qpl initialization includes memory allocation for compressed
> > data and the qpl job initialization.
> >
> > the qpl job initialization will check if the In-Memory Analytics
> > Accelerator(IAA) device is available and use the IAA device first.
> > If the platform does not have IAA device or the IAA device is not
> > available, the qpl compression will fallback to the software path.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> 
> Looks good, just some nits below.
> 
> > ---
> >  migration/multifd-qpl.c | 272 +++-
> >  1 file changed, 271 insertions(+), 1 deletion(-)
> >
> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > index 056a68a060..89fa51091a 100644
> > --- a/migration/multifd-qpl.c
> > +++ b/migration/multifd-qpl.c
> > @@ -9,12 +9,282 @@
> >   * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> >   * See the COPYING file in the top-level directory.
> >   */
> > +
> >  #include "qemu/osdep.h"
> >  #include "qemu/module.h"
> > +#include "qapi/error.h"
> > +#include "migration.h"
> > +#include "multifd.h"
> > +#include "qpl/qpl.h"
> > +
> > +typedef struct {
> > +qpl_job **job_array;
> > +/* the number of allocated jobs */
> > +uint32_t total_job_num;
> > +/* compressed data buffer */
> > +uint8_t *zbuf;
> > +/* the length of compressed data */
> 
> array of lenghts

Thanks for the comment, I will improve it in the next version.

> > +uint32_t *zbuf_hdr;
> 
> Why the _hdr suffix if the lengths are the only data stored here?

The zbuf_hdr name is confusing; I will use lens instead of zbuf_hdr in the
next version.

> > +/* the status of IAA device */
> > +bool iaa_avail;
> > +} QplData;
> > +
> > +/**
> > + * check_iaa_avail: check if IAA device is available
> > + *
> > + * If the system does not have an IAA device, the IAA device is
> > + * not enabled or the IAA work queue is not configured as a shared
> > + * mode, the QPL hardware path initialization will fail.
> > + *
> > + * Returns true if IAA device is available, otherwise false.
> > + */
> > +static bool check_iaa_avail(void)
> > +{
> > +qpl_job *job = NULL;
> > +uint32_t job_size = 0;
> > +qpl_path_t path = qpl_path_hardware;
> > +
> > +if (qpl_get_job_size(path, &job_size) != QPL_STS_OK) {
> > +return false;
> > +}
> > +job = g_malloc0(job_size);
> > +if (qpl_init_job(path, job) != QPL_STS_OK) {
> > +g_free(job);
> > +return false;
> > +}
> > +g_free(job);
> > +return true;
> > +}
> > +
> > +/**
> > + * multifd_qpl_free_jobs: cleanup jobs
> > + *
> > + * Free all job resources.
> > + *
> > + * @qpl: pointer to the QplData structure
> > + */
> > +static void multifd_qpl_free_jobs(QplData *qpl)
> > +{
> > +assert(qpl != NULL);
> > +for (int i = 0; i < qpl->total_job_num; i++) {
> > +qpl_fini_job(qpl->job_array[i]);
> > +g_free(qpl->job_array[i]);
> > +qpl->job_array[i] = NULL;
> > +}
> > +g_free(qpl->job_array);
> > +qpl->job_array = NULL;
> > +}
> > +
> > +/**
> > + * multifd_qpl_init_jobs: initialize jobs
> > + *
> > + * Initialize all jobs
> > + *
> > + * @qpl: pointer to the QplData structure
> > + * @chan_id: multifd channel number
> > + * @errp: pointer to an error
> > + */
> > +static int multifd_qpl_init_jobs(QplData *qpl, uint8_t chan_id, Error
> **errp)
> > +{
> > +qpl_path_t path;
> > +qpl_status status;
> > +uint32_t job_size = 0;
> > +qpl_job *job = NULL;
> > +
> > +path = qpl->iaa_avail ? qpl_path_hardware : qpl_path_software;
> > +status = qpl_get_job_size(path, &job_size);
> > +if (status != QPL_STS_OK) {
> > +error_setg(errp, "multifd: %u: qpl_get_job_size failed with
> error %d",
> > +   chan_id, status);
> > +   

RE: [PATCH v6 5/7] migration/multifd: implement initialization of qpl compression

2024-05-11 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Saturday, May 11, 2024 4:53 AM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v6 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> Yuan Liu  writes:
> 
> > the qpl initialization includes memory allocation for compressed
> > data and the qpl job initialization.
> >
> > the qpl job initialization will check if the In-Memory Analytics
> > Accelerator(IAA) device is available and use the IAA device first.
> > If the platform does not have IAA device or the IAA device is not
> > available, the qpl compression will fallback to the software path.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  migration/multifd-qpl.c | 272 +++-
> >  1 file changed, 271 insertions(+), 1 deletion(-)
> >
> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > index 056a68a060..89fa51091a 100644
> > --- a/migration/multifd-qpl.c
> > +++ b/migration/multifd-qpl.c
> > @@ -9,12 +9,282 @@
> >   * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> >   * See the COPYING file in the top-level directory.
> >   */
> > +
> >  #include "qemu/osdep.h"
> >  #include "qemu/module.h"
> > +#include "qapi/error.h"
> > +#include "migration.h"
> 
> Where is this used? I think you only need qapi/qapi-types-migration.h

Yes, qapi/qapi-types-migration.h is enough; there is no need to include
migration.h. I will fix this in the next version.

> > +#include "multifd.h"
> > +#include "qpl/qpl.h"
> > +
> > +typedef struct {
> > +qpl_job **job_array;
> > +/* the number of allocated jobs */
> > +uint32_t total_job_num;
> > +/* compressed data buffer */
> > +uint8_t *zbuf;
> > +/* the length of compressed data */
> > +uint32_t *zbuf_hdr;
> > +/* the status of IAA device */
> > +bool iaa_avail;
> > +} QplData;
> > +
> > +/**
> > + * check_iaa_avail: check if IAA device is available
> > + *
> > + * If the system does not have an IAA device, the IAA device is
> > + * not enabled or the IAA work queue is not configured as a shared
> > + * mode, the QPL hardware path initialization will fail.
> > + *
> > + * Returns true if IAA device is available, otherwise false.
> > + */
> > +static bool check_iaa_avail(void)
> > +{
> > +qpl_job *job = NULL;
> > +uint32_t job_size = 0;
> > +qpl_path_t path = qpl_path_hardware;
> > +
> > +if (qpl_get_job_size(path, &job_size) != QPL_STS_OK) {
> > +return false;
> > +}
> > +job = g_malloc0(job_size);
> > +if (qpl_init_job(path, job) != QPL_STS_OK) {
> > +g_free(job);
> > +return false;
> > +}
> > +g_free(job);
> > +return true;
> > +}
> > +
> > +/**
> > + * multifd_qpl_free_jobs: cleanup jobs
> > + *
> > + * Free all job resources.
> > + *
> > + * @qpl: pointer to the QplData structure
> > + */
> > +static void multifd_qpl_free_jobs(QplData *qpl)
> > +{
> > +assert(qpl != NULL);
> > +for (int i = 0; i < qpl->total_job_num; i++) {
> > +qpl_fini_job(qpl->job_array[i]);
> > +g_free(qpl->job_array[i]);
> > +qpl->job_array[i] = NULL;
> > +}
> > +g_free(qpl->job_array);
> > +qpl->job_array = NULL;
> > +}
> > +
> > +/**
> > + * multifd_qpl_init_jobs: initialize jobs
> > + *
> > + * Initialize all jobs
> > + *
> > + * @qpl: pointer to the QplData structure
> > + * @chan_id: multifd channel number
> > + * @errp: pointer to an error
> > + */
> > +static int multifd_qpl_init_jobs(QplData *qpl, uint8_t chan_id, Error
> **errp)
> > +{
> > +qpl_path_t path;
> > +qpl_status status;
> > +uint32_t job_size = 0;
> > +qpl_job *job = NULL;
> > +
> > +path = qpl->iaa_avail ? qpl_path_hardware : qpl_path_software;
> > +status = qpl_get_job_size(path, &job_size);
> > +if (status != QPL_STS_OK) {
> > +error_setg(errp, "multifd: %u: qpl_get_job_size failed with
> error %d",
> > +   chan_id, status);
> > +return -1;
> > +}
> > +qpl->job_array = g_new0(qpl_job *, qpl->total_job_num);
> > 

RE: [PATCH v6 4/7] migration/multifd: add qpl compression method

2024-05-10 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Friday, May 10, 2024 10:12 PM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v6 4/7] migration/multifd: add qpl compression method
> 
> Yuan Liu  writes:
> 
> > add the Query Processing Library (QPL) compression method
> >
> > Introduce qpl as a new multifd migration compression method; it can
> > use In-Memory Analytics Accelerator(IAA) to accelerate compression and
> > decompression, which can not only reduce network bandwidth requirement
> > but also reduce host compression and decompression CPU overhead.
> >
> > How to enable qpl compression during migration:
> > migrate_set_parameter multifd-compression qpl
> >
> > The qpl method only supports one compression level, there is no qpl
> > compression level parameter added, users do not need to specify the
> > qpl compression level.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> 
> There's an r-b from Peter that you forgot to bring along in this version
> of the series.

Yes, this patch has received r-b from Peter Xu in the previous version.
Sorry, I forgot this; I will add it next time.
Thank you very much for the reminder.




RE: [PATCH 0/1] Solve zero page causing multiple page faults

2024-04-02 Thread Liu, Yuan1
> -Original Message-
> From: Liu, Yuan1 
> Sent: Monday, April 1, 2024 11:41 PM
> To: pet...@redhat.com; faro...@suse.de
> Cc: qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: [PATCH 0/1] Solve zero page causing multiple page faults
> 
> 1. Description of multiple page faults for received zero pages
> a. -mem-prealloc feature and hugepage backend are not enabled on the
>destination
> b. After receiving the zero pages, the destination first determines if
>the current page content is 0 via buffer_is_zero, this may cause a
>read page fault
> 
>   perf record -e page-faults information below
>   13.75%  13.75%  multifdrecv_0 qemu-system-x86_64 [.] buffer_zero_avx512
>   11.85%  11.85%  multifdrecv_1 qemu-system-x86_64 [.] buffer_zero_avx512
>   multifd_recv_thread
>   nocomp_recv
>   multifd_recv_zero_page_process
>   buffer_is_zero
>   select_accel_fn
>   buffer_zero_avx512
> 
>c. Other page faults mainly come from writing operations to normal and
>   zero pages.
> 
> 2. Solution
> a. During the multifd migration process, the received pages are tracked
>    through RAMBlock's receivedmap.
> 
> b. If a received zero page is not set in receivedmap, the destination
>    will not check whether the page content is 0, thus avoiding a read
>    fault.
> 
> c. If the zero page has already been set in receivedmap, fill the page
>    with 0 directly.
> 
> There are two reasons for this:
> 1. It's unlikely a zero page if it's sent once or more.
> 2. The 1st time the destination receives a zero page, it must still be a
>    zero page, so there is no need to scan it in the 1st round.
> 
> 3. Test Result: 16 vCPUs and 64G memory VM, multifd number is 2,
>    and 100G network bandwidth
> 
> 3.1 Test case: 16 vCPUs are idle and only 2G memory are used
> +-----------+--------+--------+----------+
> |MultiFD    | total  |downtime|   Page   |
> |Nocomp     | time   |        | Faults   |
> |           | (ms)   | (ms)   |          |
> +-----------+--------+--------+----------+
> |with       |        |        |          |
> |recvbitmap |    7335|     180|      2716|
> +-----------+--------+--------+----------+
> |without    |        |        |          |
> |recvbitmap |    7771|     153|    121357|
> +-----------+--------+--------+----------+
> 
> +-----------+--------+--------+--------+-------+--------+-------------+
> |MultiFD    | total  |downtime| SVM    |SVM    | IOTLB  | IO PageFault|
> |QPL        | time   |        | IO TLB |IO Page| MaxTime| MaxTime     |
> |           | (ms)   | (ms)   | Flush  |Faults | (us)   | (us)        |
> +-----------+--------+--------+--------+-------+--------+-------------+
> |with       |        |        |        |       |        |             |
> |recvbitmap |   10224|     175|     410|  27429|       1|          447|
> +-----------+--------+--------+--------+-------+--------+-------------+
> |without    |        |        |        |       |        |             |
> |recvbitmap |   11253|     153|   80756|  38655|      25|        18349|
> +-----------+--------+--------+--------+-------+--------+-------------+
> 
> 
> 3.2 Test case: 16 vCPUs are idle and 56G memory(not zero) are used
> +-----------+--------+--------+----------+
> |MultiFD    | total  |downtime|   Page   |
> |Nocomp     | time   |        | Faults   |
> |           | (ms)   | (ms)   |          |
> +-----------+--------+--------+----------+
> |with       |        |        |          |
> |recvbitmap |   16825|     165|     52967|
> +-----------+--------+--------+----------+
> |without    |        |        |          |
> |recvbitmap |   12987|     159|   2672677|
> +-----------+--------+--------+----------+
> 
> +-----------+--------+--------+--------+-------+--------+-------------+
> |MultiFD    | total  |downtime| SVM    |SVM    | IOTLB  | IO PageFault|
> |QPL        | time   |        | IO TLB |IO Page| MaxTime| MaxTime     |
> |           | (ms)   | (ms)   | Flush  |Faults | (us)   | (us)        |
> +-----------+--------+--------+--------+-------+--------+-------------+
> |with       |        |        |        |       |        |             |
> |recvbitmap |  132315|      77|     890| 937105|      60|         9581|
> +-----------+--------+--------+--------+-------+--------+-------------+

RE: [PATCH v5 0/7] Live Migration With IAA

2024-03-28 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Thursday, March 28, 2024 11:22 PM
> To: Liu, Yuan1 
> Cc: faro...@suse.de; qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Zou, Nanhai 
> Subject: Re: [PATCH v5 0/7] Live Migration With IAA
> 
> On Thu, Mar 28, 2024 at 03:02:30AM +, Liu, Yuan1 wrote:
> > Yes, I will support software fallback to ensure CI testing and users can
> > still use qpl compression without IAA hardware.
> >
> > Although the qpl software solution will have better performance than
> zlib,
> > I still don't think it has a greater advantage than zstd. I don't think
> there
> > is a need to add a migration option to configure the qpl software or
> hardware path.
> > So I will still only use QPL as an independent compression in the next
> version, and
> > no other migration options are needed.
> 
> That should be fine.
> 
> >
> > I will also add a guide to qpl-compression.rst about IAA permission
> issues and how to
> > determine whether the hardware path is available.
> 
> OK.
> 
> [...]
> 
> > > > Yes, I use iperf3 to check the bandwidth for one core, the bandwith
> is
> > > 60Gbps.
> > > > [ ID] Interval   Transfer Bitrate Retr  Cwnd
> > > > [  5]   0.00-1.00   sec  7.00 GBytes  60.1 Gbits/sec0   2.87
> MBytes
> > > > [  5]   1.00-2.00   sec  7.05 GBytes  60.6 Gbits/sec0   2.87
> Mbytes
> > > >
> > > > And in the live migration test, a multifd thread's CPU utilization
> is
> > > almost 100%
> > >
> > > This 60Gpbs per-channel is definitely impressive..
> > >
> > > Have you tried migration without multifd on your system? Would that
> also
> > > perform similarly v.s. 2 channels multifd?
> >
> > Simple Test result below:
> > VM Type: 16vCPU, 64G memory
> > Workload in VM: fill 56G memory with Silesia data and vCPUs are idle
> > Migration Configurations:
> > 1. migrate_set_parameter max-bandwidth 100G
> > 2. migrate_set_parameter downtime-limit 300
> > 3. migrate_set_capability multifd on (multiFD test case)
> > 4. migrate_set_parameter multifd-channels 2 (multiFD test case)
> >
> >                  Total time (ms)  Downtime (ms)  Throughput (mbps)  Pages-per-second
> > without Multifd  23580            307            21221              689588
> > Multifd 2        7657             198            65410              2221176
> 
> Thanks for the test results.
> 
> So I am guessing the migration overheads besides pushing the socket is
> high
> enough to make it drop drastically, even if in this case zero detection
> shouldn't play a major role considering most of guest mem is pre-filled.

Yes, for non-multifd migration, besides the network stack overhead, the zero
page detection overhead (on both source and destination) is indeed very high.
Moving zero page detection into the multifd threads reduces the performance
degradation caused by that overhead.

I also think migration doesn't need to detect zero pages by memcmp in all
cases. Zero page detection only pays off when the VM's memory actually
contains a large number of zero pages.

My experience in this area may be insufficient; I am working with Hao and
Bryan to see whether DSA hardware can accelerate this part (both zero page
detection and writing zero pages).

DSA is an accelerator for memory operations such as comparing, filling, and
moving memory:
https://cdrdv2-public.intel.com/671116/341204-intel-data-streaming-accelerator-spec.pdf


RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-28 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Thursday, March 28, 2024 11:16 PM
> To: Liu, Yuan1 
> Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou,
> Nanhai 
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Thu, Mar 28, 2024 at 02:32:37AM +, Liu, Yuan1 wrote:
> > > -Original Message-
> > > From: Peter Xu 
> > > Sent: Thursday, March 28, 2024 3:26 AM
> > > To: Liu, Yuan1 
> > > Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> > > de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com;
> Zou,
> > > Nanhai 
> > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> initialization of
> > > qpl compression
> > >
> > > On Fri, Mar 22, 2024 at 12:40:32PM -0400, Peter Xu wrote:
> > > > > > void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> > > > > > {
> > > > > > for (int i = 0; i < p->zero_num; i++) {
> > > > > > void *page = p->host + p->zero[i];
> > > > > > if (!buffer_is_zero(page, p->page_size)) {
> > > > > > memset(page, 0, p->page_size);
> > > > > > }
> > > > > > }
> > > > > > }
> > > >
> > > > It may not matter much (where I also see your below comments), but
> just
> > > to
> > > > mention another solution to avoid this read is that we can maintain
> > > > RAMBlock->receivedmap for precopy (especially, multifd, afaiu
> multifd
> > > > doesn't yet update this bitmap.. even if normal precopy does), then
> here
> > > > instead of scanning every time, maybe we can do:
> > > >
> > > >   /*
> > > >* If it's the 1st time receiving it, no need to clear it as it
> must
> > > be
> > > >* all zeros now.
> > > >*/
> > > >   if (bitmap_test(rb->receivedmap, page_offset)) {
> > > >   memset(page, 0, ...);
> > > >   } else {
> > > >   bitmap_set(rb->receivedmap, page_offset);
> > > >   }
> > > >
> > > > And we also always set the bit when !zero too.
> > > >
> > > > My rational is that it's unlikely a zero page if it's sent once or
> more,
> > > > while OTOH for the 1st time we receive it, it must be a zero page,
> so no
> > > > need to scan for the 1st round.
> > >
> > > Thinking about this, I'm wondering whether we should have this
> regardless.
> > > IIUC now multifd will always require two page faults on destination
> for
> > > anonymous guest memories (I suppose shmem/hugetlb is fine as no zero
> page
> > > in those worlds).  Even though it should be faster than DMA faults, it
> > > still is unwanted.
> > >
> > > I'll take a note myself as todo to do some measurements in the future
> > > first.  However if anyone thinks that makes sense and want to have a
> look,
> > > please say so.  It'll be more than welcomed.
> >
> > Yes, I think this is a better improvement to avoid two page faults. I
> can test
> > the performance impact of this change on SVM-capable devices and give
> some data
> > later. As we saw before, the IOTLB flush occurs via COW, with the
> change, the
> > impact of the COW should be gone.
> >
> > If you need more testing and analysis on this, please let me know
> 
> Nothing more than that.  Just a heads up that Xiang used to mention a test
> case where Richard used to suggest dropping the zero check:
> 
> https://lore.kernel.org/r/CAAYibXib+TWnJpV22E=adncdBmwXJRqgRjJXK7X71J=bDfax...@mail.gmail.com
> 
> AFAIU this should be resolved if we have the bitmap maintained, but we can
> double check.  IIUC that's exactly the case for an idle guest, in that
> case
> it should be even faster to skip the memcmp when bit clear.
> 
> If you're going to post the patches, feel free to post that as a
> standalone
> small series first, then that can be considered merge even earlier.
> 
> Thanks a lot for doing this.

Sure, I will prepare a separate patch for this, and we can have a better
discussion on the concrete implementation and test results.
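
A minimal sketch of what such a patch could look like (assuming
MultiFDRecvParams keeps its RAMBlock pointer in p->block, and reusing the
existing ramblock_recv_bitmap_test()/ramblock_recv_bitmap_set() helpers from
migration/ram.c):

void multifd_recv_zero_page_process(MultiFDRecvParams *p)
{
    for (int i = 0; i < p->zero_num; i++) {
        void *page = p->host + p->zero[i];

        if (ramblock_recv_bitmap_test(p->block, page)) {
            /* Received before, so it may hold stale data: clear it. */
            memset(page, 0, p->page_size);
        } else {
            /*
             * First time this page is received: anonymous memory is
             * still all zeros, so just mark it received and avoid
             * touching (and thus faulting in) the page.
             */
            ramblock_recv_bitmap_set(p->block, page);
        }
    }
}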


RE: [PATCH v2 2/5] migration: Add migration parameters for QATzip

2024-03-28 Thread Liu, Yuan1
> -Original Message-
> From: Bryan Zhang 
> Sent: Wednesday, March 27, 2024 6:42 AM
> To: qemu-devel@nongnu.org
> Cc: pet...@redhat.com; faro...@suse.de; Liu, Yuan1 ;
> berra...@redhat.com; Zou, Nanhai ;
> hao.xi...@linux.dev; Bryan Zhang 
> Subject: [PATCH v2 2/5] migration: Add migration parameters for QATzip
> 
> Adds support for migration parameters to control QATzip compression
> level and to enable/disable software fallback when QAT hardware is
> unavailable. This is a preparatory commit for a subsequent commit that
> will actually use QATzip compression.
> 
> Signed-off-by: Bryan Zhang 
> Signed-off-by: Hao Xiang 
> ---
> Revision: This commit now includes a parameter for controlling software
> fallback. Fallback is generally intended to be disabled, but having this
> option available enables using software fallback for testing.
> 
> This commit also now has some glue code to properly set parameters.
> 
>  migration/migration-hmp-cmds.c |  8 +
>  migration/options.c| 57 ++
>  migration/options.h|  2 ++
>  qapi/migration.json| 35 +
>  4 files changed, 102 insertions(+)
> 
> diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-
> cmds.c
> index 99b49df5dd..4bd23ba14d 100644
> --- a/migration/migration-hmp-cmds.c
> +++ b/migration/migration-hmp-cmds.c
> @@ -630,6 +630,14 @@ void hmp_migrate_set_parameter(Monitor *mon, const
> QDict *qdict)
>  p->has_multifd_zlib_level = true;
>  visit_type_uint8(v, param, &p->multifd_zlib_level, &err);
>  break;
> +case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL:
> +p->has_multifd_qatzip_level = true;
> +visit_type_uint8(v, param, &p->multifd_qatzip_level, &err);
> +break;
> +case MIGRATION_PARAMETER_MULTIFD_QATZIP_SW_FALLBACK:
> +p->has_multifd_qatzip_sw_fallback = true;
> +visit_type_bool(v, param, &p->multifd_qatzip_sw_fallback, &err);
> +break;
>  case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL:
>  p->has_multifd_zstd_level = true;
>  visit_type_uint8(v, param, &p->multifd_zstd_level, &err);
> diff --git a/migration/options.c b/migration/options.c
> index 3e3e0b93b4..1316ea605a 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -62,6 +62,15 @@
>  #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
>  /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */
>  #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
> +/*
> + * 1: best speed, ... 9: best compress ratio
> + * There is some nuance here. Refer to QATzip documentation to understand
> + * the mapping of QATzip levels to standard deflate levels.
> + */
> +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1
> +/* QATzip's SW fallback implementation is extremely slow, so avoid
> fallback */
> +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_SW_FALLBACK false
> +
>  /* 0: means nocompress, 1: best speed, ... 20: best compress ratio */
>  #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1

Hi Bryan

The default compression level could be set higher, such as 6. I checked QAT
throughput: if the data size is less than or equal to 64K, level 1 has much
better throughput than level 6 or level 9, but if the data size is greater
than 128K there is little change in throughput. Since the default
MULTIFD_PACKET_SIZE is 512K, you could try a higher compression level to get
a better compression ratio without affecting throughput.

In addition, if you change MULTIFD_PACKET_SIZE to 64K, you may get better
throughput with more multifd threads.
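
For example, once this parameter lands, a higher level could be set via HMP
(a hypothetical usage sketch based on the parameter name added in this patch):

migrate_set_parameter multifd-qatzip-level 6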

> @@ -143,6 +152,12 @@ Property migration_properties[] = {
>  DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
>parameters.multifd_zlib_level,
>DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
> +DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState,
> +  parameters.multifd_qatzip_level,
> +  DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL),
> +DEFINE_PROP_BOOL("multifd-qatzip-sw-fallback", MigrationState,
> +  parameters.multifd_qatzip_sw_fallback,
> +  DEFAULT_MIGRATE_MULTIFD_QATZIP_SW_FALLBACK),
>  DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
>parameters.multifd_zstd_level,
>DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
> @@ -861,6 +876,20 @@ int migrate_multifd_zlib_level(void)
>  return s->parameters.multifd_zlib_level;
>  }
> 
> +int migrate_multifd_qatzip_level(void)
> +{
> +

RE: [PATCH v5 4/7] migration/multifd: add qpl compression method

2024-03-27 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Thursday, March 28, 2024 3:49 AM
> To: Liu, Yuan1 
> Cc: faro...@suse.de; qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Zou, Nanhai 
> Subject: Re: [PATCH v5 4/7] migration/multifd: add qpl compression method
> 
> On Wed, Mar 20, 2024 at 12:45:24AM +0800, Yuan Liu wrote:
> > add the Query Processing Library (QPL) compression method
> >
> > Although both qpl and zlib support deflate compression, qpl will
> > only use the In-Memory Analytics Accelerator(IAA) for compression
> > and decompression, and IAA is not compatible with the Zlib in
> > migration, so qpl is used as a new compression method for migration.
> >
> > How to enable qpl compression during migration:
> > migrate_set_parameter multifd-compression qpl
> >
> > The qpl only supports one compression level, there is no qpl
> > compression level parameter added, users do not need to specify
> > the qpl compression level.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  hw/core/qdev-properties-system.c |  2 +-
> >  migration/meson.build|  1 +
> >  migration/multifd-qpl.c  | 20 
> >  migration/multifd.h  |  1 +
> >  qapi/migration.json  |  7 ++-
> >  5 files changed, 29 insertions(+), 2 deletions(-)
> >  create mode 100644 migration/multifd-qpl.c
> >
> > diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-
> system.c
> > index d79d6f4b53..6ccd7224f6 100644
> > --- a/hw/core/qdev-properties-system.c
> > +++ b/hw/core/qdev-properties-system.c
> > @@ -659,7 +659,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = {
> >  const PropertyInfo qdev_prop_multifd_compression = {
> >  .name = "MultiFDCompression",
> >  .description = "multifd_compression values, "
> > -   "none/zlib/zstd",
> > +   "none/zlib/zstd/qpl",
> >  .enum_table = &MultiFDCompression_lookup,
> >  .get = qdev_propinfo_get_enum,
> >  .set = qdev_propinfo_set_enum,
> > diff --git a/migration/meson.build b/migration/meson.build
> > index 1eeb915ff6..cb177de1d2 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -41,6 +41,7 @@ if get_option('live_block_migration').allowed()
> >system_ss.add(files('block.c'))
> >  endif
> >  system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
> > +system_ss.add(when: qpl, if_true: files('multifd-qpl.c'))
> >
> >  specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
> >  if_true: files('ram.c',
> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > new file mode 100644
> > index 00..056a68a060
> > --- /dev/null
> > +++ b/migration/multifd-qpl.c
> > @@ -0,0 +1,20 @@
> > +/*
> > + * Multifd qpl compression accelerator implementation
> > + *
> > + * Copyright (c) 2023 Intel Corporation
> > + *
> > + * Authors:
> > + *  Yuan Liu
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +#include "qemu/osdep.h"
> > +#include "qemu/module.h"
> > +
> > +static void multifd_qpl_register(void)
> > +{
> > +/* noop */
> > +}
> > +
> > +migration_init(multifd_qpl_register);
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index c9d9b09239..5b7d9b15f8 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -40,6 +40,7 @@ MultiFDRecvData *multifd_get_recv_data(void);
> >  #define MULTIFD_FLAG_NOCOMP (0 << 1)
> >  #define MULTIFD_FLAG_ZLIB (1 << 1)
> >  #define MULTIFD_FLAG_ZSTD (2 << 1)
> > +#define MULTIFD_FLAG_QPL (4 << 1)
> >
> >  /* This value needs to be a multiple of qemu_target_page_size() */
> >  #define MULTIFD_PACKET_SIZE (512 * 1024)
> > diff --git a/qapi/migration.json b/qapi/migration.json
> > index aa1b39bce1..dceb35db5b 100644
> > --- a/qapi/migration.json
> > +++ b/qapi/migration.json
> > @@ -629,11 +629,16 @@
> >  #
> >  # @zstd: use zstd compression method.
> >  #
> > +# @qpl: use qpl compression method. Query Processing Library(qpl) is
> based on
> > +#   the deflate compression algorithm and use the Intel In-Memory
> Analytics
> > +#   Accelerator(IAA) accelerated compression and decompression.
> (Since 9.0)
> 
> s/9.0/9.1/

Ok, I will fix it in the next version.

> > +#
> >  # Since: 5.0
> >  ##
> >  { 'enum': 'MultiFDCompression',
> >'data': [ 'none', 'zlib',
> > -{ 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] }
> > +{ 'name': 'zstd', 'if': 'CONFIG_ZSTD' },
> > +{ 'name': 'qpl', 'if': 'CONFIG_QPL' } ] }
> >
> >  ##
> >  # @MigMode:
> > --
> > 2.39.3
> >
> 
> Reviewed-by: Peter Xu 
> 
> --
> Peter Xu



RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-27 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Thursday, March 28, 2024 3:26 AM
> To: Liu, Yuan1 
> Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou,
> Nanhai 
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Fri, Mar 22, 2024 at 12:40:32PM -0400, Peter Xu wrote:
> > > > void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> > > > {
> > > > for (int i = 0; i < p->zero_num; i++) {
> > > > void *page = p->host + p->zero[i];
> > > > if (!buffer_is_zero(page, p->page_size)) {
> > > > memset(page, 0, p->page_size);
> > > > }
> > > > }
> > > > }
> >
> > It may not matter much (where I also see your below comments), but just
> to
> > mention another solution to avoid this read is that we can maintain
> > RAMBlock->receivedmap for precopy (especially, multifd, afaiu multifd
> > doesn't yet update this bitmap.. even if normal precopy does), then here
> > instead of scanning every time, maybe we can do:
> >
> >   /*
> >* If it's the 1st time receiving it, no need to clear it as it must
> be
> >* all zeros now.
> >*/
> >   if (bitmap_test(rb->receivedmap, page_offset)) {
> >   memset(page, 0, ...);
> >   } else {
> >   bitmap_set(rb->receivedmap, page_offset);
> >   }
> >
> > And we also always set the bit when !zero too.
> >
> > My rational is that it's unlikely a zero page if it's sent once or more,
> > while OTOH for the 1st time we receive it, it must be a zero page, so no
> > need to scan for the 1st round.
> 
> Thinking about this, I'm wondering whether we should have this regardless.
> IIUC now multifd will always require two page faults on destination for
> anonymous guest memories (I suppose shmem/hugetlb is fine as no zero page
> in those worlds).  Even though it should be faster than DMA faults, it
> still is unwanted.
> 
> I'll take a note myself as todo to do some measurements in the future
> first.  However if anyone thinks that makes sense and want to have a look,
> please say so.  It'll be more than welcomed.

Yes, I think this is a better improvement to avoid two page faults. I can test
the performance impact of this change on SVM-capable devices and give some
data later. As we saw before, the IOTLB flush occurs via COW; with this
change, the impact of the COW should be gone.

If you need more testing and analysis on this, please let me know.



RE: [PATCH v5 1/7] docs/migration: add qpl compression feature

2024-03-26 Thread Liu, Yuan1

> -Original Message-
> From: Peter Xu 
> Sent: Wednesday, March 27, 2024 1:58 AM
> To: Liu, Yuan1 
> Cc: faro...@suse.de; qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Zou, Nanhai 
> Subject: Re: [PATCH v5 1/7] docs/migration: add qpl compression feature
> 
> On Wed, Mar 20, 2024 at 12:45:21AM +0800, Yuan Liu wrote:
> > add Intel Query Processing Library (QPL) compression method
> > introduction
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  docs/devel/migration/features.rst|   1 +
> >  docs/devel/migration/qpl-compression.rst | 231 +++
> >  2 files changed, 232 insertions(+)
> >  create mode 100644 docs/devel/migration/qpl-compression.rst
> >
> > diff --git a/docs/devel/migration/features.rst
> b/docs/devel/migration/features.rst
> > index d5ca7b86d5..bc98b65075 100644
> > --- a/docs/devel/migration/features.rst
> > +++ b/docs/devel/migration/features.rst
> > @@ -12,3 +12,4 @@ Migration has plenty of features to support different
> use cases.
> > virtio
> > mapped-ram
> > CPR
> > +   qpl-compression
> > diff --git a/docs/devel/migration/qpl-compression.rst
> b/docs/devel/migration/qpl-compression.rst
> > new file mode 100644
> > index 00..42c7969d30
> > --- /dev/null
> > +++ b/docs/devel/migration/qpl-compression.rst
> > @@ -0,0 +1,231 @@
> > +===
> > +QPL Compression
> > +===
> > +The Intel Query Processing Library (Intel ``QPL``) is an open-source
> library to
> > +provide compression and decompression features and it is based on
> deflate
> > +compression algorithm (RFC 1951).
> > +
> > +The ``QPL`` compression relies on Intel In-Memory Analytics
> Accelerator(``IAA``)
> > +and Shared Virtual Memory(``SVM``) technology, they are new features
> supported
> > +from Intel 4th Gen Intel Xeon Scalable processors, codenamed Sapphire
> Rapids
> > +processor(``SPR``).
> > +
> > +For more ``QPL`` introduction, please refer to:
> > +
> >
> +https://intel.github.io/qpl/documentation/introduction_docs/introduction.html
> 
> There're a bunch of links in this page, please consider switching all of
> them to use the link formats of .rST:
> 
>   Please refer to `QPL introduction page <https://intel.github.io/qpl/documentation/introduction_docs/introduction.html>`_.

Sure, thanks for the suggestion

> > +
> > +QPL Compression Framework
> > +=
> > +
> > +::
> > +
> > +  ++   +--+
> > +  | MultiFD Service|   |accel-config tool |
> > +  +---++   ++-+
> > +  | |
> > +  | |
> > +  +---++| Setup IAA
> > +  |  QPL library   || Resources
> > +  +---+---++|
> > +  |   | |
> > +  |   +-+---+
> > +  |   Open IAA  |
> > +  |   Devices +-+-+
> > +  |   |idxd driver|
> > +  |   +-+-+
> > +  | |
> > +  | |
> > +  |   +-+-+
> > +  +---+IAA Devices|
> > +  Submit jobs +---+
> > +  via enqcmd
> > +
> > +
> > +Intel In-Memory Analytics Accelerator (Intel IAA) Introduction
> > +
> > +
> > +Intel ``IAA`` is an accelerator that has been designed to help benefit
> > +in-memory databases and analytic workloads. There are three main areas
> > +that Intel ``IAA`` can assist with analytics primitives (scan, filter,
> etc.),
> > +sparse data compression and memory tiering.
> > +
> > +``IAA`` Manual Documentation:
> > +
> > +https://www.intel.com/content/www/us/en/content-details/721858/intel-
> in-memory-analytics-accelerator-architecture-specification
> > +
> > +IAA Device Enabling
> > +---
> > +
> > +- Enabling ``IAA`` devices for platform configuration, please refer to:
> > +
> > +https://www.intel.com/content/www/us/en/content-details/780887/intel-
> in-memory-analytics-accelerator-intel-iaa.html
> > +
> > +- ``IAA`` device driver is ``Intel Data Accelerator Driver (idxd)``, it
> is
> > +  recommended that the minimum version of Linux kernel is 5.18.
> > +
> > +- Add ``"intel_iommu=on,sm_on"`` parameter to kernel c

RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-22 Thread Liu, Yuan1
> -Original Message-
> From: Liu, Yuan1
> Sent: Friday, March 22, 2024 10:07 AM
> To: Peter Xu 
> Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou,
> Nanhai 
> Subject: RE: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> > -Original Message-
> > From: Peter Xu 
> > Sent: Thursday, March 21, 2024 11:28 PM
> > To: Liu, Yuan1 
> > Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> > de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com;
> Zou,
> > Nanhai 
> > Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization
> of
> > qpl compression
> >
> > On Thu, Mar 21, 2024 at 01:37:36AM +, Liu, Yuan1 wrote:
> > > > -Original Message-
> > > > From: Peter Xu 
> > > > Sent: Thursday, March 21, 2024 4:32 AM
> > > > To: Liu, Yuan1 
> > > > Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> > > > de...@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com;
> > Zou,
> > > > Nanhai 
> > > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> > initialization of
> > > > qpl compression
> > > >
> > > > On Wed, Mar 20, 2024 at 04:23:01PM +, Liu, Yuan1 wrote:
> > > > > let me explain here, during the decompression operation of IAA,
> the
> > > > > decompressed data can be directly output to the virtual address of
> > the
> > > > > guest memory by IAA hardware.  It can avoid copying the
> decompressed
> > > > data
> > > > > to guest memory by CPU.
> > > >
> > > > I see.
> > > >
> > > > > Without -mem-prealloc, all the guest memory is not populated, and
> > IAA
> > > > > hardware needs to trigger I/O page fault first and then output the
> > > > > decompressed data to the guest memory region.  Besides that, CPU
> > page
> > > > > faults will also trigger IOTLB flush operation when IAA devices
> use
> > SVM.
> > > >
> > > > Oh so the IAA hardware already can use CPU pgtables?  Nice..
> > > >
> > > > Why IOTLB flush is needed?  AFAIU we're only installing new pages,
> the
> > > > request can either come from a CPU access or a DMA.  In all cases
> > there
> > > > should have no tearing down of an old page.  Isn't an iotlb flush
> only
> > > > needed if a tear down happens?
> > >
> > > As far as I know, IAA hardware uses SVM technology to use the CPU's
> page
> > table
> > > for address translation (IOMMU scalable mode directly accesses the CPU
> > page table).
> > > Therefore, when the CPU page table changes, the device's Invalidation
> > operation needs
> > > to be triggered to update the IOMMU and the device's cache.
> > >
> > > My current kernel version is mainline 6.2. The issue I see is as
> > follows:
> > > --Handle_mm_fault
> > >  |
> > >   -- wp_page_copy
> >
> > This is the CoW path.  Not usual at all..
> >
> > I assume this issue should only present on destination.  Then the guest
> > pages should be the destination of such DMAs to happen, which means
> these
> > should be write faults, and as we see here it is, otherwise it won't
> > trigger a CoW.
> >
> > However it's not clear to me why a pre-installed zero page existed.  It
> > means someone read the guest pages first.
> >
> > It might be interesting to know _why_ someone reads the guest pages,
> even
> > if we know they're all zeros.  If we can avoid such reads then it'll be
> a
> > hole rather than a prefaulted read on zero page, then invalidations are
> > not
> > needed, and I expect that should fix the iotlb storm issue.
> 
> The received pages will be read for zero pages check first. Although
> these pages are zero pages, and IAA hardware will not access them, the
> COW happens and causes following IOTLB flush operation. As far as I know,
> IOMMU quickly detects whether the address range has been used by the
> device,
> and does not invalidate the address that is not used by the device, this
> has
> not yet been resolved in Linux kernel 6.2. I will check the latest status
> for
> this.

I checked the Linux mainline 6.8 code; there are no big changes in this
area. In version 6.8, if the process needs to flush the MMU TLB, then an
I/O TLB flush will be

RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-21 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Thursday, March 21, 2024 11:28 PM
> To: Liu, Yuan1 
> Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou,
> Nanhai 
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Thu, Mar 21, 2024 at 01:37:36AM +, Liu, Yuan1 wrote:
> > > -Original Message-
> > > From: Peter Xu 
> > > Sent: Thursday, March 21, 2024 4:32 AM
> > > To: Liu, Yuan1 
> > > Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> > > de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com;
> Zou,
> > > Nanhai 
> > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> initialization of
> > > qpl compression
> > >
> > > On Wed, Mar 20, 2024 at 04:23:01PM +, Liu, Yuan1 wrote:
> > > > let me explain here, during the decompression operation of IAA, the
> > > > decompressed data can be directly output to the virtual address of
> the
> > > > guest memory by IAA hardware.  It can avoid copying the decompressed
> > > data
> > > > to guest memory by CPU.
> > >
> > > I see.
> > >
> > > > Without -mem-prealloc, all the guest memory is not populated, and
> IAA
> > > > hardware needs to trigger I/O page fault first and then output the
> > > > decompressed data to the guest memory region.  Besides that, CPU
> page
> > > > faults will also trigger IOTLB flush operation when IAA devices use
> SVM.
> > >
> > > Oh so the IAA hardware already can use CPU pgtables?  Nice..
> > >
> > > Why IOTLB flush is needed?  AFAIU we're only installing new pages, the
> > > request can either come from a CPU access or a DMA.  In all cases
> there
> > > should have no tearing down of an old page.  Isn't an iotlb flush only
> > > needed if a tear down happens?
> >
> > As far as I know, IAA hardware uses SVM technology to use the CPU's page
> table
> > for address translation (IOMMU scalable mode directly accesses the CPU
> page table).
> > Therefore, when the CPU page table changes, the device's Invalidation
> operation needs
> > to be triggered to update the IOMMU and the device's cache.
> >
> > My current kernel version is mainline 6.2. The issue I see is as
> follows:
> > --Handle_mm_fault
> >  |
> >   -- wp_page_copy
> 
> This is the CoW path.  Not usual at all..
> 
> I assume this issue should only present on destination.  Then the guest
> pages should be the destination of such DMAs to happen, which means these
> should be write faults, and as we see here it is, otherwise it won't
> trigger a CoW.
> 
> However it's not clear to me why a pre-installed zero page existed.  It
> means someone read the guest pages first.
> 
> It might be interesting to know _why_ someone reads the guest pages, even
> if we know they're all zeros.  If we can avoid such reads then it'll be a
> hole rather than a prefaulted read on zero page, then invalidations are
> not
> needed, and I expect that should fix the iotlb storm issue.

The received pages are first read for the zero page check. Although these
pages are zero pages and the IAA hardware will not access them, the CoW
happens and causes the subsequent IOTLB flush operation. As far as I know,
an IOMMU that quickly detects whether an address range has been used by the
device, and skips invalidating addresses the device never used, is not yet
available in Linux kernel 6.2. I will check the latest status for this.

void multifd_recv_zero_page_process(MultiFDRecvParams *p)
{
    for (int i = 0; i < p->zero_num; i++) {
        void *page = p->host + p->zero[i];
        /* This read of the guest page is what triggers the CoW above. */
        if (!buffer_is_zero(page, p->page_size)) {
            memset(page, 0, p->page_size);
        }
    }
}


> It'll still be good we can fix this first to not make qpl special from
> this
> regard, so that the hope is migration submodule shouldn't rely on any
> pre-config (-mem-prealloc) on guest memory behaviors to work properly.

Even if the IOTLB problem can be avoided, the I/O page fault problem remains
(normal pages are written by the IAA device, and resolving those page faults
through the IOMMU performs poorly).

The IAA device could output the decompressed data to pre-populated memory
instead of directly to the guest address, but then each multifd thread needs
two memory copies: one from the network to the (pre-populated) IAA input
buffer, and another from the (pre-populated) IAA output buffer to the guest
address, which may become a performance bottleneck at the destination during
the live migration process.

So I think it is still necessary to use the -mem-prealloc option.
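
For reference, one way to request that on the destination (an assumed
invocation, not taken from this thread) is the explicit memory-backend
syntax:

qemu-system-x86_64 -m 64G \
    -object memory-backend-ram,id=mem0,size=64G,prealloc=on \
    -machine memory-backend=mem0 ...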

> > -- mmu_notifier_invalidate_range
> >   |
> >   -- intel_invalidate_rage
> > |
> > -- qi_flush_piotlb
> > -- qi_flush_dev_iotlb_pasid
> 
> --
> Peter Xu



RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Thursday, March 21, 2024 4:32 AM
> To: Liu, Yuan1 
> Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou,
> Nanhai 
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Wed, Mar 20, 2024 at 04:23:01PM +, Liu, Yuan1 wrote:
> > let me explain here, during the decompression operation of IAA, the
> > decompressed data can be directly output to the virtual address of the
> > guest memory by IAA hardware.  It can avoid copying the decompressed
> data
> > to guest memory by CPU.
> 
> I see.
> 
> > Without -mem-prealloc, all the guest memory is not populated, and IAA
> > hardware needs to trigger I/O page fault first and then output the
> > decompressed data to the guest memory region.  Besides that, CPU page
> > faults will also trigger IOTLB flush operation when IAA devices use SVM.
> 
> Oh so the IAA hardware already can use CPU pgtables?  Nice..
> 
> Why IOTLB flush is needed?  AFAIU we're only installing new pages, the
> request can either come from a CPU access or a DMA.  In all cases there
> should have no tearing down of an old page.  Isn't an iotlb flush only
> needed if a tear down happens?

As far as I know, IAA hardware uses SVM technology to share the CPU's page
table for address translation (IOMMU scalable mode directly accesses the CPU
page table). Therefore, when the CPU page table changes, the device's
invalidation operation needs to be triggered to update the IOMMU and the
device's cache.

My current kernel version is mainline 6.2. The issue I see is as follows:
--Handle_mm_fault
 |
  -- wp_page_copy
|
-- mmu_notifier_invalidate_range
  |
  -- intel_invalidate_rage
|
-- qi_flush_piotlb
-- qi_flush_dev_iotlb_pasid
 

> > Due to the inability to quickly resolve a large number of IO page faults
> > and IOTLB flushes, the decompression throughput of the IAA device will
> > decrease significantly.
> 
> --
> Peter Xu



RE: [PATCH v5 7/7] tests/migration-test: add qpl compression test

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Daniel P. Berrangé 
> Sent: Wednesday, March 20, 2024 11:40 PM
> To: Liu, Yuan1 
> Cc: pet...@redhat.com; faro...@suse.de; qemu-devel@nongnu.org;
> hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou, Nanhai
> 
> Subject: Re: [PATCH v5 7/7] tests/migration-test: add qpl compression test
> 
> On Wed, Mar 20, 2024 at 03:30:40PM +, Liu, Yuan1 wrote:
> > > -Original Message-
> > > From: Daniel P. Berrangé 
> > > Sent: Wednesday, March 20, 2024 6:46 PM
> > > To: Liu, Yuan1 
> > > Cc: pet...@redhat.com; faro...@suse.de; qemu-devel@nongnu.org;
> > > hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou, Nanhai
> > > 
> > > Subject: Re: [PATCH v5 7/7] tests/migration-test: add qpl compression
> test
> > >
> > > On Wed, Mar 20, 2024 at 12:45:27AM +0800, Yuan Liu wrote:
> > > > add qpl to compression method test for multifd migration
> > > >
> > > > the migration with qpl compression needs to access IAA hardware
> > > > resource, please run "check-qtest" with sudo or root permission,
> > > > otherwise migration test will fail
> > >
> > > That's not an acceptable requirement.
> > >
> > > If someone builds QEMU with QPL, the migration test *must*
> > > pass 100% reliably when either running on a host without
> > > the QPL required hardware, or when lacking permissions.
> > >
> > > The test case needs to detect these scenarios and automatically
> > > skip the test if it is incapable of running successfully.
> > > This raises another question though. If QPL migration requires
> > > running as root, then it is effectively unusable for QEMU, as
> > > no sane deployment ever runs QEMU as root.
> > >
> > > Is there a way to make QPL work for non-root users ?
> >
> > There are two issues here
> > 1. I need to add an IAA resource detection before the QPL test begins
> >In this way, when QPL resources are unavailable, the live migration
> >test will not be affected.
> >
> > 2. I need to add some additional information about IAA configuration in
> >the devel/qpl-compression.rst documentation. In addition to
> configuring
> >IAA resources, the system administrator also needs to assign IAA
> resources
> >to user groups.
> >For example, the system administrator runs "chown -R user /dev/iax",
> then
> >all IAA resources can be accessed by "user", this method does not
> require
> >sudo or root permissions
> 
> Ok, so in the test suite you likely should do something
> approximately like
> 
> #ifdef CONFIG_QPL
>   if (access("/dev/iax", R_OK|W_OK) == 0) {
> migration_test_add("/migration/multifd/tcp/plain/qpl",
>test_multifd_tcp_qpl);
>   }
> #endif
> 
> possibly more if you need to actually query supported features
> of /dev/iax before trying to use it

Yes, thank you very much for the suggestion; I will fix this in the next version.
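
One possible shape for that detection, probing the hardware path with a
throwaway job (just a sketch; qpl_get_job_size()/qpl_init_job() are real QPL
entry points, the helper name is made up):

static bool multifd_qpl_hw_avail(void)
{
    uint32_t job_size = 0;
    qpl_job *job;
    bool ok;

    if (qpl_get_job_size(qpl_path_hardware, &job_size) != QPL_STS_OK) {
        return false;
    }
    job = g_malloc0(job_size);
    /* qpl_init_job() fails when no usable IAA device is present. */
    ok = (qpl_init_job(qpl_path_hardware, job) == QPL_STS_OK);
    if (ok) {
        qpl_fini_job(job);
    }
    g_free(job);
    return ok;
}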

> > > > Signed-off-by: Yuan Liu 
> > > > Reviewed-by: Nanhai Zou 
> > > > ---
> > > >  tests/qtest/migration-test.c | 24 
> > > >  1 file changed, 24 insertions(+)
> > > >
> > > > diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-
> test.c
> > > > index 71895abb7f..052d0d60fd 100644
> > > > --- a/tests/qtest/migration-test.c
> > > > +++ b/tests/qtest/migration-test.c
> > > > @@ -2815,6 +2815,15 @@
> > > test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from,
> > > >  }
> > > >  #endif /* CONFIG_ZSTD */
> > > >
> > > > +#ifdef CONFIG_QPL
> > > > +static void *
> > > > +test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from,
> > > > +QTestState *to)
> > > > +{
> > > > +return test_migrate_precopy_tcp_multifd_start_common(from, to,
> > > "qpl");
> > > > +}
> > > > +#endif /* CONFIG_QPL */
> > > > +
> > > >  static void test_multifd_tcp_none(void)
> > > >  {
> > > >  MigrateCommon args = {
> > > > @@ -2880,6 +2889,17 @@ static void test_multifd_tcp_zstd(void)
> > > >  }
> > > >  #endif
> > > >
> > > > +#ifdef CONFIG_QPL
> > > > +static void test_multifd_tcp_qpl(void)
> > > > +{

RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Wednesday, March 20, 2024 11:35 PM
> To: Liu, Yuan1 
> Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou,
> Nanhai 
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Wed, Mar 20, 2024 at 03:02:59PM +, Liu, Yuan1 wrote:
> > > > +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> > > > +{
> > > > +int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> > > > +uint32_t size = qpl->job_num * qpl->data_size;
> > > > +uint8_t *buf;
> > > > +
> > > > +buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE,
> flags, -
> > > 1, 0);
> > > > +if (buf == MAP_FAILED) {
> > > > +error_setg(errp, "multifd: %u: alloc_zbuf failed, job
> num %u,
> > > size %u",
> > > > +   chan_id, qpl->job_num, qpl->data_size);
> > > > +return -1;
> > > > +}
> > >
> > > What's the reason for using mmap here, rather than a normal
> > > malloc ?
> >
> > I want to populate the memory accessed by the IAA device in the
> initialization
> > phase, and then avoid initiating I/O page faults through the IAA device
> during
> > migration, a large number of I/O page faults are not good for
> performance.
> 
> mmap() doesn't populate pages, unless with MAP_POPULATE.  And even with
> that it shouldn't be guaranteed, as the populate phase should ignore all
> errors.
> 
>    MAP_POPULATE (since Linux 2.5.46)
>           Populate (prefault) page tables for a mapping.  For a file
>           mapping, this causes read-ahead on the file.  This will help
>           to reduce blocking on page faults later.  The mmap() call
>           doesn't fail if the mapping cannot be populated (for example,
>           due to limitations on the number of mapped huge pages when
>           using MAP_HUGETLB).  Support for MAP_POPULATE in conjunction
>           with private mappings was added in Linux 2.6.23.
> 
> OTOH, I think g_malloc0() should guarantee to prefault everything in as
> long as the call returned (even though they can be swapped out later, but
> that applies to all cases anyway).

Thanks, Peter. I will try the g_malloc0 method here.
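
A small sketch of the two allocation approaches under discussion
(illustrative only, not the actual patch code):

#include <glib.h>
#include <sys/mman.h>

/* Best-effort prefault: mmap() succeeds even if population fails. */
static void *alloc_buf_populate(size_t size)
{
    void *buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
    return buf == MAP_FAILED ? NULL : buf;
}

/*
 * Guaranteed prefault: g_malloc0() writes zeros to the whole buffer,
 * so every page has been faulted in by the time the call returns
 * (pages may of course still be swapped out later).
 */
static void *alloc_buf_prefaulted(size_t size)
{
    return g_malloc0(size);
}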

> > This problem also occurs at the destination, therefore, I recommend that
> > customers need to add -mem-prealloc for destination boot parameters.
> 
> I'm not sure what issue you hit when testing it, but -mem-prealloc flag
> should only control the guest memory backends not the buffers that QEMU
> internally use, afaiu.
> 
> Thanks,
> 
> --
> Peter Xu

Let me explain here: during the IAA decompression operation, the decompressed
data can be output directly to the virtual address of the guest memory by the
IAA hardware. This avoids copying the decompressed data to guest memory with
the CPU.

Without -mem-prealloc, none of the guest memory is populated, so the IAA
hardware needs to trigger an I/O page fault first and then output the
decompressed data to the guest memory region. Besides that, CPU page faults
will also trigger IOTLB flush operations when IAA devices use SVM.

Because a large number of I/O page faults and IOTLB flushes cannot be
resolved quickly, the decompression throughput of the IAA device decreases
significantly.



RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Daniel P. Berrangé 
> Sent: Wednesday, March 20, 2024 11:21 PM
> To: Liu, Yuan1 
> Cc: pet...@redhat.com; faro...@suse.de; qemu-devel@nongnu.org;
> hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou, Nanhai
> 
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Wed, Mar 20, 2024 at 03:02:59PM +, Liu, Yuan1 wrote:
> > > -Original Message-
> > > From: Daniel P. Berrangé 
> > > Sent: Wednesday, March 20, 2024 6:42 PM
> > > To: Liu, Yuan1 
> > > Cc: pet...@redhat.com; faro...@suse.de; qemu-devel@nongnu.org;
> > > hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou, Nanhai
> > > 
> > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> initialization of
> > > qpl compression
> > >
> > > On Wed, Mar 20, 2024 at 12:45:25AM +0800, Yuan Liu wrote:
> > > > the qpl initialization includes memory allocation for compressed
> > > > data and the qpl job initialization.
> > > >
> > > > the qpl initialization will check whether the In-Memory Analytics
> > > > Accelerator(IAA) hardware is available, if the platform does not
> > > > have IAA hardware or the IAA hardware is not available, the QPL
> > > > compression initialization will fail.
> > > >
> > > > Signed-off-by: Yuan Liu 
> > > > Reviewed-by: Nanhai Zou 
> > > > ---
> > > >  migration/multifd-qpl.c | 243
> +++-
> > > >  1 file changed, 242 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > > > index 056a68a060..6de65e9da7 100644
> > > > --- a/migration/multifd-qpl.c
> > > > +++ b/migration/multifd-qpl.c
> > > > @@ -9,12 +9,253 @@
> > > >   * This work is licensed under the terms of the GNU GPL, version 2
> or
> > > later.
> > > >   * See the COPYING file in the top-level directory.
> > > >   */
> > > > +
> > > >  #include "qemu/osdep.h"
> > > >  #include "qemu/module.h"
> > > > +#include "qapi/error.h"
> > > > +#include "migration.h"
> > > > +#include "multifd.h"
> > > > +#include "qpl/qpl.h"
> > > > +
> > > > +typedef struct {
> > > > +qpl_job **job_array;
> > > > +/* the number of allocated jobs */
> > > > +uint32_t job_num;
> > > > +/* the size of data processed by a qpl job */
> > > > +uint32_t data_size;
> > > > +/* compressed data buffer */
> > > > +uint8_t *zbuf;
> > > > +/* the length of compressed data */
> > > > +uint32_t *zbuf_hdr;
> > > > +} QplData;
> > > > +
> > > > +static void free_zbuf(QplData *qpl)
> > > > +{
> > > > +if (qpl->zbuf != NULL) {
> > > > +munmap(qpl->zbuf, qpl->job_num * qpl->data_size);
> > > > +qpl->zbuf = NULL;
> > > > +}
> > > > +if (qpl->zbuf_hdr != NULL) {
> > > > +g_free(qpl->zbuf_hdr);
> > > > +qpl->zbuf_hdr = NULL;
> > > > +}
> > > > +}
> > > > +
> > > > +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> > > > +{
> > > > +int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> > > > +uint32_t size = qpl->job_num * qpl->data_size;
> > > > +uint8_t *buf;
> > > > +
> > > > +buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE,
> flags, -
> > > 1, 0);
> > > > +if (buf == MAP_FAILED) {
> > > > +error_setg(errp, "multifd: %u: alloc_zbuf failed, job
> num %u,
> > > size %u",
> > > > +   chan_id, qpl->job_num, qpl->data_size);
> > > > +return -1;
> > > > +}
> > >
> > > What's the reason for using mmap here, rather than a normal
> > > malloc ?
> >
> > I want to populate the memory accessed by the IAA device in the
> initialization
> > phase, and then avoid initiating I/O page faults through the IAA device
> during
> > migration, a large number of I/O page faults are not good for
> performance.
> 
> Does this mmap actually make a measurabl

RE: [PATCH v5 2/7] migration/multifd: put IOV initialization into compression method

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Wednesday, March 20, 2024 11:19 PM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v5 2/7] migration/multifd: put IOV initialization into
> compression method
> 
> Yuan Liu  writes:
> 
> > Different compression methods may require different numbers of IOVs.
> > Based on streaming compression of zlib and zstd, all pages will be
> > compressed to a data block, so two IOVs are needed for packet header
> > and compressed data block.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  migration/multifd-zlib.c | 4 
> >  migration/multifd-zstd.c | 6 +-
> >  migration/multifd.c  | 8 +---
> >  3 files changed, 14 insertions(+), 4 deletions(-)
> >
> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > index 99821cd4d5..8095ef8e28 100644
> > --- a/migration/multifd-zlib.c
> > +++ b/migration/multifd-zlib.c
> > @@ -70,6 +70,10 @@ static int zlib_send_setup(MultiFDSendParams *p,
> Error **errp)
> >  goto err_free_zbuff;
> >  }
> >  p->compress_data = z;
> > +
> > +assert(p->iov == NULL);
> > +/* For packet header and zlib streaming compression block */
> > +p->iov = g_new0(struct iovec, 2);
> >  return 0;
> >
> >  err_free_zbuff:
> > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> > index 02112255ad..9c9217794e 100644
> > --- a/migration/multifd-zstd.c
> > +++ b/migration/multifd-zstd.c
> > @@ -52,7 +52,6 @@ static int zstd_send_setup(MultiFDSendParams *p, Error
> **errp)
> >  struct zstd_data *z = g_new0(struct zstd_data, 1);
> >  int res;
> >
> > -p->compress_data = z;
> >  z->zcs = ZSTD_createCStream();
> >  if (!z->zcs) {
> >  g_free(z);
> > @@ -77,6 +76,11 @@ static int zstd_send_setup(MultiFDSendParams *p,
> Error **errp)
> >  error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
> >  return -1;
> >  }
> > +p->compress_data = z;
> > +
> > +assert(p->iov == NULL);
> > +/* For packet header and zstd streaming compression block */
> > +p->iov = g_new0(struct iovec, 2);
> >  return 0;
> >  }
> >
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index 0179422f6d..5155e02ae3 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -1181,9 +1181,11 @@ bool multifd_send_setup(void)
> >  p->packet = g_malloc0(p->packet_len);
> >  p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
> >  p->packet->version = cpu_to_be32(MULTIFD_VERSION);
> > -
> > -/* We need one extra place for the packet header */
> > -p->iov = g_new0(struct iovec, page_count + 1);
> > +/* IOVs are initialized in send_setup of compression method
> */
> > +if (!migrate_multifd_compression()) {
> > +/* We need one extra place for the packet header */
> > +p->iov = g_new0(struct iovec, page_count + 1);
> > +}
> 
> This^ should go into nocomp_send_setup:
> 
> static int nocomp_send_setup(MultiFDSendParams *p, Error **errp)
> {
> if (migrate_zero_copy_send()) {
> p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
> }
> 
> if (multifd_use_packets()) {
> /* We need one extra place for the packet header */
> p->iov = g_new0(struct iovec, p->page_count + 1);
> } else {
> p->iov = g_new0(struct iovec, p->page_count);
> }
> 
> return 0;
> }

Yes, this is better. I will fix this in the next version;
thanks for your comments.

> >  } else {
> >  p->iov = g_new0(struct iovec, page_count);
> >  }



RE: [PATCH v5 7/7] tests/migration-test: add qpl compression test

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Daniel P. Berrangé 
> Sent: Wednesday, March 20, 2024 6:46 PM
> To: Liu, Yuan1 
> Cc: pet...@redhat.com; faro...@suse.de; qemu-devel@nongnu.org;
> hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou, Nanhai
> 
> Subject: Re: [PATCH v5 7/7] tests/migration-test: add qpl compression test
> 
> On Wed, Mar 20, 2024 at 12:45:27AM +0800, Yuan Liu wrote:
> > add qpl to compression method test for multifd migration
> >
> > the migration with qpl compression needs to access IAA hardware
> > resource, please run "check-qtest" with sudo or root permission,
> > otherwise migration test will fail
> 
> That's not an acceptable requirement.
> 
> If someone builds QEMU with QPL, the migration test *must*
> pass 100% reliably when either running on a host without
> the QPL required hardware, or when lacking permissions.
> 
> The test case needs to detect these scenarios and automatically
> skip the test if it is incapable of running successfully.
> This raises another question though. If QPL migration requires
> running as root, then it is effectively unusable for QEMU, as
> no sane deployment ever runs QEMU as root.
> 
> Is there a way to make QPL work for non-root users ?

There are two issues here:
1. I need to add IAA resource detection before the QPL test begins.
   In this way, when QPL resources are unavailable, the live migration
   test will not be affected.

2. I need to add some additional information about IAA configuration to
   the devel/qpl-compression.rst documentation. In addition to configuring
   IAA resources, the system administrator also needs to assign IAA
   resources to user groups.
   For example, if the system administrator runs "chown -R user /dev/iax",
   then all IAA resources can be accessed by "user"; this method does not
   require sudo or root permissions.

> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  tests/qtest/migration-test.c | 24 
> >  1 file changed, 24 insertions(+)
> >
> > diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> > index 71895abb7f..052d0d60fd 100644
> > --- a/tests/qtest/migration-test.c
> > +++ b/tests/qtest/migration-test.c
> > @@ -2815,6 +2815,15 @@
> test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from,
> >  }
> >  #endif /* CONFIG_ZSTD */
> >
> > +#ifdef CONFIG_QPL
> > +static void *
> > +test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from,
> > +QTestState *to)
> > +{
> > +return test_migrate_precopy_tcp_multifd_start_common(from, to,
> "qpl");
> > +}
> > +#endif /* CONFIG_QPL */
> > +
> >  static void test_multifd_tcp_none(void)
> >  {
> >  MigrateCommon args = {
> > @@ -2880,6 +2889,17 @@ static void test_multifd_tcp_zstd(void)
> >  }
> >  #endif
> >
> > +#ifdef CONFIG_QPL
> > +static void test_multifd_tcp_qpl(void)
> > +{
> > +MigrateCommon args = {
> > +.listen_uri = "defer",
> > +.start_hook = test_migrate_precopy_tcp_multifd_qpl_start,
> > +};
> > +test_precopy_common(&args);
> > +}
> > +#endif
> > +
> >  #ifdef CONFIG_GNUTLS
> >  static void *
> >  test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from,
> > @@ -3789,6 +3809,10 @@ int main(int argc, char **argv)
> >  migration_test_add("/migration/multifd/tcp/plain/zstd",
> > test_multifd_tcp_zstd);
> >  #endif
> > +#ifdef CONFIG_QPL
> > +migration_test_add("/migration/multifd/tcp/plain/qpl",
> > +   test_multifd_tcp_qpl);
> > +#endif
> >  #ifdef CONFIG_GNUTLS
> >  migration_test_add("/migration/multifd/tcp/tls/psk/match",
> > test_multifd_tcp_tls_psk_match);
> > --
> > 2.39.3
> >
> >
> 
> With regards,
> Daniel
> --
> |: https://berrange.com  -o-
> https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-
> https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-
> https://www.instagram.com/dberrange :|



RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Daniel P. Berrangé 
> Sent: Wednesday, March 20, 2024 6:42 PM
> To: Liu, Yuan1 
> Cc: pet...@redhat.com; faro...@suse.de; qemu-devel@nongnu.org;
> hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou, Nanhai
> 
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Wed, Mar 20, 2024 at 12:45:25AM +0800, Yuan Liu wrote:
> > the qpl initialization includes memory allocation for compressed
> > data and the qpl job initialization.
> >
> > the qpl initialization will check whether the In-Memory Analytics
> > Accelerator(IAA) hardware is available, if the platform does not
> > have IAA hardware or the IAA hardware is not available, the QPL
> > compression initialization will fail.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  migration/multifd-qpl.c | 243 +++-
> >  1 file changed, 242 insertions(+), 1 deletion(-)
> >
> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > index 056a68a060..6de65e9da7 100644
> > --- a/migration/multifd-qpl.c
> > +++ b/migration/multifd-qpl.c
> > @@ -9,12 +9,253 @@
> >   * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> >   * See the COPYING file in the top-level directory.
> >   */
> > +
> >  #include "qemu/osdep.h"
> >  #include "qemu/module.h"
> > +#include "qapi/error.h"
> > +#include "migration.h"
> > +#include "multifd.h"
> > +#include "qpl/qpl.h"
> > +
> > +typedef struct {
> > +qpl_job **job_array;
> > +/* the number of allocated jobs */
> > +uint32_t job_num;
> > +/* the size of data processed by a qpl job */
> > +uint32_t data_size;
> > +/* compressed data buffer */
> > +uint8_t *zbuf;
> > +/* the length of compressed data */
> > +uint32_t *zbuf_hdr;
> > +} QplData;
> > +
> > +static void free_zbuf(QplData *qpl)
> > +{
> > +if (qpl->zbuf != NULL) {
> > +munmap(qpl->zbuf, qpl->job_num * qpl->data_size);
> > +qpl->zbuf = NULL;
> > +}
> > +if (qpl->zbuf_hdr != NULL) {
> > +g_free(qpl->zbuf_hdr);
> > +qpl->zbuf_hdr = NULL;
> > +}
> > +}
> > +
> > +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> > +{
> > +int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> > +uint32_t size = qpl->job_num * qpl->data_size;
> > +uint8_t *buf;
> > +
> > +buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -
> 1, 0);
> > +if (buf == MAP_FAILED) {
> > +error_setg(errp, "multifd: %u: alloc_zbuf failed, job num %u,
> size %u",
> > +   chan_id, qpl->job_num, qpl->data_size);
> > +return -1;
> > +}
> 
> What's the reason for using mmap here, rather than a normal
> malloc ?

I want to populate the memory accessed by the IAA device during the
initialization phase, and thereby avoid triggering I/O page faults through
the IAA device during migration; a large number of I/O page faults is bad
for performance.

This problem also occurs on the destination side, so I recommend that
customers add -mem-prealloc to the destination boot parameters.
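
For illustration, here is a minimal sketch (my own, not the patch code) of
the allocation pattern in question; with MAP_POPULATE the kernel faults in
all backing pages up front, so the device never takes an I/O page fault on
this buffer later:

    #include <stddef.h>
    #include <stdint.h>
    #include <sys/mman.h>

    static uint8_t *alloc_prefaulted(size_t size)
    {
        /* MAP_POPULATE (Linux-specific) pre-faults every backing page at
         * allocation time, unlike malloc(), which typically maps pages
         * lazily on first access. */
        int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE;
        uint8_t *buf = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);

        return buf == MAP_FAILED ? NULL : buf;
    }
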

> > +qpl->zbuf = buf;
> > +qpl->zbuf_hdr = g_new0(uint32_t, qpl->job_num);
> > +return 0;
> > +}
> > +
> > +static void free_jobs(QplData *qpl)
> > +{
> > +for (int i = 0; i < qpl->job_num; i++) {
> > +qpl_fini_job(qpl->job_array[i]);
> > +g_free(qpl->job_array[i]);
> > +qpl->job_array[i] = NULL;
> > +}
> > +g_free(qpl->job_array);
> > +qpl->job_array = NULL;
> > +}
> > +
> > +static int alloc_jobs(QplData *qpl, uint8_t chan_id, Error **errp)
> > +{
> > +qpl_status status;
> > +uint32_t job_size = 0;
> > +qpl_job *job = NULL;
> > +/* always use IAA hardware accelerator */
> > +qpl_path_t path = qpl_path_hardware;
> > +
> > +status = qpl_get_job_size(path, &job_size);
> > +if (status != QPL_STS_OK) {
> > +error_setg(errp, "multifd: %u: qpl_get_job_size failed with
> error %d",
> > +   chan_id, status);
> > +return -1;
> > +}
> > +qpl->job_array = g_new0(qpl_job *, qpl->job_num);
> >

RE: [PATCH v5 3/7] configure: add --enable-qpl build option

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Daniel P. Berrangé 
> Sent: Wednesday, March 20, 2024 6:31 PM
> To: Liu, Yuan1 
> Cc: pet...@redhat.com; faro...@suse.de; qemu-devel@nongnu.org;
> hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou, Nanhai
> 
> Subject: Re: [PATCH v5 3/7] configure: add --enable-qpl build option
> 
> On Wed, Mar 20, 2024 at 12:45:23AM +0800, Yuan Liu wrote:
> > add --enable-qpl and --disable-qpl options to enable and disable
> > the QPL compression method for multifd migration.
> >
> > the Query Processing Library (QPL) is an open-source library
> > that supports data compression and decompression features.
> >
> > The QPL compression is based on the deflate compression algorithm
> > and use Intel In-Memory Analytics Accelerator(IAA) hardware for
> > compression and decompression acceleration.
> >
> > Please refer to the following for more information about QPL
> >
> https://intel.github.io/qpl/documentation/introduction_docs/introduction.h
> tml
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  meson.build   | 16 
> >  meson_options.txt |  2 ++
> >  scripts/meson-buildoptions.sh |  3 +++
> >  3 files changed, 21 insertions(+)
> >
> > diff --git a/meson.build b/meson.build
> > index b375248a76..bee7dcd53b 100644
> > --- a/meson.build
> > +++ b/meson.build
> > @@ -1200,6 +1200,20 @@ if not get_option('zstd').auto() or have_block
> >  required: get_option('zstd'),
> >  method: 'pkg-config')
> >  endif
> > +qpl = not_found
> > +if not get_option('qpl').auto()
> > +  libqpl = cc.find_library('qpl', required: false)
> > +  if not libqpl.found()
> > +error('libqpl not found, please install it from ' +
> > +
> 'https://intel.github.io/qpl/documentation/get_started_docs/installation.h
> tml')
> > +  endif
> > +  libaccel = dependency('libaccel-config', version: '>=4.0.0',
> > +required: true,
> > +method: 'pkg-config')
> > +  qpl = declare_dependency(dependencies: [libqpl, libaccel,
> > +cc.find_library('dl', required: get_option('qpl'))],
> > +link_args: ['-lstdc++'])
> > +endif
> 
> Are either of these libraries present in any mainstream Linux
> distro ? If not, then this feature will not get any CI build
> coverage from QEMU.
> 
> Needing to manually add '-lstdc++' & '-ldl' is presumably a
> requirement from 'qpl'. As a future enhancement it would be
> much better if 'qpl' provided a pkg-config file, so this
> list of dependencies didn't have to be hardcoded by apps
> using qpl
> 
> 
> With regards,
> Daniel

Hi Daniel

Thanks for your comments. QPL has not been integrated into any
mainstream Linux distro yet; I am actively pushing for QPL to ship a
pkg-config file and to get it integrated into the distributions.

QPL will support these soon, and I will use pkg-config in the next
version to resolve the QPL build dependency issue.
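
As a rough illustration of what such a file could absorb (purely
hypothetical field values; QPL does not ship a .pc file today), a qpl.pc
along these lines would let the hardcoded -lstdc++/-ldl link args and the
libaccel-config requirement move out of QEMU's meson.build:

    prefix=/usr/local
    libdir=${prefix}/lib64
    includedir=${prefix}/include

    Name: qpl
    Description: Intel Query Processing Library
    Version: 1.0.0
    Requires.private: libaccel-config >= 4.0.0
    Libs: -L${libdir} -lqpl
    Libs.private: -lstdc++ -ldl
    Cflags: -I${includedir}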


RE: [PATCH v5 3/7] configure: add --enable-qpl build option

2024-03-20 Thread Liu, Yuan1
> -Original Message-
> From: Thomas Huth 
> Sent: Wednesday, March 20, 2024 4:57 PM
> To: Liu, Yuan1 ; pet...@redhat.com; faro...@suse.de
> Cc: qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Zou, Nanhai 
> Subject: Re: [PATCH v5 3/7] configure: add --enable-qpl build option
> 
> On 20/03/2024 09.55, Thomas Huth wrote:
> > On 19/03/2024 17.45, Yuan Liu wrote:
> >> add --enable-qpl and --disable-qpl options to enable and disable
> >> the QPL compression method for multifd migration.
> >>
> >> the Query Processing Library (QPL) is an open-source library
> >> that supports data compression and decompression features.
> >>
> >> The QPL compression is based on the deflate compression algorithm
> >> and use Intel In-Memory Analytics Accelerator(IAA) hardware for
> >> compression and decompression acceleration.
> >>
> >> Please refer to the following for more information about QPL
> >>
> https://intel.github.io/qpl/documentation/introduction_docs/introduction.h
> tml
> >>
> >> Signed-off-by: Yuan Liu 
> >> Reviewed-by: Nanhai Zou 
> >> ---
> >>   meson.build   | 16 
> >>   meson_options.txt |  2 ++
> >>   scripts/meson-buildoptions.sh |  3 +++
> >>   3 files changed, 21 insertions(+)
> >>
> >> diff --git a/meson.build b/meson.build
> >> index b375248a76..bee7dcd53b 100644
> >> --- a/meson.build
> >> +++ b/meson.build
> >> @@ -1200,6 +1200,20 @@ if not get_option('zstd').auto() or have_block
> >>   required: get_option('zstd'),
> >>   method: 'pkg-config')
> >>   endif
> >> +qpl = not_found
> >> +if not get_option('qpl').auto()
> >
> > Do you really only want to enable this if the user explicitly specified
> > "--enable-qpl" ? Otherwise, I think this should be:
> >
> >   if not get_option('qpl').auto() or have_system
> >
> > ?
> >
> >   Thomas
> >
> >
> >
> >
> >> +  libqpl = cc.find_library('qpl', required: false)
> 
> ... and it should use "required: get_option('qpl')" in that case.
> 
>   Thomas

Hi Thomas

Thanks for your comments. You are right, I need to add have_system
and check get_option('qpl') here; I will fix this in the next version.

> >> +  if not libqpl.found()
> >> +    error('libqpl not found, please install it from ' +
> >> +
> >>
> 'https://intel.github.io/qpl/documentation/get_started_docs/installation.h
> tml')
> >> +  endif
> >> +  libaccel = dependency('libaccel-config', version: '>=4.0.0',
> >> +    required: true,
> >> +    method: 'pkg-config')
> >> +  qpl = declare_dependency(dependencies: [libqpl, libaccel,
> >> +    cc.find_library('dl', required: get_option('qpl'))],
> >> +    link_args: ['-lstdc++'])
> >> +endif
> >>   virgl = not_found
> >



RE: [PATCH v4 3/8] configure: add --enable-qpl build option

2024-03-06 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Wednesday, March 6, 2024 7:56 PM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Zou, Nanhai 
> Subject: RE: [PATCH v4 3/8] configure: add --enable-qpl build option
> 
> "Liu, Yuan1"  writes:
> 
> >> -Original Message-
> >> From: Fabiano Rosas 
> >> Sent: Wednesday, March 6, 2024 4:32 AM
> >> To: Liu, Yuan1 ; pet...@redhat.com
> >> Cc: qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> >> bryan.zh...@bytedance.com; Liu, Yuan1 ; Zou,
> Nanhai
> >> 
> >> Subject: Re: [PATCH v4 3/8] configure: add --enable-qpl build option
> >>
> >> Yuan Liu  writes:
> >>
> >> > add --enable-qpl and --disable-qpl options to enable and disable
> >> > the QPL compression method for multifd migration.
> >> >
> >> > the Query Processing Library (QPL) is an open-source library
> >> > that supports data compression and decompression features.
> >> >
> >> > The QPL compression is based on the deflate compression algorithm
> >> > and use Intel In-Memory Analytics Accelerator(IAA) hardware for
> >> > compression and decompression acceleration.
> >> >
> >> > Please refer to the following for more information about QPL
> >> >
> >>
> https://intel.github.io/qpl/documentation/introduction_docs/introduction.h
> >> tml
> >> >
> >> > Signed-off-by: Yuan Liu 
> >> > Reviewed-by: Nanhai Zou 
> >> > ---
> >> >  meson.build   | 18 ++
> >> >  meson_options.txt |  2 ++
> >> >  scripts/meson-buildoptions.sh |  3 +++
> >> >  3 files changed, 23 insertions(+)
> >> >
> >> > diff --git a/meson.build b/meson.build
> >> > index c1dc83e4c0..2dea1e6834 100644
> >> > --- a/meson.build
> >> > +++ b/meson.build
> >> > @@ -1197,6 +1197,22 @@ if not get_option('zstd').auto() or have_block
> >> >  required: get_option('zstd'),
> >> >  method: 'pkg-config')
> >> >  endif
> >> > +qpl = not_found
> >> > +if not get_option('qpl').auto()
> >> > +  libqpl = cc.find_library('qpl', required: false)
> >> > +  if not libqpl.found()
> >> > +error('libqpl not found, please install it from ' +
> >> > +
> >>
> 'https://intel.github.io/qpl/documentation/get_started_docs/installation.h
> >> tml')
> >> > +  endif
> >> > +  libaccel = cc.find_library('accel-config', required: false)
> >> > +  if not libaccel.found()
> >> > +error('libaccel-config not found, please install it from ' +
> >> > +'https://github.com/intel/idxd-config')
> >>
> >> accel-config seems to be packaged by many distros, I'm not sure we need
> >> to reference the repository here.
> >>
> >> https://repology.org/project/accel-config/versions
> >
> > Yes, accel-config has been added to many distributions, I will use
> pkgconfig to
> > detect the libaccel and the version(at least v4.0).
> >
> > I have a question, I didn't find accel-config installation package from
> > https://repology.org/project/accel-config/versions. Does using this link
> also
> > require the user to build an accel-config package, and then install it?
> 
> That is just an aggregated list of distros and the version of the
> package they provide in their repos. So I'm just pointing out to you
> that there seems to be a packaged accel-config for most distros
> already. Which means we just want to say "install accel-config" and
> users should be able to use their distro's package manager.
> 
> >
> > It is easy to install accel-config using the installation package, but I
> didn't
> > find a repo that provides accel-config installation packages for most
> distributions.
> >
> > First check accel-config is available through pktconfig, and if it is
> not available,
> > prompts users to install it from https://github.com/intel/idxd-config,
> is it OK?
> 
> There's no need, just check if its available and suggest the user to
> install it. We already have the link in the docs.

Got it, thanks~

> >
> >> > +  endif
> >> > +  qpl = decl

RE: [PATCH v4 4/8] migration/multifd: add qpl compression method

2024-03-05 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Wednesday, March 6, 2024 4:58 AM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v4 4/8] migration/multifd: add qpl compression method
> 
> Yuan Liu  writes:
> 
> > add the Query Processing Library (QPL) compression method
> >
> > Although both qpl and zlib support deflate compression, qpl will
> > only use the In-Memory Analytics Accelerator(IAA) for compression
> > and decompression, and IAA is not compatible with the Zlib in
> > migration, so qpl is used as a new compression method for migration.
> >
> > How to enable qpl compression during migration:
> > migrate_set_parameter multifd-compression qpl
> >
> > The qpl only supports one compression level, there is no qpl
> > compression level parameter added, users do not need to specify
> > the qpl compression level.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  hw/core/qdev-properties-system.c |   2 +-
> >  migration/meson.build|   1 +
> >  migration/multifd-qpl.c  | 158 +++
> >  migration/multifd.h  |   1 +
> >  qapi/migration.json  |   7 +-
> >  5 files changed, 167 insertions(+), 2 deletions(-)
> >  create mode 100644 migration/multifd-qpl.c
> >
> > diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-
> system.c
> > index 1a396521d5..b4f0e5cbdb 100644
> > --- a/hw/core/qdev-properties-system.c
> > +++ b/hw/core/qdev-properties-system.c
> > @@ -658,7 +658,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = {
> >  const PropertyInfo qdev_prop_multifd_compression = {
> >  .name = "MultiFDCompression",
> >  .description = "multifd_compression values, "
> > -   "none/zlib/zstd",
> > +   "none/zlib/zstd/qpl",
> >  .enum_table = &MultiFDCompression_lookup,
> >  .get = qdev_propinfo_get_enum,
> >  .set = qdev_propinfo_set_enum,
> > diff --git a/migration/meson.build b/migration/meson.build
> > index 92b1cc4297..c155c2d781 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -40,6 +40,7 @@ if get_option('live_block_migration').allowed()
> >system_ss.add(files('block.c'))
> >  endif
> >  system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
> > +system_ss.add(when: qpl, if_true: files('multifd-qpl.c'))
> >
> >  specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
> >  if_true: files('ram.c',
> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > new file mode 100644
> > index 00..6b94e732ac
> > --- /dev/null
> > +++ b/migration/multifd-qpl.c
> > @@ -0,0 +1,158 @@
> > +/*
> > + * Multifd qpl compression accelerator implementation
> > + *
> > + * Copyright (c) 2023 Intel Corporation
> > + *
> > + * Authors:
> > + *  Yuan Liu
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qemu/rcu.h"
> > +#include "exec/ramblock.h"
> > +#include "exec/target_page.h"
> > +#include "qapi/error.h"
> > +#include "migration.h"
> > +#include "trace.h"
> > +#include "options.h"
> > +#include "multifd.h"
> > +#include "qpl/qpl.h"
> 
> I don't mind adding a skeleton upfront before adding the implementation,
> but adding the headers here hurts the review process. Reviewers will
> have to go digging through the next patches to be able to validate each
> of these. It's better to include them along with their usage.
> 
> What I would do in this patch is maybe just add the new option, the
> .json and meson changes and this file with just:
> 
> static void multifd_qpl_register(void)
> {
> /* noop */
> }
> 
> Then in the next commit you can implement all the methods in one
> go. That way, the docstrings come along with the implementation, which
> also facilitates review.

Thanks for the guidance; I will implement it in the next version.

> > +
> > +struct qpl_data {
> 
> typedef struct {} QplData/QPLData, foll

RE: [PATCH v4 3/8] configure: add --enable-qpl build option

2024-03-05 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Wednesday, March 6, 2024 4:32 AM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v4 3/8] configure: add --enable-qpl build option
> 
> Yuan Liu  writes:
> 
> > add --enable-qpl and --disable-qpl options to enable and disable
> > the QPL compression method for multifd migration.
> >
> > the Query Processing Library (QPL) is an open-source library
> > that supports data compression and decompression features.
> >
> > The QPL compression is based on the deflate compression algorithm
> > and use Intel In-Memory Analytics Accelerator(IAA) hardware for
> > compression and decompression acceleration.
> >
> > Please refer to the following for more information about QPL
> >
> https://intel.github.io/qpl/documentation/introduction_docs/introduction.h
> tml
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  meson.build   | 18 ++
> >  meson_options.txt |  2 ++
> >  scripts/meson-buildoptions.sh |  3 +++
> >  3 files changed, 23 insertions(+)
> >
> > diff --git a/meson.build b/meson.build
> > index c1dc83e4c0..2dea1e6834 100644
> > --- a/meson.build
> > +++ b/meson.build
> > @@ -1197,6 +1197,22 @@ if not get_option('zstd').auto() or have_block
> >  required: get_option('zstd'),
> >  method: 'pkg-config')
> >  endif
> > +qpl = not_found
> > +if not get_option('qpl').auto()
> > +  libqpl = cc.find_library('qpl', required: false)
> > +  if not libqpl.found()
> > +error('libqpl not found, please install it from ' +
> > +
> 'https://intel.github.io/qpl/documentation/get_started_docs/installation.h
> tml')
> > +  endif
> > +  libaccel = cc.find_library('accel-config', required: false)
> > +  if not libaccel.found()
> > +error('libaccel-config not found, please install it from ' +
> > +'https://github.com/intel/idxd-config')
> 
> accel-config seems to be packaged by many distros, I'm not sure we need
> to reference the repository here.
> 
> https://repology.org/project/accel-config/versions

Yes, accel-config has been added to many distributions. I will use
pkg-config to detect libaccel and its version (at least v4.0).

I have a question: I didn't find an accel-config installation package at
https://repology.org/project/accel-config/versions. Does using this link
also require the user to build an accel-config package and then install it?

It is easy to install accel-config from an installation package, but I
didn't find a repo that provides accel-config installation packages for
most distributions.

How about this: first check whether accel-config is available through
pkg-config, and if it is not available, prompt users to install it from
https://github.com/intel/idxd-config. Is that OK?

> > +  endif
> > +  qpl = declare_dependency(dependencies: [libqpl, libaccel,
> > +cc.find_library('dl', required: get_option('qpl'))],
> > +link_args: ['-lstdc++'])
> > +endif
> >  virgl = not_found
> >
> >  have_vhost_user_gpu = have_tools and host_os == 'linux' and
> pixman.found()
> > @@ -2298,6 +2314,7 @@ config_host_data.set('CONFIG_MALLOC_TRIM',
> has_malloc_trim)
> >  config_host_data.set('CONFIG_STATX', has_statx)
> >  config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id)
> >  config_host_data.set('CONFIG_ZSTD', zstd.found())
> > +config_host_data.set('CONFIG_QPL', qpl.found())
> >  config_host_data.set('CONFIG_FUSE', fuse.found())
> >  config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found())
> >  config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found())
> > @@ -4438,6 +4455,7 @@ summary_info += {'snappy support':snappy}
> >  summary_info += {'bzip2 support': libbzip2}
> >  summary_info += {'lzfse support': liblzfse}
> >  summary_info += {'zstd support':  zstd}
> > +summary_info += {'Query Processing Library support': qpl}
> >  summary_info += {'NUMA host support': numa}
> >  summary_info += {'capstone':  capstone}
> >  summary_info += {'libpmem support':   libpmem}
> > diff --git a/meson_options.txt b/meson_options.txt
> >

RE: [PATCH v4 2/8] migration/multifd: add get_iov_count in the multifd method

2024-03-05 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Wednesday, March 6, 2024 4:24 AM
> To: Liu, Yuan1 ; pet...@redhat.com
> Cc: qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v4 2/8] migration/multifd: add get_iov_count in the
> multifd method
> 
> Yuan Liu  writes:
> 
> > the new function get_iov_count is used to get the number of
> > IOVs required by a specified multifd method
> >
> > Different multifd methods may require different numbers of IOVs.
> > Based on streaming compression of zlib and zstd, all pages will be
> > compressed to a data block, so an IOV is required to send this data
> > block. For no compression, each IOV is used to send a page, so the
> > number of IOVs required is the same as the number of pages.
> 
> Let's just move the responsibility of allocating p->iov to the client
> code. You can move the allocation into send_setup() and the free into
> send_cleanup().

Yes, this is a good approach; I will implement it in the next version.

> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  migration/multifd-zlib.c | 18 +-
> >  migration/multifd-zstd.c | 18 +-
> >  migration/multifd.c  | 24 +---
> >  migration/multifd.h  |  2 ++
> >  4 files changed, 57 insertions(+), 5 deletions(-)
> >
> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > index 012e3bdea1..35187f2aff 100644
> > --- a/migration/multifd-zlib.c
> > +++ b/migration/multifd-zlib.c
> > @@ -313,13 +313,29 @@ static int zlib_recv_pages(MultiFDRecvParams *p,
> Error **errp)
> >  return 0;
> >  }
> >
> > +/**
> > + * zlib_get_iov_count: get the count of IOVs
> > + *
> > + * For zlib streaming compression, all pages will be compressed into a
> data
> > + * block, and an IOV is requested for sending this block.
> > + *
> > + * Returns the count of the IOVs
> > + *
> > + * @page_count: Indicate the maximum count of pages processed by
> multifd
> > + */
> > +static uint32_t zlib_get_iov_count(uint32_t page_count)
> > +{
> > +return 1;
> > +}
> > +
> >  static MultiFDMethods multifd_zlib_ops = {
> >  .send_setup = zlib_send_setup,
> >  .send_cleanup = zlib_send_cleanup,
> >  .send_prepare = zlib_send_prepare,
> >  .recv_setup = zlib_recv_setup,
> >  .recv_cleanup = zlib_recv_cleanup,
> > -.recv_pages = zlib_recv_pages
> > +.recv_pages = zlib_recv_pages,
> > +.get_iov_count = zlib_get_iov_count
> >  };
> >
> >  static void multifd_zlib_register(void)
> > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> > index dc8fe43e94..25ed1add2a 100644
> > --- a/migration/multifd-zstd.c
> > +++ b/migration/multifd-zstd.c
> > @@ -304,13 +304,29 @@ static int zstd_recv_pages(MultiFDRecvParams *p,
> Error **errp)
> >  return 0;
> >  }
> >
> > +/**
> > + * zstd_get_iov_count: get the count of IOVs
> > + *
> > + * For zstd streaming compression, all pages will be compressed into a
> data
> > + * block, and an IOV is requested for sending this block.
> > + *
> > + * Returns the count of the IOVs
> > + *
> > + * @page_count: Indicate the maximum count of pages processed by
> multifd
> > + */
> > +static uint32_t zstd_get_iov_count(uint32_t page_count)
> > +{
> > +return 1;
> > +}
> > +
> >  static MultiFDMethods multifd_zstd_ops = {
> >  .send_setup = zstd_send_setup,
> >  .send_cleanup = zstd_send_cleanup,
> >  .send_prepare = zstd_send_prepare,
> >  .recv_setup = zstd_recv_setup,
> >  .recv_cleanup = zstd_recv_cleanup,
> > -.recv_pages = zstd_recv_pages
> > +.recv_pages = zstd_recv_pages,
> > +.get_iov_count = zstd_get_iov_count
> >  };
> >
> >  static void multifd_zstd_register(void)
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index adfe8c9a0a..787402247e 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -209,13 +209,29 @@ static int nocomp_recv_pages(MultiFDRecvParams *p,
> Error **errp)
> >  return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
> >  }
> >
> > +/**
> > + * nocomp_get_iov_count: get the count of IOVs
> > + *
> > + * For no compression, the count of IOVs required is the same as the
> count of
> > + * pages
> > + 

RE: Regarding to the recent Intel IAA/DSA/QAT support on migration

2024-02-07 Thread Liu, Yuan1
Thank you very much for the reminder and for the rapid updates to the
multifd code. I will incorporate your suggestions into the next version
of the IAA-accelerated live migration solution.

Regarding the QAT and DSA optimizations, my colleagues and I have already
started reviewing and testing them, and they look like a promising
optimization direction. I am more than willing to contribute further
efforts to the long-term maintenance of Intel accelerators in live
migration.

> -Original Message-
> From: Peter Xu 
> Sent: Wednesday, February 7, 2024 4:10 PM
> To: Bryan Zhang ; Hao Xiang
> ; Liu, Yuan1 
> Cc: Fabiano Rosas ; QEMU Devel Mailing List  de...@nongnu.org>
> Subject: Regarding to the recent Intel IAA/DSA/QAT support on migration
> 
> Copy qemu-devel.
> 
> On Wed, Feb 07, 2024 at 04:07:40PM +0800, Peter Xu wrote:
> > Hi,
> >
> > I'm sending this email just to leave a generic comment to the recent
> > migration efforts to enable these new Intel technologies.
> >
> > The relevant patchsets (latest version so far) we're discussing are:
> >
> >   [PATCH v3 0/4] Live Migration Acceleration with IAA Compression
> >
> > https://lore.kernel.org/r/20240103112851.908082-1-yuan1@intel.com
> >
> >   [PATCH v3 00/20] Use Intel DSA accelerator to offload zero page
> checking in multifd live migration.
> >
> > https://lore.kernel.org/r/20240104004452.324068-1-hao.xiang@bytedance.
> > com
> >
> >   [PATCH 0/5] *** Implement using Intel QAT to offload ZLIB
> >
> > https://lore.kernel.org/r/20231231205804.2366509-1-bryan.zhang@bytedan
> > ce.com
> >
> > I want to comment in a generic way since this should apply to all
> > these
> > series:
> >
> >   - A heads-up that multifd code is rapidly changing recently, I
> apologize
> > that you'll need a rebase.  It's just that it's probably much better
> to
> > do this before anything lands there.
> >
> > IIUC the good thing is we found that send_prepare() doesn't need to
> be
> > changed that much, however there's still some change; please refer
> to
> > the new code (I'll prepare a pull tomorrow to include most of the
> > changes, and we should have a major thread race fixed too with
> Fabiano
> > & Avihai's help). I hope this will also provide some kind of
> isolation
> > to e.g. other works that may touch other areas.  E.g., I hope fixed-
> ram
> > won't need to conflict much with any of the above series now.
> >
> >   - When posting the new patchset (if there is a plan..), please make
> sure
> > we have:
> >
> > - Proper unit tests for the new code (probably mostly software
> >   fallbacks to be tested on the new libraries being introduced; just
> to
> >   make sure the new library code paths can get some torture please).
> >
> > - Proper documentation for the new code.  Please feel free to start
> >   creating your own .rst file under docs/devel/migration/, we can
> try
> >   to merge them later.  It should help avoid conflictions.  Please
> also
> >   link the new file into index.rst there.
> >
> >   IMHO the document can contain many things, the important ones
> could
> >   start from: who should enable such feature; what one can get from
> >   having it enabled; what is the HW requirement to enable it; how
> >   should one tune the new parameters, and so on... some links to the
> >   technology behinds it would be nice too to be referenced.
> >
> > - Try to add new code (especially HW/library based) into new file.
> >   I see that QPL & QAT already proposed its own files (multifd-
> pql.c,
> >   multifd-qatzip.c) which is great.
> >
> >   Xiang, please also consider doing so for the DSA based zero page
> >   detection.  It can be called multifd-zero-page.c, for example, and
> >   you can create it when working on the
> >   offload-zero-page-detect-to-multifd patchset already.
> >
> > - Please provide someone who can potentially maintain this code if
> ever
> >   possible.  Pushing these code upstream is great, but maintaining
> will
> >   also take effort.  It might be impractical this keeps growing for
> >   migration maintainers (currently Fabiano and myself), so we may
> like
> >   to have people covering these areas, especially when the new codes
> >   are not directly relevant to migration framework.
> >
> >   I'd suggest for each of the project we can add an entry in
> >   MAINTAINERS below "Migration" section, adding relevant files (and
> >   these files should exist in both the new section and "Migration").
> I
> >   am not sure whether Bytedance would be able to cover this, or we
> >   should try to find someone from Intel?  If you're willing to add
> >   yourself to maintain such codes, please attach the maintainers
> file
> >   change together with the series.  It will be very much
> appreciated.
> >
> > Thanks,
> >
> > --
> > Peter Xu
> 
> --
> Peter Xu



RE: [PATCH v3 0/4] Live Migration Acceleration with IAA Compression

2024-01-30 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Tuesday, January 30, 2024 6:32 PM
> To: Liu, Yuan1 
> Cc: faro...@suse.de; leob...@redhat.com; qemu-devel@nongnu.org; Zou,
> Nanhai 
> Subject: Re: [PATCH v3 0/4] Live Migration Acceleration with IAA
> Compression
> 
> On Tue, Jan 30, 2024 at 03:56:05AM +, Liu, Yuan1 wrote:
> > > -Original Message-
> > > From: Peter Xu 
> > > Sent: Monday, January 29, 2024 6:43 PM
> > > To: Liu, Yuan1 
> > > Cc: faro...@suse.de; leob...@redhat.com; qemu-devel@nongnu.org; Zou,
> > > Nanhai 
> > > Subject: Re: [PATCH v3 0/4] Live Migration Acceleration with IAA
> > > Compression
> > >
> > > On Wed, Jan 03, 2024 at 07:28:47PM +0800, Yuan Liu wrote:
> > > > Hi,
> > >
> > > Hi, Yuan,
> > >
> > > I have a few comments and questions.  Many of them can be pure
> > > questions as I don't know enough on these new technologies.
> > >
> > > >
> > > > I am writing to submit a code change aimed at enhancing live
> > > > migration acceleration by leveraging the compression capability of
> > > > the Intel In-Memory Analytics Accelerator (IAA).
> > > >
> > > > The implementation of the IAA (de)compression code is based on
> > > > Intel Query Processing Library (QPL), an open-source software
> > > > project designed for IAA high-level software programming.
> > > > https://github.com/intel/qpl
> > > >
> > > > In the last version, there was some discussion about whether to
> > > > introduce a new compression algorithm for IAA. Because the
> > > > compression algorithm of IAA hardware is based on deflate, and QPL
> > > > already supports Zlib, so in this version, I implemented IAA as an
> > > > accelerator for the Zlib compression method. However, due to some
> > > > reasons, QPL is currently not compatible with the existing Zlib
> > > > method that Zlib compressed data can be decompressed by QPl and vice
> versa.
> > > >
> > > > I have some concerns about the existing Zlib compression
> > > >   1. Will you consider supporting one channel to support multi-
> stream
> > > >  compression? Of course, this may lead to a reduction in
> compression
> > > >  ratio, but it will allow the hardware to process each stream
> > > >  concurrently. We can have each stream process multiple pages,
> > > >  reducing the loss of compression ratio. For example, 128 pages
> are
> > > >  divided into 16 streams for independent compression. I will
> provide
> > > >  the a early performance data in the next version(v4).
> > >
> > > I think Juan used to ask similar question: how much this can help if
> > > multifd can already achieve some form of concurrency over the pages?
> >
> >
> > > Couldn't the user specify more multifd channels if they want to
> > > grant more cpu resource for comp/decomp purpose?
> > >
> > > IOW, how many concurrent channels QPL can provide?  What is the
> > > suggested concurrency channels there?
> >
> > From the QPL software, there is no limit on the number of concurrent
> compression and decompression tasks.
> > From the IAA hardware, one IAA physical device can process two
> compressions concurrently or eight decompression tasks concurrently. There
> are up to 8 IAA devices on an Intel SPR Server and it will vary according
> to the customer’s product selection and deployment.
> >
> > Regarding the requirement for the number of concurrent channels, I think
> this may not be a bottleneck problem.
> > Please allow me to introduce a little more here
> >
> > 1. If the compression design is based on Zlib/Deflate/Gzip streaming
> mode, then we indeed need more channels to maintain concurrent processing.
> Because each time a multifd packet is compressed (including 128
> independent pages), it needs to be compressed page by page. These 128
> pages are not concurrent. The concurrency is reflected in the logic of
> multiple channels for the multifd packet.
> 
> Right.  However since you said there're only a max of 8 IAA devices, would
> it also mean n_multifd_threads=8 can be a good enough scenario to achieve
> proper concurrency, no matter the size of data chunk for one compression
> request?
> 
> Maybe you meant each device can still process concurrent compression
> requests, so the real capability of concurrency can be much larger than 8?

Yes, the number 

RE: [PATCH v3 0/4] Live Migration Acceleration with IAA Compression

2024-01-29 Thread Liu, Yuan1
> -Original Message-
> From: Peter Xu 
> Sent: Monday, January 29, 2024 6:43 PM
> To: Liu, Yuan1 
> Cc: faro...@suse.de; leob...@redhat.com; qemu-devel@nongnu.org; Zou,
> Nanhai 
> Subject: Re: [PATCH v3 0/4] Live Migration Acceleration with IAA
> Compression
> 
> On Wed, Jan 03, 2024 at 07:28:47PM +0800, Yuan Liu wrote:
> > Hi,
> 
> Hi, Yuan,
> 
> I have a few comments and questions.  Many of them can be pure questions
> as I don't know enough on these new technologies.
> 
> >
> > I am writing to submit a code change aimed at enhancing live migration
> > acceleration by leveraging the compression capability of the Intel
> > In-Memory Analytics Accelerator (IAA).
> >
> > The implementation of the IAA (de)compression code is based on Intel
> > Query Processing Library (QPL), an open-source software project
> > designed for IAA high-level software programming.
> > https://github.com/intel/qpl
> >
> > In the last version, there was some discussion about whether to
> > introduce a new compression algorithm for IAA. Because the compression
> > algorithm of IAA hardware is based on deflate, and QPL already
> > supports Zlib, so in this version, I implemented IAA as an accelerator
> > for the Zlib compression method. However, due to some reasons, QPL is
> > currently not compatible with the existing Zlib method that Zlib
> > compressed data can be decompressed by QPl and vice versa.
> >
> > I have some concerns about the existing Zlib compression
> >   1. Will you consider supporting one channel to support multi-stream
> >  compression? Of course, this may lead to a reduction in compression
> >  ratio, but it will allow the hardware to process each stream
> >  concurrently. We can have each stream process multiple pages,
> >  reducing the loss of compression ratio. For example, 128 pages are
> >  divided into 16 streams for independent compression. I will provide
> >  the a early performance data in the next version(v4).
> 
> I think Juan used to ask similar question: how much this can help if
> multifd can already achieve some form of concurrency over the pages?


> Couldn't the user specify more multifd channels if they want to grant more
> cpu resource for comp/decomp purpose?
> 
> IOW, how many concurrent channels QPL can provide?  What is the suggested
> concurrency channels there?

On the QPL software side, there is no limit on the number of concurrent
compression and decompression tasks.
On the IAA hardware side, one IAA physical device can process two
compression tasks or eight decompression tasks concurrently. There are up
to 8 IAA devices on an Intel SPR server (so, at most, 16 compressions or
64 decompressions in flight at once), and the actual count varies with the
customer's product selection and deployment.

Regarding the required number of concurrent channels, I don't think this
will be a bottleneck.
Please allow me to explain a little more here.

1. If the compression design is based on the Zlib/Deflate/Gzip streaming
mode, then we indeed need more channels to maintain concurrency, because
each time a multifd packet (containing 128 independent pages) is
compressed, it must be compressed page by page. Those 128 pages are not
processed concurrently; the concurrency comes only from running multiple
channels, each handling its own multifd packet.

2. Through testing, we prefer concurrency at the granularity of 4K pages
rather than whole multifd packets, which means the 128 pages belonging to
one packet can be compressed/decompressed concurrently. Even a single
channel can then utilize all of the IAA resources. But this is not
compatible with the existing zlib method.
The code is similar to the following sketch:
  for (int i = 0; i < num_pages; i++) {
      job[i]->input_data = pages[i];
      /* non-blocking submission of the compression/decompression task */
      submit_job(job[i]);
  }
  for (int i = 0; i < num_pages; i++) {
      /* busy polling; in the future we will turn this part plus the
       * data sending into a pipeline */
      wait_job(job[i]);
  }

3. Currently, the patches we provide to the community are based on
streaming compression, in order to stay compatible with the current zlib
method. However, we found that there are still many problems with this, so
in the next version we plan to switch to the independent QPL/IAA
acceleration approach described above.
The compatibility issues include the following:
  1. QPL currently does not support the z_sync_flush operation.
  2. The IAA compression/decompression window is fixed at 4K, while the
  default zlib window size is 32K, and the window size must be the same on
  both the compression and decompression sides (see the sketch below).
  3. I also researched the QAT compression scheme: QATzip currently
  supports neither zlib nor z_sync_flush, and its window size is 32K.
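
To make the window-size point concrete, here is a minimal zlib sketch (my
own illustration, not patch code): the history window is fixed at init
time via windowBits, and IAA's 4K window corresponds to windowBits = 12,
whereas stock zlib defaults to 15 (a 32K window):

    #include <string.h>
    #include <zlib.h>

    static int init_deflate_4k_window(z_stream *strm)
    {
        memset(strm, 0, sizeof(*strm));
        /* windowBits = 12 -> 2^12 = 4K history window, matching IAA;
         * the inflate side must be initialized with the same value. */
        return deflateInit2(strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                            12, 8, Z_DEFAULT_STRATEGY);
    }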

In general, I think it is a good suggestion to make the accele

RE: [PATCH 0/5] migration/multifd: Prerequisite cleanups for ongoing work

2024-01-28 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Saturday, January 27, 2024 6:20 AM
> To: qemu-devel@nongnu.org
> Cc: Peter Xu ; Hao Xiang ;
> Liu, Yuan1 ; Bryan Zhang 
> Subject: [PATCH 0/5] migration/multifd: Prerequisite cleanups for ongoing
> work
> 
> Hi,
> 
> Here are two cleanups that are prerequiste for the fixed-ram work, but
> also affect the other series on the list at the moment, so I want to make
> sure it works for everyone:
> 
> 1) Separate multifd_ops from compression. The multifd_ops are
>currently coupled with the multifd_compression parameter.
> 
> We're adding new multifd_ops in the fixed-ram work and adding new
> compression ops in the compression work.
> 2) Add a new send hook. The multifd_send_thread code currently does
>some twists to support zero copy, which is a socket-only feature.
> 
> This might affect the zero page and DSA work which add code to
> multifd_send_thread.

Thank you for the reminder. I reviewed the patch set and have a question:

Because this change has an impact on the previous "live migration with
IAA" patch set, does the next version need to be submitted on top of this
change?

> 
> CI run: https://gitlab.com/farosas/qemu/-/pipelines/1154332360
> 
> (I also tested zero copy locally. We cannot add a test for it because it
> needs root due to memory locking limits)
> 
> Fabiano Rosas (5):
>   migration/multifd: Separate compression ops from non-compression
>   migration/multifd: Move multifd_socket_ops to socket.c
>   migration/multifd: Add multifd_ops->send
>   migration/multifd: Simplify zero copy send
>   migration/multifd: Move zero copy flag into multifd_socket_setup
> 
>  migration/multifd-zlib.c |   9 ++-
>  migration/multifd-zstd.c |   9 ++-
>  migration/multifd.c  | 164 +--
>  migration/multifd.h  |   6 +-
>  migration/socket.c   |  90 -
>  5 files changed, 128 insertions(+), 150 deletions(-)
> 
> --
> 2.35.3



RE: [External] Re: [PATCH 3/5] migration: Introduce unimplemented 'qatzip' compression method

2024-01-13 Thread Liu, Yuan1


> -Original Message-
> From: Hao Xiang 
> Sent: Thursday, January 11, 2024 1:42 PM
> To: Liu, Yuan1 
> Cc: Fabiano Rosas ; Bryan Zhang
> ; qemu-devel@nongnu.org;
> marcandre.lur...@redhat.com; pet...@redhat.com; quint...@redhat.com;
> peter.mayd...@linaro.org; berra...@redhat.com; Zou, Nanhai
> 
> Subject: Re: [External] Re: [PATCH 3/5] migration: Introduce unimplemented
> 'qatzip' compression method
> 
> On Mon, Jan 8, 2024 at 6:26 PM Liu, Yuan1  wrote:
> >
> > > -Original Message-
> > > From: Fabiano Rosas 
> > > Sent: Tuesday, January 9, 2024 4:28 AM
> > > To: Liu, Yuan1 ; Hao Xiang
> > > 
> > > Cc: Bryan Zhang ; qemu-devel@nongnu.org;
> > > marcandre.lur...@redhat.com; pet...@redhat.com; quint...@redhat.com;
> > > peter.mayd...@linaro.org; berra...@redhat.com
> > > Subject: RE: [External] Re: [PATCH 3/5] migration: Introduce
> > > unimplemented 'qatzip' compression method
> > >
> > > "Liu, Yuan1"  writes:
> > >
> > > >> -Original Message-
> > > >> From: Hao Xiang 
> > > >> Sent: Saturday, January 6, 2024 7:53 AM
> > > >> To: Fabiano Rosas 
> > > >> Cc: Bryan Zhang ;
> > > >> qemu-devel@nongnu.org; marcandre.lur...@redhat.com;
> > > >> pet...@redhat.com; quint...@redhat.com; peter.mayd...@linaro.org;
> > > >> Liu, Yuan1 ; berra...@redhat.com
> > > >> Subject: Re: [External] Re: [PATCH 3/5] migration: Introduce
> > > >> unimplemented 'qatzip' compression method
> > > >>
> > > >> On Fri, Jan 5, 2024 at 12:07 PM Fabiano Rosas 
> wrote:
> > > >> >
> > > >> > Bryan Zhang  writes:
> > > >> >
> > > >> > +cc Yuan Liu, Daniel Berrangé
> > > >> >
> > > >> > > Adds support for 'qatzip' as an option for the multifd
> > > >> > > compression method parameter, but copy-pastes the no-op logic
> > > >> > > to leave the actual methods effectively unimplemented. This
> > > >> > > is in preparation of a subsequent commit that will implement
> > > >> > > actually using QAT for compression and decompression.
> > > >> > >
> > > >> > > Signed-off-by: Bryan Zhang 
> > > >> > > Signed-off-by: Hao Xiang 
> > > >> > > ---
> > > >> > >  hw/core/qdev-properties-system.c |  6 ++-
> > > >> > >  migration/meson.build|  1 +
> > > >> > >  migration/multifd-qatzip.c   | 81
> > > >> 
> > > >> > >  migration/multifd.h  |  1 +
> > > >> > >  qapi/migration.json  |  5 +-
> > > >> > >  5 files changed, 92 insertions(+), 2 deletions(-)  create
> > > >> > > mode
> > > >> > > 100644 migration/multifd-qatzip.c
> > > >> > >
> > > >> > > diff --git a/hw/core/qdev-properties-system.c
> > > >> > > b/hw/core/qdev-properties-system.c
> > > >> > > index 1a396521d5..d8e48dcb0e 100644
> > > >> > > --- a/hw/core/qdev-properties-system.c
> > > >> > > +++ b/hw/core/qdev-properties-system.c
> > > >> > > @@ -658,7 +658,11 @@ const PropertyInfo
> > > >> > > qdev_prop_fdc_drive_type = { const PropertyInfo
> qdev_prop_multifd_compression = {
> > > >> > >  .name = "MultiFDCompression",
> > > >> > >  .description = "multifd_compression values, "
> > > >> > > -   "none/zlib/zstd",
> > > >> > > +   "none/zlib/zstd"
> > > >> > > +#ifdef CONFIG_QATZIP
> > > >> > > +   "/qatzip"
> > > >> > > +#endif
> > > >> > > +   ,
> > > >> > >  .enum_table = &MultiFDCompression_lookup,
> > > >> > >  .get = qdev_propinfo_get_enum,
> > > >> > >  .set = qdev_propinfo_set_enum, diff --git
> > > >> > > a/migration/meson.build b/migration/meson.build index
> > > >> > > 92b1cc4297..e20f318379 100644
> > > >> > > --- a/migration/meson.build
> > &

RE: [External] Re: [PATCH 3/5] migration: Introduce unimplemented 'qatzip' compression method

2024-01-13 Thread Liu, Yuan1
> -Original Message-
> From: Hao Xiang 
> Sent: Thursday, January 11, 2024 2:40 PM
> To: Fabiano Rosas 
> Cc: Liu, Yuan1 ; Bryan Zhang
> ; qemu-devel@nongnu.org;
> marcandre.lur...@redhat.com; pet...@redhat.com; quint...@redhat.com;
> peter.mayd...@linaro.org; berra...@redhat.com
> Subject: Re: [External] Re: [PATCH 3/5] migration: Introduce unimplemented
> 'qatzip' compression method
> 
> On Mon, Jan 8, 2024 at 12:28 PM Fabiano Rosas  wrote:
> >
> > "Liu, Yuan1"  writes:
> >
> > >> -Original Message-
> > >> From: Hao Xiang 
> > >> Sent: Saturday, January 6, 2024 7:53 AM
> > >> To: Fabiano Rosas 
> > >> Cc: Bryan Zhang ; qemu-devel@nongnu.org;
> > >> marcandre.lur...@redhat.com; pet...@redhat.com;
> > >> quint...@redhat.com; peter.mayd...@linaro.org; Liu, Yuan1
> > >> ; berra...@redhat.com
> > >> Subject: Re: [External] Re: [PATCH 3/5] migration: Introduce
> > >> unimplemented 'qatzip' compression method
> > >>
> > >> On Fri, Jan 5, 2024 at 12:07 PM Fabiano Rosas 
> wrote:
> > >> >
> > >> > Bryan Zhang  writes:
> > >> >
> > >> > +cc Yuan Liu, Daniel Berrangé
> > >> >
> > >> > > Adds support for 'qatzip' as an option for the multifd
> > >> > > compression method parameter, but copy-pastes the no-op logic
> > >> > > to leave the actual methods effectively unimplemented. This is
> > >> > > in preparation of a subsequent commit that will implement
> > >> > > actually using QAT for compression and decompression.
> > >> > >
> > >> > > Signed-off-by: Bryan Zhang 
> > >> > > Signed-off-by: Hao Xiang 
> > >> > > ---
> > >> > >  hw/core/qdev-properties-system.c |  6 ++-
> > >> > >  migration/meson.build|  1 +
> > >> > >  migration/multifd-qatzip.c   | 81
> > >> 
> > >> > >  migration/multifd.h  |  1 +
> > >> > >  qapi/migration.json  |  5 +-
> > >> > >  5 files changed, 92 insertions(+), 2 deletions(-)  create mode
> > >> > > 100644 migration/multifd-qatzip.c
> > >> > >
> > >> > > diff --git a/hw/core/qdev-properties-system.c
> > >> > > b/hw/core/qdev-properties-system.c
> > >> > > index 1a396521d5..d8e48dcb0e 100644
> > >> > > --- a/hw/core/qdev-properties-system.c
> > >> > > +++ b/hw/core/qdev-properties-system.c
> > >> > > @@ -658,7 +658,11 @@ const PropertyInfo
> > >> > > qdev_prop_fdc_drive_type = { const PropertyInfo
> qdev_prop_multifd_compression = {
> > >> > >  .name = "MultiFDCompression",
> > >> > >  .description = "multifd_compression values, "
> > >> > > -   "none/zlib/zstd",
> > >> > > +   "none/zlib/zstd"
> > >> > > +#ifdef CONFIG_QATZIP
> > >> > > +   "/qatzip"
> > >> > > +#endif
> > >> > > +   ,
> > >> > >  .enum_table = &MultiFDCompression_lookup,
> > >> > >  .get = qdev_propinfo_get_enum,
> > >> > >  .set = qdev_propinfo_set_enum, diff --git
> > >> > > a/migration/meson.build b/migration/meson.build index
> > >> > > 92b1cc4297..e20f318379 100644
> > >> > > --- a/migration/meson.build
> > >> > > +++ b/migration/meson.build
> > >> > > @@ -40,6 +40,7 @@ if get_option('live_block_migration').allowed()
> > >> > >system_ss.add(files('block.c'))  endif
> > >> > >  system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
> > >> > > +system_ss.add(when: qatzip, if_true:
> > >> > > +files('multifd-qatzip.c'))
> > >> > >
> > >> > >  specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
> > >> > >  if_true: files('ram.c', diff --git
> > >> > > a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new
> file
> > >> > > mode 100644 index 00..1733bbddb7
> > >> > > --- /dev/null
> > >

RE: [External] Re: [PATCH 3/5] migration: Introduce unimplemented 'qatzip' compression method

2024-01-08 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Tuesday, January 9, 2024 4:28 AM
> To: Liu, Yuan1 ; Hao Xiang 
> Cc: Bryan Zhang ; qemu-devel@nongnu.org;
> marcandre.lur...@redhat.com; pet...@redhat.com; quint...@redhat.com;
> peter.mayd...@linaro.org; berra...@redhat.com
> Subject: RE: [External] Re: [PATCH 3/5] migration: Introduce unimplemented
> 'qatzip' compression method
> 
> "Liu, Yuan1"  writes:
> 
> >> -Original Message-
> >> From: Hao Xiang 
> >> Sent: Saturday, January 6, 2024 7:53 AM
> >> To: Fabiano Rosas 
> >> Cc: Bryan Zhang ; qemu-devel@nongnu.org;
> >> marcandre.lur...@redhat.com; pet...@redhat.com; quint...@redhat.com;
> >> peter.mayd...@linaro.org; Liu, Yuan1 ;
> >> berra...@redhat.com
> >> Subject: Re: [External] Re: [PATCH 3/5] migration: Introduce
> >> unimplemented 'qatzip' compression method
> >>
> >> On Fri, Jan 5, 2024 at 12:07 PM Fabiano Rosas  wrote:
> >> >
> >> > Bryan Zhang  writes:
> >> >
> >> > +cc Yuan Liu, Daniel Berrangé
> >> >
> >> > > Adds support for 'qatzip' as an option for the multifd
> >> > > compression method parameter, but copy-pastes the no-op logic to
> >> > > leave the actual methods effectively unimplemented. This is in
> >> > > preparation of a subsequent commit that will implement actually
> >> > > using QAT for compression and decompression.
> >> > >
> >> > > Signed-off-by: Bryan Zhang 
> >> > > Signed-off-by: Hao Xiang 
> >> > > ---
> >> > >  hw/core/qdev-properties-system.c |  6 ++-
> >> > >  migration/meson.build|  1 +
> >> > >  migration/multifd-qatzip.c   | 81
> >> 
> >> > >  migration/multifd.h  |  1 +
> >> > >  qapi/migration.json  |  5 +-
> >> > >  5 files changed, 92 insertions(+), 2 deletions(-)  create mode
> >> > > 100644 migration/multifd-qatzip.c
> >> > >
> >> > > diff --git a/hw/core/qdev-properties-system.c
> >> > > b/hw/core/qdev-properties-system.c
> >> > > index 1a396521d5..d8e48dcb0e 100644
> >> > > --- a/hw/core/qdev-properties-system.c
> >> > > +++ b/hw/core/qdev-properties-system.c
> >> > > @@ -658,7 +658,11 @@ const PropertyInfo qdev_prop_fdc_drive_type
> >> > > = { const PropertyInfo qdev_prop_multifd_compression = {
> >> > >  .name = "MultiFDCompression",
> >> > >  .description = "multifd_compression values, "
> >> > > -   "none/zlib/zstd",
> >> > > +   "none/zlib/zstd"
> >> > > +#ifdef CONFIG_QATZIP
> >> > > +   "/qatzip"
> >> > > +#endif
> >> > > +   ,
> >> > >  .enum_table = &MultiFDCompression_lookup,
> >> > >  .get = qdev_propinfo_get_enum,
> >> > >  .set = qdev_propinfo_set_enum, diff --git
> >> > > a/migration/meson.build b/migration/meson.build index
> >> > > 92b1cc4297..e20f318379 100644
> >> > > --- a/migration/meson.build
> >> > > +++ b/migration/meson.build
> >> > > @@ -40,6 +40,7 @@ if get_option('live_block_migration').allowed()
> >> > >system_ss.add(files('block.c'))  endif
> >> > >  system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
> >> > > +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c'))
> >> > >
> >> > >  specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
> >> > >  if_true: files('ram.c', diff --git
> >> > > a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file
> >> > > mode 100644 index 00..1733bbddb7
> >> > > --- /dev/null
> >> > > +++ b/migration/multifd-qatzip.c
> >> > > @@ -0,0 +1,81 @@
> >> > > +/*
> >> > > + * Multifd QATzip compression implementation
> >> > > + *
> >> > > + * Copyright (c) Bytedance
> >> > > + *
> >> > > + * Authors:
> >> > > + *  Bryan Zhang 
> >> > > + *  Hao Xiang   
> >> > > + *
> >> > > + * T

RE: [External] Re: [PATCH 3/5] migration: Introduce unimplemented 'qatzip' compression method

2024-01-07 Thread Liu, Yuan1
> -Original Message-
> From: Hao Xiang 
> Sent: Saturday, January 6, 2024 7:53 AM
> To: Fabiano Rosas 
> Cc: Bryan Zhang ; qemu-devel@nongnu.org;
> marcandre.lur...@redhat.com; pet...@redhat.com; quint...@redhat.com;
> peter.mayd...@linaro.org; Liu, Yuan1 ;
> berra...@redhat.com
> Subject: Re: [External] Re: [PATCH 3/5] migration: Introduce unimplemented
> 'qatzip' compression method
> 
> On Fri, Jan 5, 2024 at 12:07 PM Fabiano Rosas  wrote:
> >
> > Bryan Zhang  writes:
> >
> > +cc Yuan Liu, Daniel Berrangé
> >
> > > Adds support for 'qatzip' as an option for the multifd compression
> > > method parameter, but copy-pastes the no-op logic to leave the
> > > actual methods effectively unimplemented. This is in preparation of
> > > a subsequent commit that will implement actually using QAT for
> > > compression and decompression.
> > >
> > > Signed-off-by: Bryan Zhang 
> > > Signed-off-by: Hao Xiang 
> > > ---
> > >  hw/core/qdev-properties-system.c |  6 ++-
> > >  migration/meson.build|  1 +
> > >  migration/multifd-qatzip.c   | 81
> 
> > >  migration/multifd.h  |  1 +
> > >  qapi/migration.json  |  5 +-
> > >  5 files changed, 92 insertions(+), 2 deletions(-)  create mode
> > > 100644 migration/multifd-qatzip.c
> > >
> > > diff --git a/hw/core/qdev-properties-system.c
> > > b/hw/core/qdev-properties-system.c
> > > index 1a396521d5..d8e48dcb0e 100644
> > > --- a/hw/core/qdev-properties-system.c
> > > +++ b/hw/core/qdev-properties-system.c
> > > @@ -658,7 +658,11 @@ const PropertyInfo qdev_prop_fdc_drive_type = {
> > > const PropertyInfo qdev_prop_multifd_compression = {
> > >  .name = "MultiFDCompression",
> > >  .description = "multifd_compression values, "
> > > -   "none/zlib/zstd",
> > > +   "none/zlib/zstd"
> > > +#ifdef CONFIG_QATZIP
> > > +   "/qatzip"
> > > +#endif
> > > +   ,
> > >  .enum_table = &MultiFDCompression_lookup,
> > >  .get = qdev_propinfo_get_enum,
> > >  .set = qdev_propinfo_set_enum,
> > > diff --git a/migration/meson.build b/migration/meson.build index
> > > 92b1cc4297..e20f318379 100644
> > > --- a/migration/meson.build
> > > +++ b/migration/meson.build
> > > @@ -40,6 +40,7 @@ if get_option('live_block_migration').allowed()
> > >system_ss.add(files('block.c'))
> > >  endif
> > >  system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
> > > +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c'))
> > >
> > >  specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
> > >  if_true: files('ram.c', diff --git
> > > a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file
> > > mode 100644 index 00..1733bbddb7
> > > --- /dev/null
> > > +++ b/migration/multifd-qatzip.c
> > > @@ -0,0 +1,81 @@
> > > +/*
> > > + * Multifd QATzip compression implementation
> > > + *
> > > + * Copyright (c) Bytedance
> > > + *
> > > + * Authors:
> > > + *  Bryan Zhang 
> > > + *  Hao Xiang   
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> > > + * See the COPYING file in the top-level directory.
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +#include "exec/ramblock.h"
> > > +#include "exec/target_page.h"
> > > +#include "qapi/error.h"
> > > +#include "migration.h"
> > > +#include "options.h"
> > > +#include "multifd.h"
> > > +
> > > +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp) {
> > > +return 0;
> > > +}
> > > +
> > > +static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp)
> > > +{};
> > > +
> > > +static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp)
> > > +{
> > > +MultiFDPages_t *pages = p->pages;
> > > +
> > > +for (int i = 0; i < p->normal_num; i++) {
> > > +p->iov[p->iovs_num].iov_base = pages->block->host + p-
> >n

RE: [PATCH 3/5] migration: Introduce unimplemented 'qatzip' compression method

2024-01-07 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Saturday, January 6, 2024 4:07 AM
> To: Bryan Zhang ; qemu-devel@nongnu.org;
> marcandre.lur...@redhat.com; pet...@redhat.com; quint...@redhat.com;
> peter.mayd...@linaro.org; hao.xi...@bytedance.com
> Cc: bryan.zh...@bytedance.com; Liu, Yuan1 ;
> berra...@redhat.com
> Subject: Re: [PATCH 3/5] migration: Introduce unimplemented 'qatzip'
> compression method
> 
> Bryan Zhang  writes:
> 
> +cc Yuan Liu, Daniel Berrangé
> 
> > Adds support for 'qatzip' as an option for the multifd compression
> > method parameter, but copy-pastes the no-op logic to leave the actual
> > methods effectively unimplemented. This is in preparation of a
> > subsequent commit that will implement actually using QAT for
> > compression and decompression.
> >
> > Signed-off-by: Bryan Zhang 
> > Signed-off-by: Hao Xiang 
> > ---
> >  hw/core/qdev-properties-system.c |  6 ++-
> >  migration/meson.build|  1 +
> >  migration/multifd-qatzip.c   | 81 
> >  migration/multifd.h  |  1 +
> >  qapi/migration.json  |  5 +-
> >  5 files changed, 92 insertions(+), 2 deletions(-)  create mode 100644
> > migration/multifd-qatzip.c
> >
> > diff --git a/hw/core/qdev-properties-system.c
> > b/hw/core/qdev-properties-system.c
> > index 1a396521d5..d8e48dcb0e 100644
> > --- a/hw/core/qdev-properties-system.c
> > +++ b/hw/core/qdev-properties-system.c
> > @@ -658,7 +658,11 @@ const PropertyInfo qdev_prop_fdc_drive_type = {
> > const PropertyInfo qdev_prop_multifd_compression = {
> >  .name = "MultiFDCompression",
> >  .description = "multifd_compression values, "
> > -   "none/zlib/zstd",
> > +   "none/zlib/zstd"
> > +#ifdef CONFIG_QATZIP
> > +   "/qatzip"
> > +#endif
> > +   ,
> >  .enum_table = &MultiFDCompression_lookup,
> >  .get = qdev_propinfo_get_enum,
> >  .set = qdev_propinfo_set_enum,
> > diff --git a/migration/meson.build b/migration/meson.build index
> > 92b1cc4297..e20f318379 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -40,6 +40,7 @@ if get_option('live_block_migration').allowed()
> >system_ss.add(files('block.c'))
> >  endif
> >  system_ss.add(when: zstd, if_true: files('multifd-zstd.c'))
> > +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c'))
> >
> >  specific_ss.add(when: 'CONFIG_SYSTEM_ONLY',
> >  if_true: files('ram.c', diff --git
> > a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file
> > mode 100644 index 00..1733bbddb7
> > --- /dev/null
> > +++ b/migration/multifd-qatzip.c
> > @@ -0,0 +1,81 @@
> > +/*
> > + * Multifd QATzip compression implementation
> > + *
> > + * Copyright (c) Bytedance
> > + *
> > + * Authors:
> > + *  Bryan Zhang 
> > + *  Hao Xiang   
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "exec/ramblock.h"
> > +#include "exec/target_page.h"
> > +#include "qapi/error.h"
> > +#include "migration.h"
> > +#include "options.h"
> > +#include "multifd.h"
> > +
> > > +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp)
> > > +{
> > +return 0;
> > +}
> > +
> > +static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp)
> > +{};
> > +
> > > +static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp)
> > > +{
> > +MultiFDPages_t *pages = p->pages;
> > +
> > +for (int i = 0; i < p->normal_num; i++) {
> > > +p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i];
> > +p->iov[p->iovs_num].iov_len = p->page_size;
> > +p->iovs_num++;
> > +}
> > +
> > +p->next_packet_size = p->normal_num * p->page_size;
> > +p->flags |= MULTIFD_FLAG_NOCOMP;
> > +return 0;
> > +}
> > +
> > > +static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp)
> > > +{
> > +return 0;
> > +}
> > +
> > > +static void qatzip_recv_cleanup(MultiFDRecvParams *p, Error **errp)

RE: [PATCH v2 3/4] configure: add qpl option

2023-12-04 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Monday, December 4, 2023 8:30 PM
> To: Liu, Yuan1 ; quint...@redhat.com;
> pet...@redhat.com; leob...@redhat.com
> Cc: qemu-devel@nongnu.org; Zou, Nanhai 
> Subject: RE: [PATCH v2 3/4] configure: add qpl option
> 
> "Liu, Yuan1"  writes:
> 
> >> -Original Message-
> >> From: Fabiano Rosas 
> >> Sent: Saturday, December 2, 2023 1:45 AM
> >> To: Liu, Yuan1 ; quint...@redhat.com;
> >> pet...@redhat.com; leob...@redhat.com
> >> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou,
> >> Nanhai 
> >> Subject: Re: [PATCH v2 3/4] configure: add qpl option
> >>
> >> Yuan Liu  writes:
> >>
> >> > the Query Processing Library (QPL) is an open-source library that
> >> > supports data compression and decompression features.
> >> >
> >> > add --enable-qpl and --disable-qpl options to enable and disable
> >> > the QPL compression accelerator. The QPL compression accelerator
> >> > can accelerate the Zlib compression algorithm during the live
> migration.
> >> >
> >> > Signed-off-by: Yuan Liu 
> >> > Reviewed-by: Nanhai Zou 
> >> > ---
> >> >  meson.build   | 7 +++
> >> >  meson_options.txt | 2 ++
> >> >  scripts/meson-buildoptions.sh | 3 +++
> >> >  3 files changed, 12 insertions(+)
> >> >
> > > diff --git a/meson.build b/meson.build
> > > index 259dc5f308..b4ba30b4fa 100644
> >> > --- a/meson.build
> >> > +++ b/meson.build
> >> > @@ -1032,6 +1032,11 @@ if not get_option('zstd').auto() or have_block
> >> >  required: get_option('zstd'),
> > >  method: 'pkg-config')
> > >  endif
> >> > +qpl = not_found
> >> > +if not get_option('qpl').auto()
> >> > +qpl = dependency('libqpl', required: get_option('qpl'),
> > > + method: 'pkg-config')
> > > +endif
> >>
> >> Hm.. I'm not having success with pkg-config:
> >>
> >> ../meson.build:1043:10: ERROR: Dependency "libqpl" not found, tried
> >> pkgconfig
> >>
> >> It seems it doesn't find the static library. I had to use this instead:
> >>
> >> qpl = declare_dependency(dependencies: cc.find_library('qpl',
> >>  required: get_option('qpl')))
> >>
> >> What am I missing here?
> > Sorry about this, the QPL repo(https://github.com/intel/qpl) does not
> yet support libqpl pkg-config file, we are in the process of adding this
> functionality and we hope to resolve libqpl's dependencies through pkg-
> config file.
> > I will explicitly address this issue and provide relevant documentation
> in the next version.
> 
> Ok, just remember to test with a clean setup next time.
Sure

> > For the pkg-config test,
> > 1. create /usr/lib64/pkgconfig/libqpl.pc 2. add below lines
> > prefix=/usr/local exec_prefix=${prefix} libdir=${exec_prefix}/lib
> > includedir=${prefix}/include
> >
> > Name: libqpl
> > Description: Intel Query Processing Library
> > Version: 1.3.0
> > Libs: -L${libdir} -lqpl -lpthread -laccel-config -ldl -lstdc++
> 
> We could probably do this with meson directly instead of requiring a pkg-
> config preliminary setup. My meson-fu is not the best, but something
> like:
> 
>   qpl = declare_dependency(dependencies: [
>cc.find_library('qpl', required: get_option('qpl')),
>cc.find_library('accel-config', required: get_option('qpl')),
>...
>], link_args: ['-lstdc++', ...])

I will fix this, thank you for the sample code

> > Cflags: -I${includedir}
> >
> > 3. Install the header files to /usr/local/include/qpl and static
> > library to /usr/local/lib64/libqpl.a
> 
> For this part is ok to just point to the official docs.
Yes, good idea




RE: [PATCH v2 1/4] migration: Introduce multifd-compression-accel parameter

2023-12-04 Thread Liu, Yuan1
> -Original Message-
> From: Markus Armbruster 
> Sent: Friday, December 1, 2023 5:17 PM
> To: Liu, Yuan1 
> Cc: quint...@redhat.com; pet...@redhat.com; faro...@suse.de;
> leob...@redhat.com; qemu-devel@nongnu.org; Zou, Nanhai
> 
> Subject: Re: [PATCH v2 1/4] migration: Introduce multifd-compression-accel
> parameter
> 
> Yuan Liu  writes:
> 
> > Introduce the multifd-compression-accel option to enable or disable
> > live migration data (de)compression accelerator.
> >
> > The default value of multifd-compression-accel is auto, and the
> > enabling and selection of the accelerator are automatically detected.
> > By setting multifd-compression-accel=none, the acceleration function can
> be disabled.
> > Similarly, users can explicitly specify a specific accelerator name,
> > such as multifd-compression-accel=qpl.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  hw/core/qdev-properties-system.c| 11 +++
> >  include/hw/qdev-properties-system.h |  4 
> >  migration/migration-hmp-cmds.c  | 10 ++
> >  migration/options.c | 24 
> >  migration/options.h |  1 +
> >  qapi/migration.json | 26 +-
> >  6 files changed, 75 insertions(+), 1 deletion(-)
> >
> > diff --git a/hw/core/qdev-properties-system.c
> > b/hw/core/qdev-properties-system.c
> > index 688340610e..ed23035845 100644
> > --- a/hw/core/qdev-properties-system.c
> > +++ b/hw/core/qdev-properties-system.c
> > @@ -673,6 +673,17 @@ const PropertyInfo qdev_prop_multifd_compression =
> {
> >  .set_default_value = qdev_propinfo_set_default_value_enum,
> >  };
> >
> > +/* --- MultiFD Compression Accelerator --- */
> > +
> > +const PropertyInfo qdev_prop_multifd_compression_accel = {
> > +.name = "MultiFDCompressionAccel",
> > +.description = "MultiFD Compression Accelerator, "
> > +   "auto/none/qpl",
> > +.enum_table = &MultiFDCompressionAccel_lookup,
> > +.get = qdev_propinfo_get_enum,
> > +.set = qdev_propinfo_set_enum,
> > +.set_default_value = qdev_propinfo_set_default_value_enum,
> > +};
> >  /* --- Reserved Region --- */
> >
> >  /*
> > diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h
> > index 0ac327ae60..da086bd836 100644
> > --- a/include/hw/qdev-properties-system.h
> > +++ b/include/hw/qdev-properties-system.h
> > @@ -7,6 +7,7 @@ extern const PropertyInfo qdev_prop_chr;
> >  extern const PropertyInfo qdev_prop_macaddr;
> >  extern const PropertyInfo qdev_prop_reserved_region;
> >  extern const PropertyInfo qdev_prop_multifd_compression;
> > +extern const PropertyInfo qdev_prop_multifd_compression_accel;
> >  extern const PropertyInfo qdev_prop_losttickpolicy;
> >  extern const PropertyInfo qdev_prop_blockdev_on_error;
> >  extern const PropertyInfo qdev_prop_bios_chs_trans;
> > @@ -41,6 +42,9 @@ extern const PropertyInfo qdev_prop_pcie_link_width;
> >  #define DEFINE_PROP_MULTIFD_COMPRESSION(_n, _s, _f, _d) \
> >  DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_multifd_compression, \
> >     MultiFDCompression)
> > +#define DEFINE_PROP_MULTIFD_COMPRESSION_ACCEL(_n, _s, _f, _d) \
> > +DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_multifd_compression_accel, \
> > +   MultiFDCompressionAccel)
> >  #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \
> >  DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \
> >  LostTickPolicy)
> > diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
> > index a82597f18e..3a278c89d9 100644
> > --- a/migration/migration-hmp-cmds.c
> > +++ b/migration/migration-hmp-cmds.c
> > @@ -344,6 +344,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
> >  monitor_printf(mon, "%s: %s\n",
> >     MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION),
> >     MultiFDCompression_str(params->multifd_compression));
> > +assert(params->has_multifd_compression_accel);
> > +monitor_printf(mon, "%s: %s\n",
> > +MigrationParameter_str(
> > +MIGRATION_PARAMETER_MULTIFD_COMPRESSION_ACCEL),
> > +MultiFDCompressionAccel_str(params->multifd_compression_accel));
> >  monitor_printf(mon, "

RE: [PATCH v2 2/4] multifd: Implement multifd compression accelerator

2023-12-04 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Saturday, December 2, 2023 2:01 AM
> To: Liu, Yuan1 ; quint...@redhat.com;
> pet...@redhat.com; leob...@redhat.com
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v2 2/4] multifd: Implement multifd compression
> accelerator
> 
> Yuan Liu  writes:
> 
> > when starting multifd live migration, if the compression method is
> > enabled, compression method can be accelerated using accelerators.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  migration/multifd.c | 38 --
> >  migration/multifd.h |  8 
> >  2 files changed, 44 insertions(+), 2 deletions(-)
> >
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index 1fe53d3b98..7149e67867 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -165,6 +165,34 @@ static MultiFDMethods multifd_nocomp_ops = {
> > static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {
> >  [MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops,
> >  };
> > +static MultiFDAccelMethods *accel_multifd_ops[MULTIFD_COMPRESSION_ACCEL__MAX];
> > +
> > +static MultiFDMethods *get_multifd_ops(void)
> > +{
> > +MultiFDCompression comp = migrate_multifd_compression();
> > +MultiFDCompressionAccel accel =
> > +migrate_multifd_compression_accel();
> > +
> > +if (comp == MULTIFD_COMPRESSION_NONE ||
> > +accel == MULTIFD_COMPRESSION_ACCEL_NONE) {
> > +return multifd_ops[comp];
> > +}
> > +if (accel == MULTIFD_COMPRESSION_ACCEL_AUTO) {
> > +for (int i = 0; i < MULTIFD_COMPRESSION_ACCEL__MAX; i++) {
> > +if (accel_multifd_ops[i] &&
> > +accel_multifd_ops[i]->is_supported(comp)) {
> > +return accel_multifd_ops[i]->get_multifd_methods();
> > +}
> > +}
> > +return multifd_ops[comp];
> > +}
> > +
> > +/* Check if a specified accelerator is available */
> > +if (accel_multifd_ops[accel] &&
> 
> The CI is complaining that we might reach here with accel=2
> when !CONFIG_QPL. It seems the assert at migrate_multifd_compression_accel
> is not enough.
I will add asserts to check both comp and accel in the get_multifd_ops
function in the next version.
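
Roughly like this (an untested sketch; the exact bounds checks are
illustrative):

    static MultiFDMethods *get_multifd_ops(void)
    {
        MultiFDCompression comp = migrate_multifd_compression();
        MultiFDCompressionAccel accel = migrate_multifd_compression_accel();

        /* option parsing should already guarantee these; make it explicit
         * so a build without CONFIG_QPL fails loudly instead of indexing
         * a NULL slot */
        assert(comp < MULTIFD_COMPRESSION__MAX);
        assert(accel < MULTIFD_COMPRESSION_ACCEL__MAX);

        if (comp == MULTIFD_COMPRESSION_NONE ||
            accel == MULTIFD_COMPRESSION_ACCEL_NONE) {
            return multifd_ops[comp];
        }
        /* ... accelerator selection continues as in this patch ... */
        return multifd_ops[comp];
    }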



RE: [PATCH v2 3/4] configure: add qpl option

2023-12-03 Thread Liu, Yuan1
> -Original Message-
> From: Fabiano Rosas 
> Sent: Saturday, December 2, 2023 1:45 AM
> To: Liu, Yuan1 ; quint...@redhat.com;
> pet...@redhat.com; leob...@redhat.com
> Cc: qemu-devel@nongnu.org; Liu, Yuan1 ; Zou, Nanhai
> 
> Subject: Re: [PATCH v2 3/4] configure: add qpl option
> 
> Yuan Liu  writes:
> 
> > the Query Processing Library (QPL) is an open-source library that
> > supports data compression and decompression features.
> >
> > add --enable-qpl and --disable-qpl options to enable and disable the
> > QPL compression accelerator. The QPL compression accelerator can
> > accelerate the Zlib compression algorithm during the live migration.
> >
> > Signed-off-by: Yuan Liu 
> > Reviewed-by: Nanhai Zou 
> > ---
> >  meson.build   | 7 +++
> >  meson_options.txt | 2 ++
> >  scripts/meson-buildoptions.sh | 3 +++
> >  3 files changed, 12 insertions(+)
> >
> > diff --git a/meson.build b/meson.build
> > index 259dc5f308..b4ba30b4fa 100644
> > --- a/meson.build
> > +++ b/meson.build
> > @@ -1032,6 +1032,11 @@ if not get_option('zstd').auto() or have_block
> >  required: get_option('zstd'),
> >  method: 'pkg-config')
> >  endif
> > +qpl = not_found
> > +if not get_option('qpl').auto()
> > +qpl = dependency('libqpl', required: get_option('qpl'),
> > + method: 'pkg-config')
> > +endif
> 
> Hm.. I'm not having success with pkg-config:
> 
> ../meson.build:1043:10: ERROR: Dependency "libqpl" not found, tried
> pkgconfig
> 
> It seems it doesn't find the static library. I had to use this instead:
> 
> qpl = declare_dependency(dependencies: cc.find_library('qpl',
>  required: get_option('qpl')))
> 
> What am I missing here?
Sorry about this. The QPL repo (https://github.com/intel/qpl) does not yet
ship a libqpl pkg-config file; we are in the process of adding this
functionality, and we hope to resolve libqpl's dependencies through the
pkg-config file. I will explicitly address this issue and provide the
relevant documentation in the next version.

For the pkg-config test, 
1. create /usr/lib64/pkgconfig/libqpl.pc
2. add below lines
prefix=/usr/local
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include

Name: libqpl
Description: Intel Query Processing Library
Version: 1.3.0
Libs: -L${libdir} -lqpl -lpthread -laccel-config -ldl -lstdc++
Cflags: -I${includedir}

3. Install the header files to /usr/local/include/qpl and static library to 
/usr/local/lib64/libqpl.a

> >  virgl = not_found
> >
> >  have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found()
> > @@ -2165,6 +2170,7 @@
> > config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim)
> > config_host_data.set('CONFIG_STATX', has_statx)
> > config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id)
> > config_host_data.set('CONFIG_ZSTD', zstd.found())
> > +config_host_data.set('CONFIG_QPL', qpl.found())
> >  config_host_data.set('CONFIG_FUSE', fuse.found())
> > config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found())
> > config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found())
> > @@ -4325,6 +4331,7 @@ summary_info += {'snappy support':snappy}
> >  summary_info += {'bzip2 support': libbzip2}
> >  summary_info += {'lzfse support': liblzfse}
> >  summary_info += {'zstd support':  zstd}
> > +summary_info += {'Query Processing Library support': qpl}
> >  summary_info += {'NUMA host support': numa}
> >  summary_info += {'capstone':  capstone}
> >  summary_info += {'libpmem support':   libpmem}
> > diff --git a/meson_options.txt b/meson_options.txt index
> > 3c7398f3c6..71cd533985 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -255,6 +255,8 @@ option('xkbcommon', type : 'feature', value : 'auto',
> >        description: 'xkbcommon support')
> >  option('zstd', type : 'feature', value : 'auto',
> >        description: 'zstd compression support')
> > +option('qpl', type : 'feature', value : 'auto',
> > +   description: 'Query Processing Library support')
> >  option('fuse', type: 'feature', value: 'auto

RE: [PATCH 0/5] Live Migration Acceleration with IAA Compression

2023-10-23 Thread Liu, Yuan1
> -Original Message-
> From: Juan Quintela 
> Sent: Monday, October 23, 2023 6:39 PM
> To: Liu, Yuan1 
> Cc: Daniel P.Berrangé ; Peter Xu
> ; faro...@suse.de; leob...@redhat.com; qemu-
> de...@nongnu.org; Zou, Nanhai 
> Subject: Re: [PATCH 0/5] Live Migration Acceleration with IAA Compression
> 
> "Liu, Yuan1"  wrote:
> >> -Original Message-
> >> From: Daniel P. Berrangé 
> >> Sent: Thursday, October 19, 2023 11:32 PM
> >> To: Peter Xu 
> >> Cc: Juan Quintela ; Liu, Yuan1
> >> ; faro...@suse.de; leob...@redhat.com; qemu-
> >> de...@nongnu.org; Zou, Nanhai 
> >> Subject: Re: [PATCH 0/5] Live Migration Acceleration with IAA
> >> Compression
> >>
> >> On Thu, Oct 19, 2023 at 11:23:31AM -0400, Peter Xu wrote:
> >> > On Thu, Oct 19, 2023 at 03:52:14PM +0100, Daniel P. Berrangé wrote:
> >> > > On Thu, Oct 19, 2023 at 01:40:23PM +0200, Juan Quintela wrote:
> >> > > > Yuan Liu  wrote:
> >> > > > > Hi,
> >> > > > >
> >> > > > > I am writing to submit a code change aimed at enhancing live
> >> > > > > migration acceleration by leveraging the compression
> >> > > > > capability of the Intel In-Memory Analytics Accelerator (IAA).
> >> > > > >
> >> > > > > Enabling compression functionality during the live migration
> >> > > > > process can enhance performance, thereby reducing downtime
> >> > > > > and network bandwidth requirements. However, this improvement
> >> > > > > comes at the cost of additional CPU resources, posing a
> >> > > > > challenge for cloud service providers in terms of resource
> >> > > > > allocation. To address this challenge, I have focused on
> >> > > > > offloading the compression
> >> overhead to the IAA hardware, resulting in performance gains.
> >> > > > >
> >> > > > > The implementation of the IAA (de)compression code is based
> >> > > > > on Intel Query Processing Library (QPL), an open-source
> >> > > > > software project designed for IAA high-level software programming.
> >> > > > >
> >> > > > > Best regards,
> >> > > > > Yuan Liu
> >> > > >
> >> > > > After reviewing the patches:
> >> > > >
> >> > > > - why are you doing this on top of old compression code, that is
> >> > > >   obsolete, deprecated and buggy
> > Some users have not enabled the multifd feature yet, but they will
> > decide whether to enable the compression feature based on the load
> > situation. So I'm wondering if, without multifd, the compression
> > functionality will no longer be available?
> 
> Next pull request will deprecate it.  So in two versions is going to be gone.
> 
> >> > > > - why are you not doing it on top of multifd.
> 
> > I plan to submit the support for multifd independently because the
> > multifd compression and legacy compression code are separate.
> 
> compression code is really buggy.  I think you should not even try to work on
> top of it.
Sure, I will focus on multifd compression in the future.

> > I looked at the code of multifd about compression. Currently, it uses
> > the CPU synchronous compression mode. Since it is best to use the
> > asynchronous processing method of the hardware accelerator, I would
> > like to get suggestions on the asynchronous implementation.
> 
> I did that on a previous comment.
> Several questions:
> 
> - you are using zlib, right?  When I tested, the longer streams you
>   have, the better compression you get. right?
>   Is there a way to "continue" with the state of the previous job?
> 
>   Old compression code, generates a new context for every packet.
>   Multifd generates a new zlib context for each connection.
Sorry, I'm not familiar with zlib development.
In most cases, the longer the input data, the higher the compression ratio;
one reason is that longer data can be encoded more efficiently.
Deflate compression has two phases, LZ77 followed by Huffman coding. As far
as I know, zlib can use either a static Huffman table or a dynamic Huffman
table; the former has higher throughput and the latter a higher compression
ratio, but the user cannot supply his own Huffman table.
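
For reference, the closest knob standard zlib exposes is the deflate
strategy: Z_FIXED forces the static table, but there is no call to load a
custom pre-built table. A minimal, untested sketch:

    #include <assert.h>
    #include <zlib.h>

    /* zlib only lets us choose a strategy, not supply a table.  Z_FIXED
     * forces the static Huffman table (throughput); Z_DEFAULT_STRATEGY
     * lets deflate pick static or dynamic per block (ratio). */
    static void init_fixed_huffman(z_stream *strm)
    {
        int ret;

        strm->zalloc = Z_NULL;
        strm->zfree = Z_NULL;
        strm->opaque = Z_NULL;
        ret = deflateInit2(strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                           15 /* windowBits */, 8 /* memLevel */, Z_FIXED);
        assert(ret == Z_OK);
    }

And if I understand the "continue" question correctly: as long as each
channel keeps its z_stream across packets and flushes with Z_SYNC_FLUSH
rather than Z_FINISH, the LZ77 window state carries over from one packet to
the next, independently of the Huffman table choice.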
IAA, by contrast, can support a user-supplied table: it has a mode (canned
mode) in which compression uses a user-generated Huffman table to improve
the compression ratio, th

RE: [PATCH 0/5] Live Migration Acceleration with IAA Compression

2023-10-23 Thread Liu, Yuan1
> -Original Message-
> From: Juan Quintela 
> Sent: Monday, October 23, 2023 6:48 PM
> To: Daniel P.Berrangé 
> Cc: Liu, Yuan1 ; Peter Xu ;
> faro...@suse.de; leob...@redhat.com; qemu-devel@nongnu.org; Zou,
> Nanhai 
> Subject: Re: [PATCH 0/5] Live Migration Acceleration with IAA Compression
> 
> Daniel P. Berrangé  wrote:
> > On Mon, Oct 23, 2023 at 08:33:44AM +, Liu, Yuan1 wrote:
> >> > -Original Message-
> >> > From: Daniel P. Berrangé 
> >> > Sent: Thursday, October 19, 2023 11:32 PM
> >> > To: Peter Xu 
> >> > Cc: Juan Quintela ; Liu, Yuan1
> >> > ; faro...@suse.de; leob...@redhat.com; qemu-
> >> > de...@nongnu.org; Zou, Nanhai 
> >> > Subject: Re: [PATCH 0/5] Live Migration Acceleration with IAA
> >> > Compression
> >> >
> >> > On Thu, Oct 19, 2023 at 11:23:31AM -0400, Peter Xu wrote:
> >> > > On Thu, Oct 19, 2023 at 03:52:14PM +0100, Daniel P. Berrangé wrote:
> >> > > > On Thu, Oct 19, 2023 at 01:40:23PM +0200, Juan Quintela wrote:
> >> > > > > Yuan Liu  wrote:
> >> > > > > > Hi,
> >> > > > > >
> >> > > > > > I am writing to submit a code change aimed at enhancing
> >> > > > > > live migration acceleration by leveraging the compression
> >> > > > > > capability of the Intel In-Memory Analytics Accelerator (IAA).
> >> > > > > >
> >> > > > > > Enabling compression functionality during the live
> >> > > > > > migration process can enhance performance, thereby reducing
> >> > > > > > downtime and network bandwidth requirements. However, this
> >> > > > > > improvement comes at the cost of additional CPU resources,
> >> > > > > > posing a challenge for cloud service providers in terms of
> >> > > > > > resource allocation. To address this challenge, I have
> >> > > > > > focused on offloading the compression
> >> > overhead to the IAA hardware, resulting in performance gains.
> >> > > > > >
> >> > > > > > The implementation of the IAA (de)compression code is based
> >> > > > > > on Intel Query Processing Library (QPL), an open-source
> >> > > > > > software project designed for IAA high-level software
> programming.
> >> > > > >
> >> > > > > After reviewing the patches:
> >> > > > >
> >> > > > > - why are you doing this on top of old compression code, that is
> >> > > > >   obsolete, deprecated and buggy
> >> Some users have not enabled the multifd feature yet, but they will
> >> decide whether to enable the compression feature based on the load
> >> situation. So I'm wondering if, without multifd, the compression
> >> functionality will no longer be available?
> >>
> >> > > > > - why are you not doing it on top of multifd.
> >> I plan to submit the support for multifd independently because the
> >> multifd compression and legacy compression code are separate.
> >
> > So the core question her (for migration maintainers) is whether
> > contributors should be spending any time at all on non-multifd code,
> > or if new features should be exclusively for multifd ?
> 
> Only for multifd.
> 
> Comparison right now:
> - compression (can be done better in multifd)
> - plain precopy (we can satturate faster networks with multifd)
> - xbzrle: right now only non-multifd (plan to add as another multifd
>   compression method)
> - exec: This is a hard one.  Fabiano is about to submit a file based
> multifd method.  Advantages over exec:
>   * much less space used (it writes each page at the right
> position, no overhead and never the same page on the two
> streams)
>   * We can give proper errors, exec is very bad when the exec'd
> process gives an error.
> Disadvantages:
>   * libvirt (or any management app) needs to wait for
> compression to end, and launch the exec command by hand.
> I wanted to discuss this with libvirt, if it would be
> possible to remove the use of exec compression.
> - rdma: This is a hard one
> Current implementation is a mess
> It is almost un-maintained
> There are two-three years old patches to move it on

RE: [PATCH 0/5] Live Migration Acceleration with IAA Compression

2023-10-23 Thread Liu, Yuan1
> -Original Message-
> From: Daniel P. Berrangé 
> Sent: Monday, October 23, 2023 6:30 PM
> To: Liu, Yuan1 
> Cc: Peter Xu ; Juan Quintela ;
> faro...@suse.de; leob...@redhat.com; qemu-devel@nongnu.org; Zou,
> Nanhai 
> Subject: Re: [PATCH 0/5] Live Migration Acceleration with IAA Compression
> 
> On Mon, Oct 23, 2023 at 08:33:44AM +, Liu, Yuan1 wrote:
> > > -Original Message-
> > > From: Daniel P. Berrangé 
> > > Sent: Thursday, October 19, 2023 11:32 PM
> > > To: Peter Xu 
> > > Cc: Juan Quintela ; Liu, Yuan1
> > > ; faro...@suse.de; leob...@redhat.com; qemu-
> > > de...@nongnu.org; Zou, Nanhai 
> > > Subject: Re: [PATCH 0/5] Live Migration Acceleration with IAA
> > > Compression
> > >
> > > On Thu, Oct 19, 2023 at 11:23:31AM -0400, Peter Xu wrote:
> > > > On Thu, Oct 19, 2023 at 03:52:14PM +0100, Daniel P. Berrangé wrote:
> > > > > On Thu, Oct 19, 2023 at 01:40:23PM +0200, Juan Quintela wrote:
> > > > > > Yuan Liu  wrote:
> > > > > > > Hi,
> > > > > > >
> > > > > > > I am writing to submit a code change aimed at enhancing live
> > > > > > > migration acceleration by leveraging the compression
> > > > > > > capability of the Intel In-Memory Analytics Accelerator (IAA).
> > > > > > >
> > > > > > > Enabling compression functionality during the live migration
> > > > > > > process can enhance performance, thereby reducing downtime
> > > > > > > and network bandwidth requirements. However, this
> > > > > > > improvement comes at the cost of additional CPU resources,
> > > > > > > posing a challenge for cloud service providers in terms of
> > > > > > > resource allocation. To address this challenge, I have
> > > > > > > focused on offloading the compression
> > > overhead to the IAA hardware, resulting in performance gains.
> > > > > > >
> > > > > > > The implementation of the IAA (de)compression code is based
> > > > > > > on Intel Query Processing Library (QPL), an open-source
> > > > > > > software project designed for IAA high-level software programming.
> > > > > >
> > > > > > After reviewing the patches:
> > > > > >
> > > > > > - why are you doing this on top of old compression code, that is
> > > > > >   obsolete, deprecated and buggy
> > Some users have not enabled the multifd feature yet, but they will decide
> whether to enable the compression feature based on the load situation. So I'm
> wondering if, without multifd, the compression functionality will no longer be
> available?
> >
> > > > > > - why are you not doing it on top of multifd.
> > I plan to submit the support for multifd independently because the
> > multifd compression and legacy compression code are separate.
> 
> So the core question her (for migration maintainers) is whether contributors
> should be spending any time at all on non-multifd code, or if new features
> should be exclusively for multifd ?
> 
> I doesn't make a lot of sense over the long term to have people spending time
> implementing the same features twice. IOW, should we be directly contributors
> explicitly towards multifd only, and even consider deprecating non-multifd 
> code
> at some time ?
> 
> > > > > I'm not sure that is ideal approach.  IIUC, the IAA/QPL library
> > > > > is not defining a new compression format. Rather it is providing
> > > > > a hardware accelerator for 'deflate' format, as can be made
> > > > > compatible with zlib:
> > > > >
> > > > >
> > > > > https://intel.github.io/qpl/documentation/dev_guide_docs/c_use_c
> > > > > ases
> > > > > /deflate/c_deflate_zlib_gzip.html#zlib-and-gzip-compatibility-re
> > > > > fere
> > > > > nce-link
> > > > >
> > > > > With multifd we already have a 'zlib' compression format, and so
> > > > > this IAA/QPL logic would effectively just be a providing a
> > > > > second implementation of zlib.
> > > > >
> > > > > Given the use of a standard format, I would expect to be able to
> > > > > use software zlib on the src, mixed with IAA/QPL zlib on the
> > > > > target, or vica-ve

RE: [PATCH 0/5] Live Migration Acceleration with IAA Compression

2023-10-23 Thread Liu, Yuan1
> -Original Message-
> From: Daniel P. Berrangé 
> Sent: Thursday, October 19, 2023 11:32 PM
> To: Peter Xu 
> Cc: Juan Quintela ; Liu, Yuan1
> ; faro...@suse.de; leob...@redhat.com; qemu-
> de...@nongnu.org; Zou, Nanhai 
> Subject: Re: [PATCH 0/5] Live Migration Acceleration with IAA Compression
> 
> On Thu, Oct 19, 2023 at 11:23:31AM -0400, Peter Xu wrote:
> > On Thu, Oct 19, 2023 at 03:52:14PM +0100, Daniel P. Berrangé wrote:
> > > On Thu, Oct 19, 2023 at 01:40:23PM +0200, Juan Quintela wrote:
> > > > Yuan Liu  wrote:
> > > > > Hi,
> > > > >
> > > > > I am writing to submit a code change aimed at enhancing live
> > > > > migration acceleration by leveraging the compression capability
> > > > > of the Intel In-Memory Analytics Accelerator (IAA).
> > > > >
> > > > > Enabling compression functionality during the live migration
> > > > > process can enhance performance, thereby reducing downtime and
> > > > > network bandwidth requirements. However, this improvement comes
> > > > > at the cost of additional CPU resources, posing a challenge for
> > > > > cloud service providers in terms of resource allocation. To
> > > > > address this challenge, I have focused on offloading the compression
> overhead to the IAA hardware, resulting in performance gains.
> > > > >
> > > > > The implementation of the IAA (de)compression code is based on
> > > > > Intel Query Processing Library (QPL), an open-source software
> > > > > project designed for IAA high-level software programming.
> > > > >
> > > > > Best regards,
> > > > > Yuan Liu
> > > >
> > > > After reviewing the patches:
> > > >
> > > > - why are you doing this on top of old compression code, that is
> > > >   obsolete, deprecated and buggy
Some users have not enabled the multifd feature yet, but they will decide
whether to enable the compression feature based on the load situation. So
I'm wondering whether, without multifd, the compression functionality will
no longer be available?

> > > > - why are you not doing it on top of multifd.
I plan to submit the support for multifd independently because the multifd 
compression and legacy compression code are separate.

I looked at the multifd compression code. Currently, it uses a synchronous
CPU compression mode. Since it is best to use the asynchronous processing
method of the hardware accelerator, I would like to get suggestions on the
asynchronous implementation. I see two options; a sketch of option 2
follows below.

1. Pipeline dirty-page scanning with compression: the main migration thread
submits compression tasks to the hardware, and the multifd threads only
handle the transmission of compressed pages.
2. Pipeline data sending with compression: the multifd threads submit
compression tasks to the hardware and then transmit the compressed data. (A
multifd thread job may need to transmit compressed data multiple times.)
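
To make option 2 concrete, here is a rough, untested sketch against the
public QPL job API (qpl_submit_job()/qpl_check_job()); job allocation,
output-buffer setup and error paths are omitted, and the multifd field
names are only illustrative:

    #include "qpl/qpl.h"

    /* Option 2 sketch: a multifd thread submits one compression job per
     * page to the IAA hardware, then reaps completions and queues the
     * outputs for sending.  jobs[i] are assumed initialized with output
     * buffers already attached. */
    static int iaa_compress_pages(MultiFDSendParams *p, qpl_job **jobs)
    {
        for (uint32_t i = 0; i < p->normal_num; i++) {
            qpl_job *job = jobs[i];

            job->op = qpl_op_compress;
            job->next_in_ptr = p->pages->block->host + p->normal[i];
            job->available_in = p->page_size;
            job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST |
                         QPL_FLAG_DYNAMIC_HUFFMAN;
            if (qpl_submit_job(job) != QPL_STS_OK) { /* async submit */
                return -1;
            }
        }
        for (uint32_t i = 0; i < p->normal_num; i++) {
            /* busy-poll for completion; a real version would bound this
             * or yield instead of spinning */
            while (qpl_check_job(jobs[i]) == QPL_STS_BEING_PROCESSED) {
            }
            /* queue jobs[i]->next_out_ptr / jobs[i]->total_out for sending */
        }
        return 0;
    }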

> > > > You just need to add another compression method on top of multifd.
> > > > See how it was done for zstd:
Yes, I will refer to the zstd support to implement multifd compression with IAA.
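
For the hook point itself, I expect it to mirror multifd-zstd.c; a sketch
(the multifd_qpl_* callbacks are placeholders, and whether this registers
into the existing zlib slot or a new method is still an open question):

    /* modeled on migration/multifd-zstd.c */
    static MultiFDMethods multifd_qpl_ops = {
        .send_setup = multifd_qpl_send_setup,
        .send_cleanup = multifd_qpl_send_cleanup,
        .send_prepare = multifd_qpl_send_prepare,
        .recv_setup = multifd_qpl_recv_setup,
        .recv_cleanup = multifd_qpl_recv_cleanup,
        .recv_pages = multifd_qpl_recv_pages,
    };

    static void multifd_qpl_register(void)
    {
        /* same pattern as migration_init(multifd_zstd_register) */
        multifd_register_ops(MULTIFD_COMPRESSION_ZLIB, &multifd_qpl_ops);
    }
    migration_init(multifd_qpl_register);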

> > > I'm not sure that is ideal approach.  IIUC, the IAA/QPL library is
> > > not defining a new compression format. Rather it is providing a
> > > hardware accelerator for 'deflate' format, as can be made compatible
> > > with zlib:
> > >
> > >
> > > https://intel.github.io/qpl/documentation/dev_guide_docs/c_use_cases
> > > /deflate/c_deflate_zlib_gzip.html#zlib-and-gzip-compatibility-refere
> > > nce-link
> > >
> > > With multifd we already have a 'zlib' compression format, and so
> > > this IAA/QPL logic would effectively just be a providing a second
> > > implementation of zlib.
> > >
> > > Given the use of a standard format, I would expect to be able to use
> > > software zlib on the src, mixed with IAA/QPL zlib on the target, or
> > > vica-verca.
> > >
> > > IOW, rather than defining a new compression format for this, I think
> > > we could look at a new migration parameter for
> > >
> > > "compression-accelerator": ["auto", "none", "qpl"]
> > >
> > > with 'auto' the default, such that we can automatically enable
> > > IAA/QPL when 'zlib' format is requested, if running on a suitable
> > > host.
> >
> > I was also curious about the format of compr