[PATCH v1 2/2] system/cpus: Fix resume_all_vcpus() under vCPU hotplug condition

2024-03-17 Thread Keqian Zhu via
When a vCPU is hotplugged, qemu_init_vcpu() is called. In this
function, we set the vCPU state to stopped and then wait for the
vCPU thread to be created.

Because the vCPU state is stopped, the new thread notifies us that
it has been created and then waits on halt_cond. After the vCPU
object has been realized, we resume the vCPU thread.

However, while we wait for the vCPU thread to be created, the BQL
is unlocked, and another thread may call resume_all_vcpus(), which
would resume the not-yet-realized vCPU.

Fix the issue by filtering out unrealized vCPUs in
resume_all_vcpus().
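
To make the race window concrete, here is a minimal sketch of the wait
described above. It paraphrases qemu_init_vcpu() rather than quoting it;
the helper and field names are assumptions, not the exact QEMU code.

/* Minimal sketch of qemu_init_vcpu(); illustrative only. */
void qemu_init_vcpu_sketch(CPUState *cpu)
{
    cpu->stopped = true;          /* vCPU is not runnable yet */
    create_vcpu_thread(cpu);      /* accel-specific thread creation */
    while (!cpu->created) {
        /* The BQL is dropped while waiting here, so another thread can
         * call resume_all_vcpus() and resume this unrealized vCPU. */
        qemu_cond_wait(&qemu_cpu_cond, &bql);
    }
}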

Signed-off-by: Keqian Zhu 
---
 system/cpus.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/system/cpus.c b/system/cpus.c
index 4e41abe23e..8871f5dfa9 100644
--- a/system/cpus.c
+++ b/system/cpus.c
@@ -638,6 +638,9 @@ void resume_all_vcpus(void)
 
 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
 CPU_FOREACH(cpu) {
+if (!object_property_get_bool(OBJECT(cpu), "realized", &error_abort)) {
+continue;
+}
 cpu_resume(cpu);
 }
 }
-- 
2.33.0




[PATCH v1 1/2] system/cpus: Fix pause_all_vcpus() under concurrent environment

2024-03-17 Thread Keqian Zhu via
Both the main loop thread and vCPU threads are allowed to call
pause_all_vcpus(), and in general resume_all_vcpus() is called
after it. Two issues exist in pause_all_vcpus():

1. While thread T1 waits on qemu_pause_cond with the BQL unlocked,
another thread may call pause_all_vcpus() and resume_all_vcpus();
thread T1 then gets stuck, because the condition all_vcpus_paused()
remains false forever.

2. After all_vcpus_paused() has been observed as true, we unlock
the BQL in order to lock replay_mutex. While the BQL was unlocked,
the vCPU state may have been changed by another thread, so we
must retry.
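
An illustrative interleaving for issue 1 (the thread names are
hypothetical, used only to make the race concrete):

/*
 *   T1 (e.g. a vCPU thread)            T2 (e.g. the main loop thread)
 *   pause_all_vcpus()
 *     request stop, drop the BQL,
 *     wait on qemu_pause_cond
 *                                      pause_all_vcpus()  -> all vCPUs paused
 *                                      resume_all_vcpus() -> all vCPUs running
 *     wakes up, all_vcpus_paused()
 *     is false and stays false         -> T1 waits forever
 */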

Signed-off-by: Keqian Zhu 
---
 system/cpus.c | 29 -
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/system/cpus.c b/system/cpus.c
index 68d161d96b..4e41abe23e 100644
--- a/system/cpus.c
+++ b/system/cpus.c
@@ -571,12 +571,14 @@ static bool all_vcpus_paused(void)
 return true;
 }
 
-void pause_all_vcpus(void)
+static void request_pause_all_vcpus(void)
 {
 CPUState *cpu;
 
-qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
 CPU_FOREACH(cpu) {
+if (cpu->stopped) {
+continue;
+}
 if (qemu_cpu_is_self(cpu)) {
 qemu_cpu_stop(cpu, true);
 } else {
@@ -584,6 +586,14 @@ void pause_all_vcpus(void)
 qemu_cpu_kick(cpu);
 }
 }
+}
+
+void pause_all_vcpus(void)
+{
+qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
+
+retry:
+request_pause_all_vcpus();
 
 /* We need to drop the replay_lock so any vCPU threads woken up
  * can finish their replay tasks
@@ -592,14 +602,23 @@ void pause_all_vcpus(void)
 
 while (!all_vcpus_paused()) {
 qemu_cond_wait(&qemu_pause_cond, &bql);
-CPU_FOREACH(cpu) {
-qemu_cpu_kick(cpu);
-}
+/* While we waited on qemu_pause_cond the bql was unlocked,
+ * the vcpu's state may have been changed by another thread, so
+ * we must request the pause state on all vcpus again.
+ */
+request_pause_all_vcpus();
 }
 
 bql_unlock();
 replay_mutex_lock();
 bql_lock();
+
+/* While the bql was unlocked, the vcpu's state may have been
+ * changed by another thread, so we must retry.
+ */
+if (!all_vcpus_paused()) {
+goto retry;
+}
 }
 
 void cpu_resume(CPUState *cpu)
-- 
2.33.0




[PATCH v1 0/2] Some fixes for pause and resume all vcpus

2024-03-17 Thread Keqian Zhu via
I hit these bugs while testing the RFC patches for the ARM vCPU hotplug feature.
The fixes have been verified.

Keqian Zhu (2):
  system/cpus: Fix pause_all_vcpus() under concurrent environment
  system/cpus: Fix resume_all_vcpus() under vCPU hotplug condition

 system/cpus.c | 32 +++-
 1 file changed, 27 insertions(+), 5 deletions(-)

-- 
2.33.0




[PATCH] virtio-gpu: Optimize 2D resource data transfer

2023-06-11 Thread Keqian Zhu via
The following two changes can sometimes greatly reduce the amount
of data to copy:
1. When the width matches, we can transfer the data with a single
call of iov_to_buf().
2. Only the required height needs to be transferred, not the
whole image.
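
For a rough sense of the saving, an illustrative calculation with assumed
values (a full-width transfer of 100 lines at y = 200 on a resource with a
4096-byte stride; none of these numbers come from the patch):

/* Illustrative arithmetic with assumed values. */
uint32_t stride = 4096, bpp = 4;
struct { uint32_t x, y, width, height; } r = { 0, 200, 1024, 100 };

size_t dst_offset = r.y * stride + r.x * bpp;  /* 819200 */
size_t copy_len   = stride * r.height;         /* 409600 bytes, one iov_to_buf() */
/* Before the patch this transfer took the per-line path (because r.y != 0),
 * i.e. 100 separate iov_to_buf() calls of r.width * bpp bytes each. */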

Signed-off-by: Keqian Zhu 
---
 hw/display/virtio-gpu.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 66cddd94d9..af31018ab0 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -438,11 +438,11 @@ static void virtio_gpu_transfer_to_host_2d(VirtIOGPU *g,
struct virtio_gpu_ctrl_command *cmd)
 {
 struct virtio_gpu_simple_resource *res;
-int h;
+int h, bpp;
 uint32_t src_offset, dst_offset, stride;
-int bpp;
 pixman_format_code_t format;
 struct virtio_gpu_transfer_to_host_2d t2d;
+void *img_data;
 
 VIRTIO_GPU_FILL_CMD(t2d);
 virtio_gpu_t2d_bswap();
@@ -471,23 +471,23 @@ static void virtio_gpu_transfer_to_host_2d(VirtIOGPU *g,
 format = pixman_image_get_format(res->image);
 bpp = DIV_ROUND_UP(PIXMAN_FORMAT_BPP(format), 8);
 stride = pixman_image_get_stride(res->image);
+img_data = pixman_image_get_data(res->image);
 
-if (t2d.offset || t2d.r.x || t2d.r.y ||
-t2d.r.width != pixman_image_get_width(res->image)) {
-void *img_data = pixman_image_get_data(res->image);
+if (t2d.r.x || t2d.r.width != pixman_image_get_width(res->image)) {
 for (h = 0; h < t2d.r.height; h++) {
 src_offset = t2d.offset + stride * h;
 dst_offset = (t2d.r.y + h) * stride + (t2d.r.x * bpp);
 
 iov_to_buf(res->iov, res->iov_cnt, src_offset,
-   (uint8_t *)img_data
-   + dst_offset, t2d.r.width * bpp);
+   (uint8_t *)img_data + dst_offset,
+   t2d.r.width * bpp);
 }
 } else {
-iov_to_buf(res->iov, res->iov_cnt, 0,
-   pixman_image_get_data(res->image),
-   pixman_image_get_stride(res->image)
-   * pixman_image_get_height(res->image));
+src_offset = t2d.offset;
+dst_offset = t2d.r.y * stride + t2d.r.x * bpp;
+iov_to_buf(res->iov, res->iov_cnt, src_offset,
+   (uint8_t *)img_data + dst_offset,
+   stride * t2d.r.height);
 }
 }
 
-- 
2.20.1




[PATCH v2] hw/acpi: Add ospm_status hook implementation for acpi-ged

2022-08-16 Thread Keqian Zhu via
Setting up an ARM virtual machine of the virt machine type and executing the QMP
command "query-acpi-ospm-status" causes a segmentation fault with the following
dumpstack:
 #1  0xab64235c in qmp_query_acpi_ospm_status 
(errp=errp@entry=0xf030) at ../monitor/qmp-cmds.c:312
 #2  0xabfc4e20 in qmp_marshal_query_acpi_ospm_status (args=, ret=0xea4ffe90, errp=0xea4ffe88) at qapi/qapi-commands-acpi.c:63
 #3  0xabff8ba0 in do_qmp_dispatch_bh (opaque=0xea4ffe98) at 
../qapi/qmp-dispatch.c:128
 #4  0xac02e594 in aio_bh_call (bh=0xe0004d80) at 
../util/async.c:150
 #5  aio_bh_poll (ctx=ctx@entry=0xad0f6040) at ../util/async.c:178
 #6  0xac00bd40 in aio_dispatch (ctx=ctx@entry=0xad0f6040) at 
../util/aio-posix.c:421
 #7  0xac02e010 in aio_ctx_dispatch (source=0xad0f6040, 
callback=, user_data=) at ../util/async.c:320
 #8  0xf76f6884 in g_main_context_dispatch () at 
/usr/lib64/libglib-2.0.so.0
 #9  0xac0452d4 in glib_pollfds_poll () at ../util/main-loop.c:297
 #10 os_host_main_loop_wait (timeout=0) at ../util/main-loop.c:320
 #11 main_loop_wait (nonblocking=nonblocking@entry=0) at ../util/main-loop.c:596
 #12 0xab5c9e50 in qemu_main_loop () at ../softmmu/runstate.c:734
 #13 0xab185370 in qemu_main (argc=argc@entry=47, 
argv=argv@entry=0xf518, envp=envp@entry=0x0) at ../softmmu/main.c:38
 #14 0xab16f99c in main (argc=47, argv=0xf518) at 
../softmmu/main.c:47
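
For context, a simplified sketch of the call path that crashes; it
paraphrases the QMP handler of that era, so treat the exact shape as an
assumption. The handler invokes the AcpiDeviceIf hook unconditionally, and
acpi-ged left it NULL:

/* Simplified sketch of qmp_query_acpi_ospm_status(); illustrative only. */
Object *obj = object_resolve_path_type("", TYPE_ACPI_DEVICE_IF, &ambig);
if (obj) {
    AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
    AcpiDeviceIf *adev = ACPI_DEVICE_IF(obj);

    adevc->ospm_status(adev, &prev);  /* NULL hook for acpi-ged before this patch */
}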

Fixes: ebb62075021a ("hw/acpi: Add ACPI Generic Event Device Support")
Signed-off-by: Keqian Zhu 
---
 hw/acpi/generic_event_device.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index e28457a7d1..a3d31631fe 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -267,6 +267,13 @@ static void acpi_ged_unplug_cb(HotplugHandler *hotplug_dev,
 }
 }
 
+static void acpi_ged_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list)
+{
+AcpiGedState *s = ACPI_GED(adev);
+
+acpi_memory_ospm_status(&s->memhp_state, list);
+}
+
 static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev)
 {
 AcpiGedState *s = ACPI_GED(adev);
@@ -409,6 +416,7 @@ static void acpi_ged_class_init(ObjectClass *class, void 
*data)
 hc->unplug_request = acpi_ged_unplug_request_cb;
 hc->unplug = acpi_ged_unplug_cb;
 
+adevc->ospm_status = acpi_ged_ospm_status;
 adevc->send_event = acpi_ged_send_event;
 }
 
-- 
2.33.0




[PATCH] acpi_ged: Add ospm_status hook implementation

2022-08-16 Thread Keqian Zhu via
This fixes a bug that causes a segmentation fault with the following dumpstack:
 #1  0xab64235c in qmp_query_acpi_ospm_status 
(errp=errp@entry=0xf030) at ../monitor/qmp-cmds.c:312
 #2  0xabfc4e20 in qmp_marshal_query_acpi_ospm_status (args=, ret=0xea4ffe90, errp=0xea4ffe88) at qapi/qapi-commands-acpi.c:63
 #3  0xabff8ba0 in do_qmp_dispatch_bh (opaque=0xea4ffe98) at 
../qapi/qmp-dispatch.c:128
 #4  0xac02e594 in aio_bh_call (bh=0xe0004d80) at 
../util/async.c:150
 #5  aio_bh_poll (ctx=ctx@entry=0xad0f6040) at ../util/async.c:178
 #6  0xac00bd40 in aio_dispatch (ctx=ctx@entry=0xad0f6040) at 
../util/aio-posix.c:421
 #7  0xac02e010 in aio_ctx_dispatch (source=0xad0f6040, 
callback=, user_data=) at ../util/async.c:320
 #8  0xf76f6884 in g_main_context_dispatch () at 
/usr/lib64/libglib-2.0.so.0
 #9  0xac0452d4 in glib_pollfds_poll () at ../util/main-loop.c:297
 #10 os_host_main_loop_wait (timeout=0) at ../util/main-loop.c:320
 #11 main_loop_wait (nonblocking=nonblocking@entry=0) at ../util/main-loop.c:596
 #12 0xab5c9e50 in qemu_main_loop () at ../softmmu/runstate.c:734
 #13 0xab185370 in qemu_main (argc=argc@entry=47, 
argv=argv@entry=0xf518, envp=envp@entry=0x0) at ../softmmu/main.c:38
 #14 0xab16f99c in main (argc=47, argv=0xf518) at 
../softmmu/main.c:47

Fixes: ebb62075021a ("hw/acpi: Add ACPI Generic Event Device Support")
Signed-off-by: Keqian Zhu 
---
 hw/acpi/generic_event_device.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index e28457a7d1..a3d31631fe 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -267,6 +267,13 @@ static void acpi_ged_unplug_cb(HotplugHandler *hotplug_dev,
 }
 }
 
+static void acpi_ged_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list)
+{
+AcpiGedState *s = ACPI_GED(adev);
+
+acpi_memory_ospm_status(&s->memhp_state, list);
+}
+
 static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev)
 {
 AcpiGedState *s = ACPI_GED(adev);
@@ -409,6 +416,7 @@ static void acpi_ged_class_init(ObjectClass *class, void 
*data)
 hc->unplug_request = acpi_ged_unplug_request_cb;
 hc->unplug = acpi_ged_unplug_cb;
 
+adevc->ospm_status = acpi_ged_ospm_status;
 adevc->send_event = acpi_ged_send_event;
 }
 
-- 
2.33.0




[PATCH] vmstate: Constify some VMStateDescriptions

2021-04-08 Thread Keqian Zhu
Constify vmstate_ecc_state and vmstate_x86_cpu.

Signed-off-by: Keqian Zhu 
---
 hw/block/ecc.c   | 2 +-
 include/hw/block/flash.h | 2 +-
 target/i386/cpu.h| 2 +-
 target/i386/machine.c| 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/block/ecc.c b/hw/block/ecc.c
index 1a182367ee..6e0d63842c 100644
--- a/hw/block/ecc.c
+++ b/hw/block/ecc.c
@@ -78,7 +78,7 @@ void ecc_reset(ECCState *s)
 }
 
 /* Save/restore */
-VMStateDescription vmstate_ecc_state = {
+const VMStateDescription vmstate_ecc_state = {
 .name = "ecc-state",
 .version_id = 0,
 .minimum_version_id = 0,
diff --git a/include/hw/block/flash.h b/include/hw/block/flash.h
index 7dde0adcee..86d8363bb0 100644
--- a/include/hw/block/flash.h
+++ b/include/hw/block/flash.h
@@ -74,6 +74,6 @@ typedef struct {
 
 uint8_t ecc_digest(ECCState *s, uint8_t sample);
 void ecc_reset(ECCState *s);
-extern VMStateDescription vmstate_ecc_state;
+extern const VMStateDescription vmstate_ecc_state;
 
 #endif
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 570f916878..1bc300ce85 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1786,7 +1786,7 @@ struct X86CPU {
 
 
 #ifndef CONFIG_USER_ONLY
-extern VMStateDescription vmstate_x86_cpu;
+extern const VMStateDescription vmstate_x86_cpu;
 #endif
 
 int x86_cpu_pending_interrupt(CPUState *cs, int interrupt_request);
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 137604ddb8..f6f094f1c9 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -1396,7 +1396,7 @@ static const VMStateDescription vmstate_msr_tsx_ctrl = {
 }
 };
 
-VMStateDescription vmstate_x86_cpu = {
+const VMStateDescription vmstate_x86_cpu = {
 .name = "cpu",
 .version_id = 12,
 .minimum_version_id = 11,
-- 
2.19.1




Re: [PATCH 0/3] hw: Constify VMStateDescription

2021-04-08 Thread Keqian Zhu



On 2021/4/8 21:22, Philippe Mathieu-Daudé wrote:
> On 3/15/21 10:05 AM, Keqian Zhu wrote:
>> Hi Philippe,
>>
>> It seems that vmstate_ecc_state and vmstate_x86_cpu can also be constified.
>> Found by .
> 
> Correct (I only searched for the static ones).
> 
> Do you mind sending a patch?
OK, I'll do. :)

BRs,
Keqian

> 
>>
>> Thanks,
>> Keqian
>>
>> On 2021/3/14 1:11, Philippe Mathieu-Daudé wrote:
>>> VMStateDescription isn't supposed to be modified.
> .
> 



Re: [PATCH v5 00/10] KVM: Dirty ring support (QEMU part)

2021-03-24 Thread Keqian Zhu
Peter,

On 2021/3/24 23:09, Peter Xu wrote:
> On Wed, Mar 24, 2021 at 10:56:22AM +0800, Keqian Zhu wrote:
>> Hi Peter,
>>
>> On 2021/3/23 22:34, Peter Xu wrote:
>>> Keqian,
>>>
>>> On Tue, Mar 23, 2021 at 02:40:43PM +0800, Keqian Zhu wrote:
>>>>>> The second question is that you observed longer migration time 
>>>>>> (55s->73s) when guest
>>>>>> has 24G ram and dirty rate is 800M/s. I am not clear about the reason. 
>>>>>> As with dirty
>>>>>> ring enabled, Qemu can get dirty info faster which means it handles 
>>>>>> dirty page more
>>>>>> quick, and guest can be throttled which means dirty page is generated 
>>>>>> slower. What's
>>>>>> the rationale for the longer migration time?
>>>>>
>>>>> Because dirty ring is more sensitive to dirty rate, while dirty bitmap is 
>>>>> more
>>>> Emm... Sorry that I'm very clear about this... I think that higher dirty 
>>>> rate doesn't cause
>>>> slower dirty_log_sync compared to that of legacy bitmap mode. Besides, 
>>>> higher dirty rate
>>>> means we may have more full-exit, which can properly limit the dirty rate. 
>>>> So it seems that
>>>> dirty ring "prefers" higher dirty rate.
>>>
>>> When I measured the 800MB/s it's in the guest, after throttling.
>>>
>>> Imagine another example: a VM has 1G memory keep dirtying with 10GB/s.  
>>> Dirty
>>> logging will need to collect even less for each iteration because memory 
>>> size
>>> shrinked, collect even less frequent due to the high dirty rate, however 
>>> dirty
>>> ring will use 100% cpu power to collect dirty pages because the ring keeps 
>>> full.
>> Looks good.
>>
>> We have many places to collect dirty pages: the background reaper, vCPU exit 
>> handler,
>> and the migration thread. I think migration time is closely related to the 
>> migration thread.
>>
>> The migration thread calls kvm_dirty_ring_flush().
>> 1. kvm_cpu_synchronize_kick_all() will wait vcpu handles full-exit.
>> 2. kvm_dirty_ring_reap() collects and resets dirty pages.
>> The above two operation will spend more time with higher dirty rate.
>>
>> But I suddenly realize that the key problem maybe not at this. Though we 
>> have separate
>> "reset" operation for dirty ring, actually it is performed right after we 
>> collect dirty
>> ring to kvmslot. So in dirty ring mode, it likes legacy bitmap mode without 
>> manual_dirty_clear.
>>
>> If we can "reset" dirty ring just before we really handle the dirty pages, 
>> we can have
>> shorter migration time. But the design of dirty ring doesn't allow this, 
>> because we must
>> perform reset to make free space...
> 
> This is a very good point.
> 
> Dirty ring should have been better in quite some ways already, but from that
> pov as you said it goes a bit backwards on reprotection of pages (not to
> mention currently we can't even reset the ring per-vcpu; that seems to be not
> fully matching the full locality that the rings have provided as well; but
> Paolo and I discussed with that issue, it's about TLB flush expensiveness, so
> we still need to think more of it..).
> 
> Ideally the ring could have been both per-vcpu but also bi-directional (then
> we'll have 2*N rings, N=vcpu number), so as to split the state transition into
> "dirty ring" and "reprotect ring", then that reprotect ring will be the clear
> dirty log.  That'll look more like virtio as used ring.  However we'll still
> need to think about the TLB flush issue too as Paolo used to mention, as
> that'll exist too with any per-vcpu flush model (each reprotect of page will
> need a tlb flush of all vcpus).
> 
> Or.. maybe we can make the flush ring a standalone one, so we need N dirty 
> ring
> and one global flush ring.
Yep, having separate "reprotect" ring(s) is a good idea.

> 
> Anyway.. Before that, I'd still think the next step should be how to integrate
> qemu to fully leverage current ring model, so as to be able to throttle in
> per-vcpu fashion.
> 
> The major issue (IMHO) with huge VM migration is:
> 
>   1. Convergence
>   2. Responsiveness
> 
> Here we'll have a chance to solve (1) by highly throttle the working vcpu
> threads, meanwhile still keep (2) by not throttle user interactive threads.
> I'm not sure whether this will really work as expected, but just show what I'm
> thinking

Re: [PATCH v5 00/10] KVM: Dirty ring support (QEMU part)

2021-03-23 Thread Keqian Zhu
Hi Peter,

On 2021/3/23 22:34, Peter Xu wrote:
> Keqian,
> 
> On Tue, Mar 23, 2021 at 02:40:43PM +0800, Keqian Zhu wrote:
>>>> The second question is that you observed longer migration time (55s->73s) 
>>>> when guest
>>>> has 24G ram and dirty rate is 800M/s. I am not clear about the reason. As 
>>>> with dirty
>>>> ring enabled, Qemu can get dirty info faster which means it handles dirty 
>>>> page more
>>>> quick, and guest can be throttled which means dirty page is generated 
>>>> slower. What's
>>>> the rationale for the longer migration time?
>>>
>>> Because dirty ring is more sensitive to dirty rate, while dirty bitmap is 
>>> more
>> Emm... Sorry that I'm very clear about this... I think that higher dirty 
>> rate doesn't cause
>> slower dirty_log_sync compared to that of legacy bitmap mode. Besides, 
>> higher dirty rate
>> means we may have more full-exit, which can properly limit the dirty rate. 
>> So it seems that
>> dirty ring "prefers" higher dirty rate.
> 
> When I measured the 800MB/s it's in the guest, after throttling.
> 
> Imagine another example: a VM has 1G memory keep dirtying with 10GB/s.  Dirty
> logging will need to collect even less for each iteration because memory size
> shrinked, collect even less frequent due to the high dirty rate, however dirty
> ring will use 100% cpu power to collect dirty pages because the ring keeps 
> full.
Looks good.

We have many places that collect dirty pages: the background reaper, the vCPU
exit handler, and the migration thread. I think migration time is closely
related to the migration thread.

The migration thread calls kvm_dirty_ring_flush().
1. kvm_cpu_synchronize_kick_all() waits until every vCPU handles its full-exit.
2. kvm_dirty_ring_reap() collects and resets the dirty pages.
Both operations take more time at a higher dirty rate.

But I suddenly realize that the key problem may not be here. Though we have a
separate "reset" operation for the dirty ring, it is actually performed right
after we collect the dirty ring into the kvmslot. So dirty ring mode behaves
like legacy bitmap mode without manual_dirty_clear.

If we could "reset" the dirty ring just before we really handle the dirty
pages, we could get a shorter migration time. But the design of the dirty ring
doesn't allow this, because we must perform the reset to make free space...
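
A compact sketch of that two-step sequence, condensed from the patch quoted
later in this thread (simplified, not the exact code):

/* Simplified sketch of kvm_dirty_ring_flush(); called with the BQL held. */
static void kvm_dirty_ring_flush_sketch(KVMState *s)
{
    /* Step 1: kick every vCPU out synchronously so hardware buffers
     * (e.g. PML) are drained into the per-vCPU dirty rings. */
    kvm_cpu_synchronize_kick_all();

    /* Step 2: collect the dirtied GFNs into the per-slot dirty bitmaps
     * and reset the rings to make free space. */
    kvm_dirty_ring_reap(s);
}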

> 
>>
>>> sensitive to memory footprint.  In above 24G mem + 800MB/s dirty rate
>>> condition, dirty bitmap seems to be more efficient, say, collecting dirty
>>> bitmap of 24G mem (24G/4K/8=0.75MB) for each migration cycle is fast enough.
>>>
>>> Not to mention that current implementation of dirty ring in QEMU is not
>>> complete - we still have two more layers of dirty bitmap, so it's actually a
>>> mixture of dirty bitmap and dirty ring.  This series is more like a POC on
>>> dirty ring interface, so as to let QEMU be able to run on KVM dirty ring.
>>> E.g., we won't have hang issue when getting dirty pages since it's totally
>>> async, however we'll still have some legacy dirty bitmap issues e.g. memory
>>> consumption of userspace dirty bitmaps are still linear to memory footprint.
>> The plan looks good and coordinated, but I have a concern. Our dirty ring 
>> actually depends
>> on the structure of hardware logging buffer (PML buffer). We can't say it 
>> can be properly
>> adapted to all kinds of hardware design in the future.
> 
> Sorry I don't get it - dirty ring can work with pure page wr-protect too?
Sure, it can. I just wanted to discuss the many possible kinds of hardware
logging buffers. However, I'd like to stop here; at least the dirty ring works
well with PML. :)

> 
>>
>>>
>>> Moreover, IMHO another important feature that dirty ring provided is 
>>> actually
>>> the full-exit, where we can pause a vcpu when it dirties too fast, while 
>>> other
>> I think a proper pause time is hard to decide. Short time may have little 
>> effect
>> of throttle, but long time may have heavy effect on guest. Do you have a 
>> good algorithm?
> 
> That's the next thing we can discuss.  IMHO I think the dirty ring is nice
> already because we can measure dirty rate per-vcpu, also we can throttle in
> vcpu granule.  That's something required for a good algorithm, say we 
> shouldn't
> block vcpu when there's small dirty rate, and in many cases that's the case 
> for
> e.g. UI threads.  Any algorithm should be based on these facts.
OK.

Thanks,
Keqian



Re: [PATCH v5 00/10] KVM: Dirty ring support (QEMU part)

2021-03-23 Thread Keqian Zhu
Hi Peter,

On 2021/3/23 3:45, Peter Xu wrote:
> On Mon, Mar 22, 2021 at 10:02:38PM +0800, Keqian Zhu wrote:
>> Hi Peter,
> 
> Hi, Keqian,
> 
> [...]
> 
>> You emphasize that dirty ring is a "Thread-local buffers", but dirty bitmap 
>> is global,
>> but I don't see it has optimization about "locking" compared to dirty bitmap.
>>
>> The thread-local means that vCPU can flush hardware buffer into dirty ring 
>> without
>> locking, but for bitmap, vCPU can also use atomic set to mark dirty without 
>> locking.
>> Maybe I miss something?
> 
> Yes, the atomic ops guaranteed locking as you said, but afaiu atomics are
> expensive already, since at least on x86 I think it needs to lock the memory
> bus.  IIUC that'll become even slower as cores grow, as long as the cores 
> share
> the memory bus.
> 
> KVM dirty ring is per-vcpu, it means its metadata can be modified locally
> without atomicity at all (but still, we'll need READ_ONCE/WRITE_ONCE to
> guarantee ordering of memory accesses).  It should scale better especially 
> with
> hosts who have lots of cores.
That makes sense to me.
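
To make the contrast concrete, a rough sketch of the two write paths (purely
illustrative pseudo-C; the names and structures are made up):

/* Dirty bitmap: one bitmap shared by all vCPUs, so marking a page must
 * be atomic (a locked read-modify-write on x86). */
set_bit_atomic(page_index, shared_dirty_bitmap);

/* Dirty ring: each vCPU appends to its own ring, so plain stores with
 * ordering (READ_ONCE/WRITE_ONCE style) are enough, no bus lock. */
struct dirty_ring *ring = &this_cpu->ring;
ring->gfns[ring->head % ring->size] = gfn;
ring->head++;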

> 
>>
>> The second question is that you observed longer migration time (55s->73s) 
>> when guest
>> has 24G ram and dirty rate is 800M/s. I am not clear about the reason. As 
>> with dirty
>> ring enabled, Qemu can get dirty info faster which means it handles dirty 
>> page more
>> quick, and guest can be throttled which means dirty page is generated 
>> slower. What's
>> the rationale for the longer migration time?
> 
> Because dirty ring is more sensitive to dirty rate, while dirty bitmap is more
Emm... Sorry, I'm not very clear about this... I think that a higher dirty rate
doesn't cause slower dirty_log_sync compared to legacy bitmap mode. Besides, a
higher dirty rate means we may have more full-exits, which can properly limit
the dirty rate. So it seems that the dirty ring "prefers" a higher dirty rate.

> sensitive to memory footprint.  In above 24G mem + 800MB/s dirty rate
> condition, dirty bitmap seems to be more efficient, say, collecting dirty
> bitmap of 24G mem (24G/4K/8=0.75MB) for each migration cycle is fast enough.
> 
> Not to mention that current implementation of dirty ring in QEMU is not
> complete - we still have two more layers of dirty bitmap, so it's actually a
> mixture of dirty bitmap and dirty ring.  This series is more like a POC on
> dirty ring interface, so as to let QEMU be able to run on KVM dirty ring.
> E.g., we won't have hang issue when getting dirty pages since it's totally
> async, however we'll still have some legacy dirty bitmap issues e.g. memory
> consumption of userspace dirty bitmaps are still linear to memory footprint.
The plan looks good and coordinated, but I have a concern. Our dirty ring
actually depends on the structure of the hardware logging buffer (the PML
buffer). We can't say it can be properly adapted to all kinds of hardware
designs in the future.

> 
> Moreover, IMHO another important feature that dirty ring provided is actually
> the full-exit, where we can pause a vcpu when it dirties too fast, while other
I think a proper pause time is hard to decide. A short time may have little
throttling effect, but a long time may have a heavy effect on the guest. Do you
have a good algorithm?


> vcpus won't be affected.  That's something I really wanted to POC too but I
> don't have enough time.  I think it's a worth project in the future to really
> make the full-exit throttle vcpus, then ideally we'll remove all the dirty
> bitmaps in QEMU as long as dirty ring is on.
> 
> So I'd say the number I got at that time is not really helping a lot - as you
> can see for small VMs it won't make things faster.  Maybe a bit more 
> efficient?
> I can't tell.  From design-wise it looks actually still better.  However dirty
> logging still has the reasoning to be the default interface we use for small
> vms, imho.
I see.

> 
>>
>> PS: As the dirty ring is still converted into dirty_bitmap of kvm_slot, so 
>> the
>> "get dirty info faster" maybe not true. :-(
> 
> We can get dirty info faster even now, I think, because previously we only do
> KVM_GET_DIRTY_LOG once per migration iteration, which could be tens of seconds
> for a VM mentioned above with 24G and 800MB/s dirty rate.  Dirty ring is fully
> async, we'll get that after the reaper thread timeout.  However I must also
> confess "get dirty info faster" doesn't help us a lot on anything yet, afaict,
> comparing to a full-featured dirty logging where clear dirty log and so on.
OK.

> 
> Hope above helps.
Sure, thanks. :)


Keqian



Re: [PATCH v5 10/10] KVM: Dirty ring support

2021-03-22 Thread Keqian Zhu



On 2021/3/23 2:52, Peter Xu wrote:
> On Mon, Mar 22, 2021 at 09:37:19PM +0800, Keqian Zhu wrote:
>>> +/* Should be with all slots_lock held for the address spaces. */
>>> +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
>>> + uint32_t slot_id, uint64_t offset)
>>> +{
>>> +KVMMemoryListener *kml;
>>> +KVMSlot *mem;
>>> +
>>> +if (as_id >= s->nr_as) {
>>> +return;
>>> +}
>>> +
>>> +kml = s->as[as_id].ml;
>>> +mem = &kml->slots[slot_id];
>>> +
>>> +if (!mem->memory_size || offset >= (mem->memory_size / 
>>> TARGET_PAGE_SIZE)) {
>> It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.
> 
> Fixed.
> 
> [...]
> 
>>> +/*
>>> + * Flush all the existing dirty pages to the KVM slot buffers.  When
>>> + * this call returns, we guarantee that all the touched dirty pages
>>> + * before calling this function have been put into the per-kvmslot
>>> + * dirty bitmap.
>>> + *
>>> + * This function must be called with BQL held.
>>> + */
>>> +static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
>> The argument is not used.
> 
> Indeed, removed.
> 
>>
>>> +{
>>> +trace_kvm_dirty_ring_flush(0);
>>> +/*
>>> + * The function needs to be serialized.  Since this function
>>> + * should always be with BQL held, serialization is guaranteed.
>>> + * However, let's be sure of it.
>>> + */
>>> +assert(qemu_mutex_iothread_locked());
>>> +/*
>>> + * First make sure to flush the hardware buffers by kicking all
>>> + * vcpus out in a synchronous way.
>>> + */
>>> +kvm_cpu_synchronize_kick_all();
>> Can we make this function to be architecture specific?
>> It seems that kick out vCPU is an architecture specific way to flush 
>> hardware buffers
>> to dirty ring (for x86 PML).
> 
> I can do that, but I'd say it's kind of an overkill if after all the kernel
> support is not there yet, so I still tend to make it as simple as possible.
OK.

> 
> [...]
> 
>>> +static void *kvm_dirty_ring_reaper_thread(void *data)
>>> +{
>>> +KVMState *s = data;
>>> +struct KVMDirtyRingReaper *r = &s->reaper;
>>> +
>>> +rcu_register_thread();
>>> +
>>> +trace_kvm_dirty_ring_reaper("init");
>>> +
>>> +while (true) {
>>> +r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
>>> +trace_kvm_dirty_ring_reaper("wait");
>>> +/*
>>> + * TODO: provide a smarter timeout rather than a constant?
>>> + */
>>> +sleep(1);
>>> +
>>> +trace_kvm_dirty_ring_reaper("wakeup");
>>> +r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
>>> +
>>> +qemu_mutex_lock_iothread();
>>> +kvm_dirty_ring_reap(s);
>>> +qemu_mutex_unlock_iothread();
>>> +
>>> +r->reaper_iteration++;
>>> +}
>> I don't know when does this iteration exit?
>> And I see that we start this reaper_thread in kvm_init(), maybe it's better 
>> to start it
>> when start dirty log and stop it when stop dirty log.
> 
> Yes we can make it conditional, but note that we can't hook at functions like
> memory_global_dirty_log_start() because that is only for migration purpose.
> 
> Currently QEMU exports the dirty tracking more than that, e.g., to the VGA
> code.  We'll need to try to detect whether there's any existing MR got its
> mr->dirty_log_mask set (besides global_dirty_log being set).  When all of them
> got cleared we'll need to detect too so as to turn the thread off.
> 
> It's just easier to me to run this thread with such a timeout, then when not
> necessary it'll see empty ring and return fast (index comparison for each
> ring).  Not to mention the VGA dirty tracking should be on for most of the VM
> lifecycle, so even if we have that knob this thread will probably be running
> for 99% of the time as long as any MR has its DIRTY_MEMORY_VGA bit set.
Makes sense. Thanks for your explanation!

Thanks,
Keqian
> 
> [...]
> 
>>> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
>>> index c68bc3ba8af..2f0991d93f7 100644
>>> --- a/include/hw/core/cpu.h
>>> +++ b/include/hw/core/cpu.h
>>> @@ -323,6 +323,11 @@ struct qemu_work_item;
>>>   * @ignore_memory_transaction_failures: Cached copy of the MachineState
>>>   *flag of the same name: allows the board to suppress calling of the
>>>   *CPU do_transaction_failed hook function.
>>> + * @kvm_dirty_ring_full:
>>> + *   Whether the kvm dirty ring of this vcpu is soft-full.
>>> + * @kvm_dirty_ring_avail:
>>> + *   Semaphore to be posted when the kvm dirty ring of the vcpu is
>>> + *   available again.
>> The doc does not match code.
> 
> Right; fixed.
> 
> Thanks for taking a look, keqian.
> 



Re: [PATCH v5 00/10] KVM: Dirty ring support (QEMU part)

2021-03-22 Thread Keqian Zhu
Hi Peter,

On 2021/3/11 4:32, Peter Xu wrote:
> This is v5 of the qemu dirty ring interface support.
> 
> 
> 
> v5:
> 
> - rebase
> 
> - dropped patch "update-linux-headers: Include const.h" after rebase
> 
> - dropped patch "KVM: Fixup kvm_log_clear_one_slot() ioctl return check" since
> 
>   similar patch got merged recently (38e0b7904eca7cd32f8953c3)
> 
> 
> 
> = v4 cover letter below =
> 
> 
> 
> It is merely the same as v3 content-wise, but there're a few things to mention
> 
> besides the rebase itself:
> 
> 
> 
>   - I picked up two patches from Eric Farman for the linux-header updates 
> (from
> 
> Eric's v3 series) for convenience just in case any of the series would got
> 
> queued by any maintainer.
> 
> 
> 
>   - One more patch is added as "KVM: Disable manual dirty log when dirty ring
> 
> enabled".  I found this when testing the branch after rebasing to latest
> 
> qemu, that not only the manual dirty log capability is not needed for kvm
> 
> dirty ring, but more importantly INITIALLY_ALL_SET is totally against kvm
> 
> dirty ring and it could silently crash the guest after migration.  For 
> this
> 
> new commit, I touched up "KVM: Add dirty-gfn-count property" a bit.
> 
> 
> 
>   - A few more documentation lines in qemu-options.hx.
> 
> 
> 
>   - I removed the RFC tag after kernel series got merged.
> 
> 
> 
> Again, this is only the 1st step to support dirty ring.  Ideally dirty ring
> 
> should grant QEMU the possibility to remove the whole layered dirty bitmap so
> 
> that dirty ring will work similarly as auto-converge enabled but should 
> better;
> 
> we will just throttle vcpus with the dirty ring kvm exit rather than 
> explicitly
> 
> adding a timer to stop the vcpu thread from entering the guest again (like 
> what
> 
> we did with current migration auto-converge).  Some more information could 
> also
> 
> be found in the kvm forum 2020 talk regarding kvm dirty ring (slides 21/22 
> [1]).
I have read this pdf and the code, and I have some questions; I hope you can
help me. :)

You emphasize that the dirty ring is a "thread-local buffer" while the dirty
bitmap is global, but I don't see any optimization about "locking" compared to
the dirty bitmap.

Thread-local means that a vCPU can flush the hardware buffer into the dirty
ring without locking, but for the bitmap, a vCPU can also use an atomic set to
mark pages dirty without locking. Maybe I'm missing something?

The second question is that you observed a longer migration time (55s->73s)
when the guest has 24G of RAM and the dirty rate is 800M/s. I am not clear
about the reason. With the dirty ring enabled, QEMU can get dirty info faster,
which means it handles dirty pages more quickly, and the guest can be
throttled, which means dirty pages are generated more slowly. What's the
rationale for the longer migration time?

PS: As the dirty ring is still converted into the dirty_bitmap of the kvm_slot,
"get dirty info faster" may not be true. :-(

Thanks,
Keqian

> 
> 
> 
> That next step (to remove all the dirty bitmaps, as mentioned above) is still
> 
> discussable: firstly I don't know whether there's anything I've overlooked in
> 
> there.  Meanwhile that's also only services huge VM cases, may not be 
> extremely
> 
> helpful with a lot major scenarios where VMs are not that huge.
> 
> 
> 
> There's probably other ways to fix huge VM migration issues, majorly focusing
> 
> on responsiveness and convergence.  For example, Google has proposed some new
> 
> userfaultfd kernel capability called "minor modes" [2] to track page minor
> 
> faults and that could be finally served for that purpose too using postcopy.
> 
> That's another long story so I'll stop here, but just as a marker along with
> 
> the dirty ring series so there'll still be a record to reference.
> 
> 
> 
> Said that, I still think this series is very worth merging even if we don't
> 
> persue the next steps yet, since dirty ring is disabled by default, and we can
> 
> always work upon this series.
> 
> 
> 
> Please review, thanks.
> 
> 
> 
> V3: 
> https://lore.kernel.org/qemu-devel/20200523232035.1029349-1-pet...@redhat.com/
> 
> (V3 contains all the pre-v3 changelog)
> 
> 
> 
> QEMU branch for testing (requires kernel version 5.11-rc1+):
> 
> https://github.com/xzpeter/qemu/tree/kvm-dirty-ring
> 
> 
> 
> [1] 
> https://static.sched.com/hosted_files/kvmforum2020/97/kvm_dirty_ring_peter.pdf
> 
> [2] 
> https://lore.kernel.org/lkml/20210107190453.3051110-1-axelrasmus...@google.com/
> 
> 
> 
> ---8<-
> 
> 
> 
> Overview
> 
> 
> 
> 
> 
> KVM dirty ring is a new interface to pass over dirty bits from kernel
> 
> to the userspace.  Instead of using a bitmap for each memory region,
> 
> the dirty ring contains an array of dirtied GPAs to fetch, one ring
> 
> per vcpu.
> 
> 
> 
> There're a few major changes comparing to how the old dirty logging
> 
> interface would work:
> 
> 
> 
> - Granularity of dirty bits
> 
> 
> 

Re: [PATCH v5 10/10] KVM: Dirty ring support

2021-03-22 Thread Keqian Zhu



On 2021/3/11 4:33, Peter Xu wrote:
> KVM dirty ring is a new interface to pass over dirty bits from kernel to the
> userspace.  Instead of using a bitmap for each memory region, the dirty ring
> contains an array of dirtied GPAs to fetch (in the form of offset in slots).
> For each vcpu there will be one dirty ring that binds to it.
> 
> kvm_dirty_ring_reap() is the major function to collect dirty rings.  It can be
> called either by a standalone reaper thread that runs in the background,
> collecting dirty pages for the whole VM.  It can also be called directly by 
> any
> thread that has BQL taken.
> 
> Signed-off-by: Peter Xu 
> ---
>  accel/kvm/kvm-all.c| 332 -
>  accel/kvm/trace-events |   7 +
>  include/hw/core/cpu.h  |   8 +
>  3 files changed, 344 insertions(+), 3 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index ae9393266b2..bf2b21f038b 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -15,6 +15,7 @@
>  
>  #include "qemu/osdep.h"
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -80,6 +81,25 @@ struct KVMParkedVcpu {
>  QLIST_ENTRY(KVMParkedVcpu) node;
>  };
>  
> +enum KVMDirtyRingReaperState {
> +KVM_DIRTY_RING_REAPER_NONE = 0,
> +/* The reaper is sleeping */
> +KVM_DIRTY_RING_REAPER_WAIT,
> +/* The reaper is reaping for dirty pages */
> +KVM_DIRTY_RING_REAPER_REAPING,
> +};
> +
> +/*
> + * KVM reaper instance, responsible for collecting the KVM dirty bits
> + * via the dirty ring.
> + */
> +struct KVMDirtyRingReaper {
> +/* The reaper thread */
> +QemuThread reaper_thr;
> +volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
> +volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
> +};
> +
>  struct KVMState
>  {
>  AccelState parent_obj;
> @@ -131,6 +151,7 @@ struct KVMState
>  bool kvm_dirty_ring_enabled;/* Whether KVM dirty ring is enabled */
>  uint64_t kvm_dirty_ring_size;   /* Size of the per-vcpu dirty ring */
>  uint32_t kvm_dirty_gfn_count;   /* Number of dirty GFNs per ring */
> +struct KVMDirtyRingReaper reaper;
>  };
>  
>  KVMState *kvm_state;
> @@ -392,6 +413,13 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
>  goto err;
>  }
>  
> +if (cpu->kvm_dirty_gfns) {
> +ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
> +if (ret < 0) {
> +goto err;
> +}
> +}
> +
>  vcpu = g_malloc0(sizeof(*vcpu));
>  vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
>  vcpu->kvm_fd = cpu->kvm_fd;
> @@ -468,6 +496,19 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>  (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
>  }
>  
> +if (s->kvm_dirty_ring_enabled) {
> +/* Use MAP_SHARED to share pages with the kernel */
> +cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
> +   PROT_READ | PROT_WRITE, MAP_SHARED,
> +   cpu->kvm_fd,
> +   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
> +if (cpu->kvm_dirty_gfns == MAP_FAILED) {
> +ret = -errno;
> +DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
> +goto err;
> +}
> +}
> +
>  ret = kvm_arch_init_vcpu(cpu);
>  if (ret < 0) {
>  error_setg_errno(errp, -ret,
> @@ -586,6 +627,11 @@ static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
>  cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
>  }
>  
> +static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
> +{
> +memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
> +}
> +
>  #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
>  
>  /* Allocate the dirty bitmap for a slot  */
> @@ -642,6 +688,170 @@ static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot 
> *slot)
>  return ret == 0;
>  }
>  
> +/* Should be with all slots_lock held for the address spaces. */
> +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
> + uint32_t slot_id, uint64_t offset)
> +{
> +KVMMemoryListener *kml;
> +KVMSlot *mem;
> +
> +if (as_id >= s->nr_as) {
> +return;
> +}
> +
> +kml = s->as[as_id].ml;
> +mem = &kml->slots[slot_id];
> +
> +if (!mem->memory_size || offset >= (mem->memory_size / 
> TARGET_PAGE_SIZE)) {
It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.


> +return;
> +}
> +
> +set_bit(offset, mem->dirty_bmap);
> +}
> +
> +static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
> +{
> +return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
> +}
> +
> +static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
> +{
> +gfn->flags = KVM_DIRTY_GFN_F_RESET;
> +}
> +
> +/*
> + * Should be with all slots_lock held for the address spaces.  It returns the
> + * dirty page we've collected on this dirty ring.
> + */
> 

Re: [PATCH v5 02/10] KVM: Use a big lock to replace per-kml slots_lock

2021-03-22 Thread Keqian Zhu
Hi Peter,

On 2021/3/11 4:32, Peter Xu wrote:
> Per-kml slots_lock will bring some trouble if we want to take all slots_lock 
> of
> all the KMLs, especially when we're in a context that we could have taken some
> of the KML slots_lock, then we even need to figure out what we've taken and
> what we need to take.
> 
> Make this simple by merging all KML slots_lock into a single slots lock.
> 
> Per-kml slots_lock isn't anything that helpful anyway - so far only x86 has 
> two
> address spaces (so, two slots_locks).  All the rest archs will be having one
> address space always, which means there's actually one slots_lock so it will 
> be
> the same as before.
> 
> Signed-off-by: Peter Xu 
> ---
>  accel/kvm/kvm-all.c  | 32 +---
>  include/sysemu/kvm_int.h |  2 --
>  2 files changed, 17 insertions(+), 17 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f88a52393fe..94e881f123b 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -174,8 +174,10 @@ typedef struct KVMResampleFd KVMResampleFd;
>  static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
>  QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
>  
> -#define kvm_slots_lock(kml)  qemu_mutex_lock(&(kml)->slots_lock)
> -#define kvm_slots_unlock(kml)  qemu_mutex_unlock(&(kml)->slots_lock)
> +static QemuMutex kml_slots_lock;
> +
> +#define kvm_slots_lock()  qemu_mutex_lock(&kml_slots_lock)
> +#define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)
nit: the qemu_mutex_lock and qemu_mutex_unlock calls are not aligned.


>  
>  static inline void kvm_resample_fd_remove(int gsi)
>  {
> @@ -241,9 +243,9 @@ bool kvm_has_free_slot(MachineState *ms)
>  bool result;
>  KVMMemoryListener *kml = &s->memory_listener;
>  
> -kvm_slots_lock(kml);
> +kvm_slots_lock();
>  result = !!kvm_get_free_slot(kml);
> -kvm_slots_unlock(kml);
> +kvm_slots_unlock();
>  
>  return result;
>  }
> @@ -309,7 +311,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void 
> *ram,
>  KVMMemoryListener *kml = &s->memory_listener;
>  int i, ret = 0;
>  
> -kvm_slots_lock(kml);
> +kvm_slots_lock();
>  for (i = 0; i < s->nr_slots; i++) {
>  KVMSlot *mem = &kml->slots[i];
>  
> @@ -319,7 +321,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void 
> *ram,
>  break;
>  }
>  }
> -kvm_slots_unlock(kml);
> +kvm_slots_unlock();
>  
>  return ret;
>  }
> @@ -515,7 +517,7 @@ static int kvm_section_update_flags(KVMMemoryListener 
> *kml,
>  return 0;
>  }
>  
> -kvm_slots_lock(kml);
> +kvm_slots_lock();
>  
>  while (size && !ret) {
>  slot_size = MIN(kvm_max_slot_size, size);
> @@ -531,7 +533,7 @@ static int kvm_section_update_flags(KVMMemoryListener 
> *kml,
>  }
>  
>  out:
> -kvm_slots_unlock(kml);
> +kvm_slots_unlock();
>  return ret;
>  }
>  
> @@ -819,7 +821,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
>  return ret;
>  }
>  
> -kvm_slots_lock(kml);
> +kvm_slots_lock();
>  
>  for (i = 0; i < s->nr_slots; i++) {
>  mem = &kml->slots[i];
> @@ -845,7 +847,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
>  }
>  }
>  
> -kvm_slots_unlock(kml);
> +kvm_slots_unlock();
>  
>  return ret;
>  }
> @@ -1150,7 +1152,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>  ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
>(start_addr - section->offset_within_address_space);
>  
> -kvm_slots_lock(kml);
> +kvm_slots_lock();
>  
>  if (!add) {
>  do {
> @@ -1208,7 +1210,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>  } while (size);
>  
>  out:
> -kvm_slots_unlock(kml);
> +kvm_slots_unlock();
>  }
>  
>  static void kvm_region_add(MemoryListener *listener,
> @@ -1235,9 +1237,9 @@ static void kvm_log_sync(MemoryListener *listener,
>  KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, 
> listener);
>  int r;
>  
> -kvm_slots_lock(kml);
> +kvm_slots_lock();
>  r = kvm_physical_sync_dirty_bitmap(kml, section);
> -kvm_slots_unlock(kml);
> +kvm_slots_unlock();
>  if (r < 0) {
>  abort();
>  }
> @@ -1337,7 +1339,7 @@ void kvm_memory_listener_register(KVMState *s, 
> KVMMemoryListener *kml,
>  {
>  int i;
>  
> -qemu_mutex_init(&kml->slots_lock);
> +qemu_mutex_init(&kml_slots_lock);
As you said, x86 has two address spaces; is it a problem that we may initialize
kml_slots_lock multiple times?

Thanks,
Keqian

>  kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
>  kml->as_id = as_id;
>  
> diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
> index ccb8869f01b..1da30e18841 100644
> --- a/include/sysemu/kvm_int.h
> +++ b/include/sysemu/kvm_int.h
> @@ -27,8 +27,6 @@ typedef struct KVMSlot
>  
>  typedef struct KVMMemoryListener {
>  

Re: [PATCH v5 09/10] KVM: Disable manual dirty log when dirty ring enabled

2021-03-22 Thread Keqian Zhu
Hi Peter,

On 2021/3/11 4:33, Peter Xu wrote:
> KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is for KVM_CLEAR_DIRTY_LOG, which is only
> useful for KVM_GET_DIRTY_LOG.  Skip enabling it for kvm dirty ring.
> 
> More importantly, KVM_DIRTY_LOG_INITIALLY_SET will not wr-protect all the 
> pages
> initially, which is against how kvm dirty ring is used - there's no way for 
> kvm
> dirty ring to re-protect a page before it's notified as being written first
> with a GFN entry in the ring!  So when KVM_DIRTY_LOG_INITIALLY_SET is enabled
> with dirty ring, we'll see silent data loss after migration.
I feel a little regret that the dirty ring cannot work with
KVM_DIRTY_LOG_INITIALLY_SET ...
With KVM_DIRTY_LOG_INITIALLY_SET, we can speed up dirty log start. More
importantly, we can enable dirty logging gradually. For write-fault-based dirty
logging, it greatly reduces the side effect of dirty logging on the guest.

I hope we can put forward a similar optimization under dirty ring mode. :)

Thanks,
Keqian

> 
> Signed-off-by: Peter Xu 
> ---
>  accel/kvm/kvm-all.c | 37 +++--
>  1 file changed, 23 insertions(+), 14 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 10137b6af11..ae9393266b2 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -2173,20 +2173,29 @@ static int kvm_init(MachineState *ms)
>  }
>  }
>  
> -dirty_log_manual_caps =
> -kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
> -dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
> -  KVM_DIRTY_LOG_INITIALLY_SET);
> -s->manual_dirty_log_protect = dirty_log_manual_caps;
> -if (dirty_log_manual_caps) {
> -ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
> -   dirty_log_manual_caps);
> -if (ret) {
> -warn_report("Trying to enable capability %"PRIu64" of "
> -"KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
> -"Falling back to the legacy mode. ",
> -dirty_log_manual_caps);
> -s->manual_dirty_log_protect = 0;
> +/*
> + * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
> + * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
> + * page is wr-protected initially, which is against how kvm dirty ring is
> + * usage - kvm dirty ring requires all pages are wr-protected at the very
> + * beginning.  Enabling this feature for dirty ring causes data 
> corruption.
> + */
> +if (!s->kvm_dirty_ring_enabled) {
> +dirty_log_manual_caps =
> +kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
> +dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
> +  KVM_DIRTY_LOG_INITIALLY_SET);
> +s->manual_dirty_log_protect = dirty_log_manual_caps;
> +if (dirty_log_manual_caps) {
> +ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
> +dirty_log_manual_caps);
> +if (ret) {
> +warn_report("Trying to enable capability %"PRIu64" of "
> +"KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
> +"Falling back to the legacy mode. ",
> +dirty_log_manual_caps);
> +s->manual_dirty_log_protect = 0;
> +}
>  }
>  }
>  
> 



Re: [PATCH 0/3] hw: Constify VMStateDescription

2021-03-15 Thread Keqian Zhu
Hi Philippe,

It seems that vmstate_ecc_state and vmstate_x86_cpu can also be constified.
Found by .

Thanks,
Keqian

On 2021/3/14 1:11, Philippe Mathieu-Daudé wrote:
> VMStateDescription isn't supposed to be modified.
> 
> 
> 
> Philippe Mathieu-Daudé (3):
> 
>   hw/arm: Constify VMStateDescription
> 
>   hw/display/qxl: Constify VMStateDescription
> 
>   hw/usb: Constify VMStateDescription
> 
> 
> 
>  hw/arm/highbank.c | 2 +-
> 
>  hw/arm/pxa2xx_pic.c   | 2 +-
> 
>  hw/arm/spitz.c| 4 ++--
> 
>  hw/arm/strongarm.c| 2 +-
> 
>  hw/arm/z2.c   | 4 ++--
> 
>  hw/display/qxl.c  | 8 
> 
>  hw/dma/pxa2xx_dma.c   | 4 ++--
> 
>  hw/misc/mst_fpga.c| 2 +-
> 
>  hw/usb/ccid-card-passthru.c   | 2 +-
> 
>  hw/usb/dev-smartcard-reader.c | 8 
> 
>  10 files changed, 19 insertions(+), 19 deletions(-)
> 
> 
> 



Re: [PATCH V3 1/8] hw/block/nvme: support namespace detach

2021-03-14 Thread Keqian Zhu
Hi,

I didn't dig into the code logic, just some nits below.

On 2021/3/1 0:10, Minwoo Im wrote:
> Given that now we have nvme-subsys device supported, we can manage
> namespace allocated, but not attached: detached.  This patch introduced
s/introduced/introduces

> a parameter for nvme-ns device named 'detached'.  This parameter
> indicates whether the given namespace device is detached from
> a entire NVMe subsystem('subsys' given case, shared namespace) or a
> controller('bus' given case, private namespace).
> 
> - Allocated namespace
> 
>   1) Shared ns in the subsystem 'subsys0':
> 
>  -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,subsys=subsys0,detached=true
> 
>   2) Private ns for the controller 'nvme0' of the subsystem 'subsys0':
> 
>  -device nvme-subsys,id=subsys0
>  -device nvme,serial=foo,id=nvme0,subsys=subsys0
>  -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true
> 
>   3) (Invalid case) Controller 'nvme0' has no subsystem to manage ns:
> 
>  -device nvme,serial=foo,id=nvme0
>  -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true
> 
> Signed-off-by: Minwoo Im 
> Tested-by: Klaus Jensen 
> Reviewed-by: Klaus Jensen 
> ---
>  hw/block/nvme-ns.c |  1 +
>  hw/block/nvme-ns.h |  1 +
>  hw/block/nvme-subsys.h |  1 +
>  hw/block/nvme.c| 41 +++--
>  hw/block/nvme.h| 22 ++
>  5 files changed, 64 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index 0e8760020483..eda6a0c003a4 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -399,6 +399,7 @@ static Property nvme_ns_props[] = {
>  DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
>  DEFINE_PROP_LINK("subsys", NvmeNamespace, subsys, TYPE_NVME_SUBSYS,
>   NvmeSubsystem *),
> +DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false),
>  DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
>  DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
>  DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 7af6884862b5..b0c00e115d81 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -26,6 +26,7 @@ typedef struct NvmeZone {
>  } NvmeZone;
>  
>  typedef struct NvmeNamespaceParams {
> +bool detached;
>  uint32_t nsid;
>  QemuUUID uuid;
>  
> diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
> index ccf6a71398d3..890d118117dc 100644
> --- a/hw/block/nvme-subsys.h
> +++ b/hw/block/nvme-subsys.h
> @@ -23,6 +23,7 @@ typedef struct NvmeSubsystem {
>  uint8_t subnqn[256];
>  
>  NvmeCtrl*ctrls[NVME_SUBSYS_MAX_CTRLS];
> +/* Allocated namespaces for this subsystem */
>  NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES];
>  } NvmeSubsystem;
>  
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index edd0b85c10ce..f6aeae081840 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -23,7 +23,7 @@
>   *  max_ioqpairs=, \
>   *  aerl=, aer_max_queued=, \
>   *  mdts=,zoned.append_size_limit=, \
> - *  subsys= \
> + *  subsys=,detached=
>   *  -device nvme-ns,drive=,bus=,nsid=,\
>   *  zoned=, \
>   *  subsys=
> @@ -82,6 +82,13 @@
>   *   controllers in the subsystem. Otherwise, `bus` must be given to attach
>   *   this namespace to a specified single controller as a non-shared 
> namespace.
>   *
> + * - `detached`
> + *   Not to attach the namespace device to controllers in the NVMe subsystem
> + *   during boot-up. If not given, namespaces are all attahced to all
s/attahced/attached

> + *   controllers in the subsystem by default.
> + *   It's mutual exclusive with 'bus' parameter. It's only valid in case
> + *   `subsys` is provided.
> + *
>   * Setting `zoned` to true selects Zoned Command Set at the namespace.
>   * In this case, the following namespace properties are available to 
> configure
>   * zoned operation:
> @@ -4613,6 +4620,20 @@ static void nvme_init_state(NvmeCtrl *n)
>  n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
>  }
>  
> +static int nvme_attach_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error 
> **errp)
> +{
> +if (nvme_ns_is_attached(n, ns)) {
> +error_setg(errp,
> +   "namespace %d is already attached to controller %d",
> +   nvme_nsid(ns), n->cntlid);
> +return -1;
> +}
> +
> +nvme_ns_attach(n, ns);
> +
> +return 0;
> +}
> +
>  int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
>  {
>  uint32_t nsid = nvme_nsid(ns);
> @@ -4644,7 +4665,23 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace 
> *ns, Error **errp)
>  
>  trace_pci_nvme_register_namespace(nsid);
>  
> -n->namespaces[nsid - 1] = ns;
> +/*
> + * If subsys is not given, namespae is 

Re: [PATCH] vfio: Support host translation granule size

2021-03-10 Thread Keqian Zhu
Hi Alex,

On 2021/3/11 4:24, Alex Williamson wrote:
> On Wed, 10 Mar 2021 15:19:33 +0800
> Kunkun Jiang  wrote:
> 
>> Hi Alex,
>>
>> On 2021/3/10 7:17, Alex Williamson wrote:
>>> On Thu, 4 Mar 2021 21:34:46 +0800
>>> Kunkun Jiang  wrote:
>>>  
 The cpu_physical_memory_set_dirty_lebitmap() can quickly deal with
 the dirty pages of memory by bitmap-traveling, regardless of whether
 the bitmap is aligned correctly or not.

 cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
 host page size. So it'd better to set bitmap_pgsize to host page size
 to support more translation granule sizes.

 Fixes: 87ea529c502 (vfio: Get migration capability flags for container)
 Signed-off-by: Kunkun Jiang 
 ---
   hw/vfio/common.c | 44 ++--
   1 file changed, 22 insertions(+), 22 deletions(-)

 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index 6ff1daa763..69fb5083a4 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
 @@ -378,7 +378,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer 
 *container,
   {
   struct vfio_iommu_type1_dma_unmap *unmap;
   struct vfio_bitmap *bitmap;
 -uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
 +uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / 
 qemu_real_host_page_size;
   int ret;
   
   unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
 @@ -390,12 +390,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer 
 *container,
    bitmap = (struct vfio_bitmap *)&unmap->data;
   
   /*
 - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
 - * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
 - * TARGET_PAGE_SIZE.
 + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap 
 of
 + * qemu_real_host_page_size to mark those dirty. Hence set 
 bitmap_pgsize
 + * to qemu_real_host_page_size.  
>>>
>>> I don't see that this change is well supported by the code,
>>> cpu_physical_memory_set_dirty_lebitmap() seems to operate on  
>> Yes, cpu_physical_memory_set_dirty_lebitmap() is finally to operate on
>> TARGET_PAGE_SIZE. But actually it supports pages in bitmap of
>> qemu_real_host_page_size to mark those dirty. It uses
>> "hpratio = qemu_real_host_page_size / TARGET_PAGE_SIZE" to adapt to
>> different translation granule size(e.g. 4K 2M 1G).
> 
> Thanks for the explanation, it becomes more clear but I'm still a
> little confused.  It appears that
> cpu_physical_memory_set_dirty_lebitmap() requires a bitmap in terms of
> qemu_real_host_page_size which is translated to target pages using
> hpratio.  It's not clear to me by the explanation here and in the
> commit log that we're technically using the wrong page size reference
> for this function.
> 
>>> TARGET_PAGE_SIZE, and the next three patch chunks take a detour through
>>> memory listener code that seem unrelated to the change described in the
>>> commit log.  This claims to fix something, what is actually broken?
>>> Thanks,
>>>
>>> Alex  
>> This patch 87ea529c502 (vfio: Get migration capability flags for container)
>> is the start of the bug. The code of [1](marked below) restricts the host
>> page size must be TARGET_PAGE_SIZE(e.g. 4K) to set
>> container->dirty_pages_supported = true. It is inappropriate to limit the
>> page size to TARGET_PAGE_SIZE.
> 
> Right, the noted code requires that vfio supports the target page size,
> which for all practical purposes implies an hpratio = 1 currently.
> That unnecessarily restricts migration support to cases where target and
> host use the same page size, but this change allows migration in any
> case where vfio dirty bitmap supports the host page size, which is
> effectively any case where the device supports migration.  However, the
> hpratio calculation requires that qemu_real_host_page_size is >=
> TARGET_PAGE_SIZE, otherwise the value becomes zero and it appears we'd
> only ever dirty the target zero page.  Is this configuration restricted
> elsewhere, ex. 64K guest on 4K host?  Exchanging an unnecessary
> restriction for allowing a buggy configuration isn't a good trade-off.
> Thanks,
FYI, the restriction that qemu_real_host_page_size >= TARGET_PAGE_SIZE already
exists: in kvm_init() we have an assert:
assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);

As I understand it, TARGET_PAGE_SIZE is an architecture-specific value that is
not affected by the page size of the guest OS. For arm64 it is fixed at
12 bits (4K), while qemu_real_host_page_size depends on the host kernel
configuration; it can be 4K, 16K or 64K.

IIUC, the kernel vfio/iommu_type1 driver actually reports the supported
page_size of the dirty log to be the host page size, so if the host uses 16K
or 64K, QEMU will refuse to support vfio migration.
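
To make the relationship concrete, a small standalone sketch (illustrative
only, not QEMU code; the 4K/64K values below are assumed examples of the
arm64 case discussed here):

/*
 * One set bit in a bitmap whose granule is qemu_real_host_page_size fans
 * out to hpratio TARGET_PAGE_SIZE-sized dirty pages.  kvm_init() already
 * asserts TARGET_PAGE_SIZE <= qemu_real_host_page_size, so hpratio >= 1.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
    unsigned long target_page_size = 4096;      /* arm64 TARGET_PAGE_SIZE */
    unsigned long real_host_page_size = 65536;  /* host kernel built with 64K pages */

    assert(target_page_size <= real_host_page_size);
    unsigned long hpratio = real_host_page_size / target_page_size;

    /* one dirty host page marks hpratio target pages dirty */
    printf("hpratio = %lu: 1 host page -> %lu target pages\n",
           hpratio, hpratio);
    return 0;
}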

Thanks,
Keqian



> 
> Alex
> 

Re: [PATCH v2 2/2] accel: kvm: Add aligment assert for kvm_log_clear_one_slot

2021-03-09 Thread Keqian Zhu



On 2021/3/10 0:08, Peter Xu wrote:
> On Tue, Mar 09, 2021 at 02:57:53PM +, Dr. David Alan Gilbert wrote:
>> * Thomas Huth (th...@redhat.com) wrote:
>>> On 09/03/2021 15.05, Keqian Zhu wrote:
>>>>
>>>>
>>>> On 2021/3/9 21:48, Thomas Huth wrote:
>>>>> On 17/12/2020 02.49, Keqian Zhu wrote:
[...]

>>>
>>> #0  0x72c1584f in raise () at /lib64/libc.so.6
>>> #1  0x72bffc45 in abort () at /lib64/libc.so.6
>>> #2  0x72bffb19 in _nl_load_domain.cold.0 () at /lib64/libc.so.6
>>> #3  0x72c0de36 in .annobin_assert.c_end () at /lib64/libc.so.6
>>> #4  0x55ba25f3 in kvm_log_clear_one_slot
>>> (size=6910080, start=0, as_id=0, mem=0x56e1ee00)
>>> at ../../devel/qemu/accel/kvm/kvm-all.c:691
>>> #5  0x55ba25f3 in kvm_physical_log_clear
>>> (section=0x7fffd0b0, section=0x7fffd0b0, kml=0x56dbaac0)
>>> at ../../devel/qemu/accel/kvm/kvm-all.c:843
>>> #6  0x55ba25f3 in kvm_log_clear (listener=0x56dbaac0, 
>>> section=0x7fffd0b0)
>>> at ../../devel/qemu/accel/kvm/kvm-all.c:1253
>>> #7  0x55b023d8 in memory_region_clear_dirty_bitmap
>>> (mr=mr@entry=0x573394c0, start=start@entry=0, len=len@entry=6910080)
>>> at ../../devel/qemu/softmmu/memory.c:2132
>>> #8  0x55b313d9 in cpu_physical_memory_snapshot_and_clear_dirty
>>> (mr=mr@entry=0x573394c0, offset=offset@entry=0, 
>>> length=length@entry=6910080, client=client@entry=0) at 
>>> ../../devel/qemu/softmmu/physmem.c:1109
>>> #9  0x55b02483 in memory_region_snapshot_and_clear_dirty
>>> (mr=mr@entry=0x573394c0, addr=addr@entry=0, 
>>> size=size@entry=6910080, client=client@entry=0)
>>> at ../../devel/qemu/softmmu/memory.c:2146
>>
>> Could you please figure out which memory region this is?
>> WTH is that size? Is that really the problem that the size is just
>> crazy?
> 
> It seems vga_draw_graphic() could call 
> memory_region_snapshot_and_clear_dirty()
> with not-page-aligned size.  cpu_physical_memory_snapshot_and_clear_dirty()
> actually took care of most of it on alignment, however still the "length"
> parameter got passed in without alignment check or so.
> 
> Cc Gerd too.
> 
> I'm not sure how many use cases are there like this.. if there're a lot maybe
> we can indeed drop this assert patch, but instead in kvm_log_clear_one_slot()
> we should ALIGN_DOWN the size to smallest host page size. Say, if we need to
> clear dirty bit for range (0, 0x1020), we should only clean (0, 0x1000) since
> there can still be dirty data on range (0x1020, 0x2000).
Right, @start and @size should be properly aligned down by
kvm_log_clear_one_slot() itself. We shouldn't clear areas beyond what the
caller expected.

An assert here is not appropriate.
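
For illustration, a minimal standalone sketch of the align-down idea (not the
actual QEMU helper; the 4K page size and the 0x1020 range follow the example
above):

/*
 * When asked to clear dirty bits for (0, 0x1020) with a 0x1000 page size,
 * only (0, 0x1000) may be cleared, since (0x1000, 0x2000) can still hold
 * dirty data.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

int main(void)
{
    uint64_t start = 0x0, size = 0x1020;  /* caller-requested range */
    uint64_t psize = 0x1000;              /* assumed host page size */

    uint64_t clear_size = ALIGN_DOWN(size, psize);
    printf("requested [0x%" PRIx64 ", 0x%" PRIx64 "), safely clearable "
           "[0x%" PRIx64 ", 0x%" PRIx64 ")\n",
           start, start + size, start, start + clear_size);
    return 0;
}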

Thanks,
Keqian
> 
> Thanks,
> 
>>
>> Dave
>>
>>> #10 0x55babe99 in vga_draw_graphic (full_update=0, s=0x573394b0)
>>> at ../../devel/qemu/hw/display/vga.c:1661
>>> #11 0x55babe99 in vga_update_display (opaque=0x573394b0)
>>> at ../../devel/qemu/hw/display/vga.c:1784
>>> #12 0x55babe99 in vga_update_display (opaque=0x573394b0)
>>> at ../../devel/qemu/hw/display/vga.c:1757
>>> #13 0x558ddd32 in graphic_hw_update (con=0x56a11800)
>>> at ../../devel/qemu/ui/console.c:279
>>> #14 0x558dccd2 in dpy_refresh (s=0x56c17da0) at 
>>> ../../devel/qemu/ui/console.c:1742
>>> #15 0x558dccd2 in gui_update (opaque=opaque@entry=0x56c17da0)
>>> at ../../devel/qemu/ui/console.c:209
>>> #16 0x55dbd520 in timerlist_run_timers (timer_list=0x56937c50)
>>> at ../../devel/qemu/util/qemu-timer.c:574
>>> #17 0x55dbd520 in timerlist_run_timers (timer_list=0x56937c50)
>>> at ../../devel/qemu/util/qemu-timer.c:499
>>> #18 0x55dbd74a in qemu_clock_run_timers (type=)
>>> at ../../devel/qemu/util/qemu-timer.c:670
>>> #19 0x55dbd74a in qemu_clock_run_all_timers () at 
>>> ../../devel/qemu/util/qemu-timer.c:670
>>>
>>> Looks like something in the vga code calls this with size=6910080
>>> and thus triggers the alignment assertion?
>>>
>>>  Thomas
>> -- 
>> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
>>
> 



Re: [PATCH] vfio: Support host translation granule size

2021-03-09 Thread Keqian Zhu
Hi Alex,

On 2021/3/10 7:17, Alex Williamson wrote:
> On Thu, 4 Mar 2021 21:34:46 +0800
> Kunkun Jiang  wrote:
> 
>> The cpu_physical_memory_set_dirty_lebitmap() can quickly deal with
>> the dirty pages of memory by bitmap-traveling, regardless of whether
>> the bitmap is aligned correctly or not.
>>
>> cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
>> host page size. So it'd better to set bitmap_pgsize to host page size
>> to support more translation granule sizes.
>>
>> Fixes: 87ea529c502 (vfio: Get migration capability flags for container)
>> Signed-off-by: Kunkun Jiang 
>> ---
>>  hw/vfio/common.c | 44 ++--
>>  1 file changed, 22 insertions(+), 22 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 6ff1daa763..69fb5083a4 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -378,7 +378,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer 
>> *container,
>>  {
>>  struct vfio_iommu_type1_dma_unmap *unmap;
>>  struct vfio_bitmap *bitmap;
>> -uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
>> +uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
>>  int ret;
>>  
>>  unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
>> @@ -390,12 +390,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer 
>> *container,
>>  bitmap = (struct vfio_bitmap *)>data;
>>  
>>  /*
>> - * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
>> - * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
>> - * TARGET_PAGE_SIZE.
>> + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
>> + * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
>> + * to qemu_real_host_page_size.
> 
> 
> I don't see that this change is well supported by the code,
> cpu_physical_memory_set_dirty_lebitmap() seems to operate on
> TARGET_PAGE_SIZE, and the next three patch chunks take a detour through
> memory listener code that seem unrelated to the change described in the
> commit log.  This claims to fix something, what is actually broken?
Actually, I reported this bug long ago.
cpu_physical_memory_set_dirty_lebitmap() expects the granule of @bitmap to be
qemu_real_host_page_size, as it uses @hpratio to convert the bitmap.

Thanks,
Keqian

> Thanks,
> 
> Alex
> 
>>   */
>>  
>> -bitmap->pgsize = TARGET_PAGE_SIZE;
>> +bitmap->pgsize = qemu_real_host_page_size;
>>  bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
>> BITS_PER_BYTE;
>>  
>> @@ -674,16 +674,16 @@ static void vfio_listener_region_add(MemoryListener 
>> *listener,
>>  return;
>>  }
>>  
>> -if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) 
>> !=
>> - (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> +if (unlikely((section->offset_within_address_space & 
>> ~qemu_real_host_page_mask) !=
>> + (section->offset_within_region & 
>> ~qemu_real_host_page_mask))) {
>>  error_report("%s received unaligned region", __func__);
>>  return;
>>  }
>>  
>> -iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> +iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
>>  llend = int128_make64(section->offset_within_address_space);
>>  llend = int128_add(llend, section->size);
>> -llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>> +llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
>>  
>>  if (int128_ge(int128_make64(iova), llend)) {
>>  return;
>> @@ -892,8 +892,8 @@ static void vfio_listener_region_del(MemoryListener 
>> *listener,
>>  return;
>>  }
>>  
>> -if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) 
>> !=
>> - (section->offset_within_region & ~TARGET_PAGE_MASK))) {
>> +if (unlikely((section->offset_within_address_space & 
>> ~qemu_real_host_page_mask) !=
>> + (section->offset_within_region & 
>> ~qemu_real_host_page_mask))) {
>>  error_report("%s received unaligned region", __func__);
>>  return;
>>  }
>> @@ -921,10 +921,10 @@ static void vfio_listener_region_del(MemoryListener 
>> *listener,
>>   */
>>  }
>>  
>> -iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
>> +iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
>>  llend = int128_make64(section->offset_within_address_space);
>>  llend = int128_add(llend, section->size);
>> -llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
>> +llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
>>  
>>  if (int128_ge(int128_make64(iova), llend)) {
>>  return;
>> @@ -1004,13 +1004,13 @@ static int vfio_get_dirty_bitmap(VFIOContainer 
>> *container, uint64_t iova,
>>  

Re: [PATCH v2 2/2] accel: kvm: Add aligment assert for kvm_log_clear_one_slot

2021-03-09 Thread Keqian Zhu



On 2021/3/9 21:48, Thomas Huth wrote:
> On 17/12/2020 02.49, Keqian Zhu wrote:
>> The parameters start and size are transfered from QEMU memory
>> emulation layer. It can promise that they are TARGET_PAGE_SIZE
>> aligned. However, KVM needs they are qemu_real_page_size aligned.
>>
>> Though no caller breaks this aligned requirement currently, we'd
>> better add an explicit assert to avoid future breaking.
>>
>> Signed-off-by: Keqian Zhu 
>> ---
>>   accel/kvm/kvm-all.c | 7 +++
>>   1 file changed, 7 insertions(+)
>>
>> ---
>> v2
>>   - Address Andrew's commment (Use assert instead of return err).
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index f6b16a8df8..73b195cc41 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -692,6 +692,10 @@ out:
>>   #define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << 
>> KVM_CLEAR_LOG_SHIFT)
>>   #define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
>>   +/*
>> + * As the granule of kvm dirty log is qemu_real_host_page_size,
>> + * @start and @size are expected and restricted to align to it.
>> + */
>>   static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
>> uint64_t size)
>>   {
>> @@ -701,6 +705,9 @@ static int kvm_log_clear_one_slot(KVMSlot *mem, int 
>> as_id, uint64_t start,
>>   unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
>>   int ret;
>>   +/* Make sure start and size are qemu_real_host_page_size aligned */
>> +assert(QEMU_IS_ALIGNED(start | size, psize));
> 
> Sorry, but that was a bad idea: It triggers and kills my Centos 6 VM:
> 
> $ qemu-system-x86_64 -accel kvm -hda ~/virt/images/centos6.qcow2 -m 1G
> qemu-system-x86_64: ../../devel/qemu/accel/kvm/kvm-all.c:690: 
> kvm_log_clear_one_slot: Assertion `QEMU_IS_ALIGNED(start | size, psize)' 
> failed.
> Aborted (core dumped)
Hi Thomas,

I think this patch is OK; maybe it has exposed a latent bug?

Thanks,
Keqian

> 
> Can we please revert this patch?
> 
>  Thomas
> 
> .
> 



[PATCH v2] vfio/migrate: Move switch of dirty tracking into vfio_memory_listener

2021-03-08 Thread Keqian Zhu
For now the switch of vfio dirty page tracking is integrated into
@vfio_save_handler. The reason is that some PCI vendor drivers may
start to track dirty pages based on the _SAVING state of the device,
so if dirty tracking is started before setting the device state, vfio
will report full-dirty to QEMU.

However, the dirty bitmaps of all ramblocks are fully set when ram
saving is set up, so it does not matter whether the device is in
_SAVING state when vfio dirty tracking starts.

Moreover, this logic causes some problems [1]. The object of dirty
tracking is guest memory, but the object of @vfio_save_handler is
device state, which produces unnecessary coupling and conflicts:

1. Coupling: Their saving granules are different (per-VM vs per-device).
   vfio will enable dirty_page_tracking for each device, although
   once is enough.

2. Conflicts: ram_save_setup() traverses all memory_listeners
   to execute their log_start() and log_sync() hooks to get the
   first-round dirty bitmap, which is used by the bulk stage of
   ram saving. However, as vfio dirty tracking is not yet started,
   it can't get the dirty bitmap from vfio. We then give up the chance
   to handle vfio dirty pages at the bulk stage.

Moving the switch of vfio dirty_page_tracking into vfio_memory_listener
solves the above problems. Besides, do not require devices to be in
SAVING state for vfio_sync_dirty_bitmap().

[1] https://www.spinics.net/lists/kvm/msg229967.html

Reported-by: Zenghui Yu 
Signed-off-by: Keqian Zhu 
Suggested-by: Paolo Bonzini 
---

changelog:

v2:
 - use log_global_(start|stop) to switch vfio dirty tracking. (Paolo)
 - add the initial design idea.

---
 hw/vfio/common.c| 49 -
 hw/vfio/migration.c | 35 
 2 files changed, 40 insertions(+), 44 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6ff1daa763..a27aa9dda8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -311,7 +311,7 @@ bool vfio_mig_active(void)
 return true;
 }
 
-static bool vfio_devices_all_saving(VFIOContainer *container)
+static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
 {
 VFIOGroup *group;
 VFIODevice *vbasedev;
@@ -329,13 +329,8 @@ static bool vfio_devices_all_saving(VFIOContainer 
*container)
 return false;
 }
 
-if (migration->device_state & VFIO_DEVICE_STATE_SAVING) {
-if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
-&& (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
-return false;
-}
-continue;
-} else {
+if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
+&& (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
 return false;
 }
 }
@@ -987,6 +982,40 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 }
 }
 
+static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
+{
+int ret;
+struct vfio_iommu_type1_dirty_bitmap dirty = {
+.argsz = sizeof(dirty),
+};
+
+if (start) {
+dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
+} else {
+dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
+}
+
+ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, );
+if (ret) {
+error_report("Failed to set dirty tracking flag 0x%x errno: %d",
+ dirty.flags, errno);
+}
+}
+
+static void vfio_listener_log_global_start(MemoryListener *listener)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+vfio_set_dirty_page_tracking(container, true);
+}
+
+static void vfio_listener_log_global_stop(MemoryListener *listener)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+vfio_set_dirty_page_tracking(container, false);
+}
+
 static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
  uint64_t size, ram_addr_t ram_addr)
 {
@@ -1128,7 +1157,7 @@ static void vfio_listerner_log_sync(MemoryListener 
*listener,
 return;
 }
 
-if (vfio_devices_all_saving(container)) {
+if (vfio_devices_all_dirty_tracking(container)) {
 vfio_sync_dirty_bitmap(container, section);
 }
 }
@@ -1136,6 +1165,8 @@ static void vfio_listerner_log_sync(MemoryListener 
*listener,
 static const MemoryListener vfio_memory_listener = {
 .region_add = vfio_listener_region_add,
 .region_del = vfio_listener_region_del,
+.log_global_start = vfio_listener_log_global_start,
+.log_global_stop = vfio_listener_log_global_stop,
 .log_sync = vfio_listerner_log_sync,
 };
 
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 00daa50ed8..c0f646823a 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -395,40 +395,10 @@ static int 

[PING] accel: kvm: Some bugfixes for kvm dirty log

2021-03-02 Thread Keqian Zhu
Hi,

This series is still not queued. Could someone help pick it up? Thanks :)

Keqian

On 2020/12/17 9:49, Keqian Zhu wrote:
> Hi all,
> 
> This series fixes memory waste and adds alignment check for unmatched
> qemu_real_host_page_size and TARGET_PAGE_SIZE.
> 
> Thanks.
> 
> Keqian Zhu (2):
>   accel: kvm: Fix memory waste under mismatch page size
>   accel: kvm: Add aligment assert for kvm_log_clear_one_slot
> 
>  accel/kvm/kvm-all.c | 13 -
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 



Re: [PATCH] vfio/migrate: Move switch of dirty tracking into vfio_memory_listener

2021-02-28 Thread Keqian Zhu
Hi Kirti,

What's your opinion about this? Thanks.

Keqian

On 2021/1/30 14:30, Keqian Zhu wrote:
> Hi Kirti,
> 
> On 2021/1/28 5:03, Kirti Wankhede wrote:
>>
>>
>> On 1/11/2021 1:04 PM, Keqian Zhu wrote:
>>> For now the switch of vfio dirty page tracking is integrated into
>>> the vfio_save_handler, it causes some problems [1].
>>>
>>
>> Sorry, I missed [1] mail, somehow it didn't landed in my inbox.
>>
>>> The object of dirty tracking is guest memory, but the object of
>>> the vfio_save_handler is device state. This mixed logic produces
>>> unnecessary coupling and conflicts:
>>>
>>> 1. Coupling: Their saving granule is different (perVM vs perDevice).
>>> vfio will enable dirty_page_tracking for each devices, actually
>>> once is enough.
>>
>> That's correct, enabling dirty page tracking once is enough. But log_start 
>> and log_stop gets called on address space update transaction, region_add() 
>> or region_del(), at this point migration may not be active. We don't want to 
>> allocate bitmap memory in kernel for lifetime of VM, without knowing 
>> migration will be happen or not. vfio_iommu_type1 module should allocate 
>> bitmap memory only while migration is active.
>>
> Yeah, we can use global start/stop callbacks as suggested by Paolo, which 
> solves this problem.
> 
>> Paolo's suggestion here to use log_global_start and log_global_stop 
>> callbacks seems correct here. But at this point vfio device state is not yet 
>> changed to |_SAVING as you had identified it in [1]. May be we can start 
>> tracking bitmap in iommu_type1 module while device is not yet _SAVING, but 
>> getting dirty bitmap while device is yet not in _SAVING|_RUNNING state 
>> doesn't seem optimal solution.
>>
>> Pasting here your question from [1]
>>
>>> Before start dirty tracking, we will check and ensure that the device
>>>  is at _SAVING state and return error otherwise.  But the question is
>>>  that what is the rationale?  Why does the VFIO_IOMMU_DIRTY_PAGES
>>> ioctl have something to do with the device state?
>>
>> Lets walk through the types of devices we are supporting:
>> 1. mdev devices without IOMMU backed device
>> Vendor driver pins pages as and when required during runtime. We can say 
>> that vendor driver is smart which identifies the pages to pin. We are good 
>> here.
>>
>> 2. mdev device with IOMMU backed device
>> This is similar to vfio-pci, direct assigned device, where all pages are 
>> pinned at VM bootup. Vendor driver is not smart, so bitmap query will report 
>> all pages dirty always. If --auto-converge is not set, VM stucks infinitely 
>> in pre-copy phase. This is known to us.
>>
> little question here ;-) . Why auto-converge (slow down vCPU) helps to ease 
> the case of full dirty?
> 
>> 3. mdev device with IOMMU backed device with smart vendor driver
>> In this case as well all pages are pinned at VM bootup, but vendor 
>> driver is smart to identify the pages and pin them explicitly.
>> Pages can be pinned anytime, i.e. during normal VM runtime or on setting 
>> _SAVING flag (entering pre-copy phase) or while in iterative pre-copy phase. 
>> There is no restriction based on these phases for calling vfio_pin_pages(). 
>> Vendor driver can start pinning pages based on its device state when _SAVING 
>> flag is set. In that case, if dirty bitmap is queried before that then it 
>> will report all sysmem as dirty with an unnecessary copy of sysmem.
>> As an optimal solution, I think its better to query bitmap only after all 
>> vfio devices are in pre-copy phase, i.e. after _SAVING flag is set.
> OK, I get your idea. But Qemu assumes all pages are dirty initially, this 
> seems not a problem.
> Let's assume we have a device of type 3, and this device starts to pin pages 
> on setting _SAVING flag.
> 
> Before this patch, the work flow is:
> {
> ram_save_setup()
> memory_global_dirty_log_start():  start dirty tracking excludes vfio part.
> migration_bitmap_sync_precopy():  try to sync dirty log from kvm, vhost 
> etc, including vfio (as all device saving is not satisfied, fail to get log 
> from vfio). The result is that bitmap of ramblock is all dirty.
> 
> vfio_save_setup() of this device
> vfio_migration_set_state(): Add SAVING state to this device, and vfio 
> starts to log dirty page of this device.
> 
> first round (i.e. bulk stage) of ram saving: only handle dirty log which is 
> collected from the first call of migration_bitmap_sync_precopy().
> 
> iterative stage of ra

Re: [PATCH v2 2/2] accel: kvm: Add aligment assert for kvm_log_clear_one_slot

2021-02-01 Thread Keqian Zhu
Hi Philippe,

On 2021/2/1 23:14, Philippe Mathieu-Daudé wrote:
> Hi,
> 
> On 12/17/20 2:49 AM, Keqian Zhu wrote:
>> The parameters start and size are transfered from QEMU memory
>> emulation layer. It can promise that they are TARGET_PAGE_SIZE
>> aligned. However, KVM needs they are qemu_real_page_size aligned.
>>
>> Though no caller breaks this aligned requirement currently, we'd
>> better add an explicit assert to avoid future breaking.
>>
>> Signed-off-by: Keqian Zhu 
>> ---
>>  accel/kvm/kvm-all.c | 7 +++
>>  1 file changed, 7 insertions(+)
>>
>> ---
>> v2
>>  - Address Andrew's commment (Use assert instead of return err).
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index f6b16a8df8..73b195cc41 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -692,6 +692,10 @@ out:
>>  #define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << 
>> KVM_CLEAR_LOG_SHIFT)
>>  #define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
>>  
>> +/*
>> + * As the granule of kvm dirty log is qemu_real_host_page_size,
>> + * @start and @size are expected and restricted to align to it.
>> + */
>>  static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
>>uint64_t size)
>>  {
>> @@ -701,6 +705,9 @@ static int kvm_log_clear_one_slot(KVMSlot *mem, int 
>> as_id, uint64_t start,
>>  unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
>>  int ret;
>>  
>> +/* Make sure start and size are qemu_real_host_page_size aligned */
>> +assert(QEMU_IS_ALIGNED(start | size, psize));
> 
> Why not return an error instead of aborting the VM?
Yep, I returned an error in v1. As suggested by Peter Xu: "Returning -EINVAL is 
the same as abort() currently - it'll just abort() at
kvm_log_clear() instead."

> 
>>  /*
>>   * We need to extend either the start or the size or both to
>>   * satisfy the KVM interface requirement.  Firstly, do the start
>>
> 
> .
> 
Thanks for the review.

Keqian.



Re: [PATCH v2 0/2] accel: kvm: Some bugfixes for kvm dirty log

2021-02-01 Thread Keqian Zhu
Hi All,

Kindly Ping ;-) ...

On 2021/1/25 15:51, Keqian Zhu wrote:
> Hi Paolo,
> 
> Any suggestions on this patch series :-) ? Thanks,
> 
> Keqian
> 
> On 2020/12/17 9:49, Keqian Zhu wrote:
>> Hi all,
>>
>> This series fixes memory waste and adds alignment check for unmatched
>> qemu_real_host_page_size and TARGET_PAGE_SIZE.
>>
>> Thanks.
>>
>> Keqian Zhu (2):
>>   accel: kvm: Fix memory waste under mismatch page size
>>   accel: kvm: Add aligment assert for kvm_log_clear_one_slot
>>
>>  accel/kvm/kvm-all.c | 13 -
>>  1 file changed, 12 insertions(+), 1 deletion(-)
>>
> 
> .
> 



Re: [PATCH] vfio/migrate: Move switch of dirty tracking into vfio_memory_listener

2021-01-29 Thread Keqian Zhu
Hi Kirti,

On 2021/1/28 5:03, Kirti Wankhede wrote:
> 
> 
> On 1/11/2021 1:04 PM, Keqian Zhu wrote:
>> For now the switch of vfio dirty page tracking is integrated into
>> the vfio_save_handler, it causes some problems [1].
>>
> 
> Sorry, I missed [1] mail, somehow it didn't landed in my inbox.
> 
>> The object of dirty tracking is guest memory, but the object of
>> the vfio_save_handler is device state. This mixed logic produces
>> unnecessary coupling and conflicts:
>>
>> 1. Coupling: Their saving granule is different (perVM vs perDevice).
>> vfio will enable dirty_page_tracking for each devices, actually
>> once is enough.
> 
> That's correct, enabling dirty page tracking once is enough. But log_start 
> and log_stop gets called on address space update transaction, region_add() or 
> region_del(), at this point migration may not be active. We don't want to 
> allocate bitmap memory in kernel for lifetime of VM, without knowing 
> migration will be happen or not. vfio_iommu_type1 module should allocate 
> bitmap memory only while migration is active.
> 
Yeah, we can use global start/stop callbacks as suggested by Paolo, which 
solves this problem.

> Paolo's suggestion here to use log_global_start and log_global_stop callbacks 
> seems correct here. But at this point vfio device state is not yet changed to 
> |_SAVING as you had identified it in [1]. May be we can start tracking bitmap 
> in iommu_type1 module while device is not yet _SAVING, but getting dirty 
> bitmap while device is yet not in _SAVING|_RUNNING state doesn't seem optimal 
> solution.
> 
> Pasting here your question from [1]
> 
>> Before start dirty tracking, we will check and ensure that the device
>>  is at _SAVING state and return error otherwise.  But the question is
>>  that what is the rationale?  Why does the VFIO_IOMMU_DIRTY_PAGES
>> ioctl have something to do with the device state?
> 
> Lets walk through the types of devices we are supporting:
> 1. mdev devices without IOMMU backed device
> Vendor driver pins pages as and when required during runtime. We can say 
> that vendor driver is smart which identifies the pages to pin. We are good 
> here.
> 
> 2. mdev device with IOMMU backed device
> This is similar to vfio-pci, direct assigned device, where all pages are 
> pinned at VM bootup. Vendor driver is not smart, so bitmap query will report 
> all pages dirty always. If --auto-converge is not set, VM stucks infinitely 
> in pre-copy phase. This is known to us.
> 
A little question here ;-): why does auto-converge (slowing down vCPUs) help
to ease the full-dirty case?

> 3. mdev device with IOMMU backed device with smart vendor driver
> In this case as well all pages are pinned at VM bootup, but vendor driver 
> is smart to identify the pages and pin them explicitly.
> Pages can be pinned anytime, i.e. during normal VM runtime or on setting 
> _SAVING flag (entering pre-copy phase) or while in iterative pre-copy phase. 
> There is no restriction based on these phases for calling vfio_pin_pages(). 
> Vendor driver can start pinning pages based on its device state when _SAVING 
> flag is set. In that case, if dirty bitmap is queried before that then it 
> will report all sysmem as dirty with an unnecessary copy of sysmem.
> As an optimal solution, I think its better to query bitmap only after all 
> vfio devices are in pre-copy phase, i.e. after _SAVING flag is set.
OK, I get your idea. But QEMU assumes all pages are dirty initially, so this
does not seem to be a problem.
Let's assume we have a device of type 3, and this device starts to pin pages
on setting the _SAVING flag.

Before this patch, the workflow is:
{
ram_save_setup()
memory_global_dirty_log_start():  start dirty tracking, excluding the vfio part.
migration_bitmap_sync_precopy():  try to sync the dirty log from kvm, vhost
etc, including vfio (as not all devices are saving yet, we fail to get the log
from vfio). The result is that the ramblock bitmaps are all dirty.

vfio_save_setup() of this device
vfio_migration_set_state(): add the SAVING state to this device, and vfio
starts to log dirty pages of this device.

first round (i.e. bulk stage) of ram saving: only handle the dirty log
collected from the first call of migration_bitmap_sync_precopy().

iterative stage of ram saving: when the remaining dirty log is less than the
threshold, call migration_bitmap_sync_precopy() again. At this stage, all
devices are saving, so we succeed in getting the log from vfio.
}

With this patch, the work flow is:
{
ram_save_setup()
memory_global_dirty_log_start():  start dirty tracking, now including the vfio part.
migration_bitmap_sync_precopy():  try to sync dirty log from kvm, vhost 
etc, including vfio (as all device saving is not checked, success to get full 
dirty lo

Re: [PATCH] vfio/migrate: Move switch of dirty tracking into vfio_memory_listener

2021-01-29 Thread Keqian Zhu



On 2021/1/29 15:49, Paolo Bonzini wrote:
> On 28/01/21 21:02, Dr. David Alan Gilbert wrote:
>> * Paolo Bonzini (pbonz...@redhat.com) wrote:
>>> On 11/01/21 08:34, Keqian Zhu wrote:
>>>> +static void vfio_listener_log_start(MemoryListener *listener,
>>>> +MemoryRegionSection *section,
>>>> +int old, int new)
>>>> +{
>>>> +VFIOContainer *container = container_of(listener, VFIOContainer, 
>>>> listener);
>>>> +
>>>> +vfio_set_dirty_page_tracking(container, true);
>>>> +}
>>>
>>> This would enable dirty page tracking also just for having a framebuffer
>>> (DIRTY_MEMORY_VGA).  Technically it would be correct, but it would also be
>>> more heavyweight than expected.
>>
>> Wouldn't that only happen on emulated video devices?
> 
> Yes, but still it's not impossible to have both an emulated VGA and an 
> assigned GPU or vGPU.
> 
>>> In order to only cover live migration, you can use the log_global_start and
>>> log_global_stop callbacks instead.
>>>
>>> If you want to use log_start and log_stop, you need to add respectively
>>>
>>>  if (old != 0) {
>>>  return;
>>>  }
>>>
>>> and
>>>
>>>  if (new != 0) {
>>>  return;
>>>  }
>>
>> Why 0, wouldn't you be checking for DIRTY_LOG_MIGRATION somewhere?
> 
> Actually thinking more about it log_start/log_stop are just wrong, because 
> they would be called many times, for each MemoryRegionSection.

Agreed. These would be called for each MemoryRegionSection and each time the
dirty_log_mask changes.

KVM uses log_start/log_stop because it can start dirty tracking for every
memslot individually, but vfio only has global start/stop semantics.

Anyway, using the global start/stop callbacks is the correct choice.
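
For illustration, a toy sketch (made-up names, not QEMU code) of the
difference in granularity:

/*
 * A per-section hook fires once per MemoryRegionSection whenever the dirty
 * log mask changes, while a global hook fires exactly once per tracking
 * start -- which matches the single VFIO_IOMMU_DIRTY_PAGES ioctl we want.
 */
#include <stdio.h>

struct toy_listener {
    void (*log_start)(int section_idx);  /* per-section hook */
    void (*log_global_start)(void);      /* global hook */
};

static void per_section_start(int section_idx)
{
    printf("log_start for section %d\n", section_idx);
}

static void global_start(void)
{
    printf("log_global_start (once)\n");
}

int main(void)
{
    struct toy_listener l = { per_section_start, global_start };
    int nr_sections = 3;                 /* assumed number of sections */

    l.log_global_start();                /* one call, one ioctl */
    for (int i = 0; i < nr_sections; i++) {
        l.log_start(i);                  /* would mean one ioctl per section */
    }
    return 0;
}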

Thanks,
Keqian

> 
> Paolo
> 
> .
> 



Re: [PATCH] vfio/migrate: Move switch of dirty tracking into vfio_memory_listener

2021-01-28 Thread Keqian Zhu
Hi Paolo and Kirti,

Many thanks for the replies. I am busy today and will reply to you tomorrow. Thanks.

Keqian.

On 2021/1/28 5:03, Kirti Wankhede wrote:
> 
> 
> On 1/11/2021 1:04 PM, Keqian Zhu wrote:
>> For now the switch of vfio dirty page tracking is integrated into
>> the vfio_save_handler, it causes some problems [1].
>>
> 
> Sorry, I missed [1] mail, somehow it didn't landed in my inbox.
> 
>> The object of dirty tracking is guest memory, but the object of
>> the vfio_save_handler is device state. This mixed logic produces
>> unnecessary coupling and conflicts:
>>
>> 1. Coupling: Their saving granule is different (perVM vs perDevice).
>> vfio will enable dirty_page_tracking for each devices, actually
>> once is enough.
> 
> That's correct, enabling dirty page tracking once is enough. But log_start 
> and log_stop gets called on address space update transaction, region_add() or 
> region_del(), at this point migration may not be active. We don't want to 
> allocate bitmap memory in kernel for lifetime of VM, without knowing 
> migration will be happen or not. vfio_iommu_type1 module should allocate 
> bitmap memory only while migration is active.
> 
> Paolo's suggestion here to use log_global_start and log_global_stop callbacks 
> seems correct here. But at this point vfio device state is not yet changed to 
> |_SAVING as you had identified it in [1]. May be we can start tracking bitmap 
> in iommu_type1 module while device is not yet _SAVING, but getting dirty 
> bitmap while device is yet not in _SAVING|_RUNNING state doesn't seem optimal 
> solution.
> 
> Pasting here your question from [1]
> 
>> Before start dirty tracking, we will check and ensure that the device
>>  is at _SAVING state and return error otherwise.  But the question is
>>  that what is the rationale?  Why does the VFIO_IOMMU_DIRTY_PAGES
>> ioctl have something to do with the device state?
> 
> Lets walk through the types of devices we are supporting:
> 1. mdev devices without IOMMU backed device
> Vendor driver pins pages as and when required during runtime. We can say 
> that vendor driver is smart which identifies the pages to pin. We are good 
> here.
> 
> 2. mdev device with IOMMU backed device
> This is similar to vfio-pci, direct assigned device, where all pages are 
> pinned at VM bootup. Vendor driver is not smart, so bitmap query will report 
> all pages dirty always. If --auto-converge is not set, VM stucks infinitely 
> in pre-copy phase. This is known to us.
> 
> 3. mdev device with IOMMU backed device with smart vendor driver
> In this case as well all pages are pinned at VM bootup, but vendor driver 
> is smart to identify the pages and pin them explicitly.
> Pages can be pinned anytime, i.e. during normal VM runtime or on setting 
> _SAVING flag (entering pre-copy phase) or while in iterative pre-copy phase. 
> There is no restriction based on these phases for calling vfio_pin_pages(). 
> Vendor driver can start pinning pages based on its device state when _SAVING 
> flag is set. In that case, if dirty bitmap is queried before that then it 
> will report all sysmem as dirty with an unnecessary copy of sysmem.
> As an optimal solution, I think its better to query bitmap only after all 
> vfio devices are in pre-copy phase, i.e. after _SAVING flag is set.
> 
>> 2. Conflicts: The ram_save_setup() traverses all memory_listeners
>> to execute their log_start() and log_sync() hooks to get the
>> first round dirty bitmap, which is used by the bulk stage of
>> ram saving. However, it can't get dirty bitmap from vfio, as
>> @savevm_ram_handlers is registered before @vfio_save_handler.
>>
> Right, but it can get dirty bitmap from vfio device in it's iterative callback
> ram_save_pending ->
> migration_bitmap_sync_precopy() .. ->
>  vfio_listerner_log_sync
> 
> Thanks,
> Kirti
> 
>> Move the switch of vfio dirty_page_tracking into vfio_memory_listener
>> can solve above problems. Besides, Do not require devices in SAVING
>> state for vfio_sync_dirty_bitmap().
>>
>> [1] https://www.spinics.net/lists/kvm/msg229967.html
>>
>> Reported-by: Zenghui Yu 
>> Signed-off-by: Keqian Zhu 
>> ---
>>   hw/vfio/common.c| 53 +
>>   hw/vfio/migration.c | 35 --
>>   2 files changed, 44 insertions(+), 44 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 6ff1daa763..9128cd7ee1 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -311,7 +311,7 @@ bool vfio_mig_active(void)
>>   return t

Re: [PATCH v2 0/2] accel: kvm: Some bugfixes for kvm dirty log

2021-01-24 Thread Keqian Zhu
Hi Paolo,

Any suggestions on this patch series :-) ? Thanks,

Keqian

On 2020/12/17 9:49, Keqian Zhu wrote:
> Hi all,
> 
> This series fixes memory waste and adds alignment check for unmatched
> qemu_real_host_page_size and TARGET_PAGE_SIZE.
> 
> Thanks.
> 
> Keqian Zhu (2):
>   accel: kvm: Fix memory waste under mismatch page size
>   accel: kvm: Add aligment assert for kvm_log_clear_one_slot
> 
>  accel/kvm/kvm-all.c | 13 -
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 



[PATCH] vfio/migrate: Move switch of dirty tracking into vfio_memory_listener

2021-01-10 Thread Keqian Zhu
For now the switch of vfio dirty page tracking is integrated into
the vfio_save_handler, which causes some problems [1].

The object of dirty tracking is guest memory, but the object of
the vfio_save_handler is device state. This mixed logic produces
unnecessary coupling and conflicts:

1. Coupling: Their saving granules are different (per-VM vs per-device).
   vfio will enable dirty_page_tracking for each device, although
   once is enough.
2. Conflicts: ram_save_setup() traverses all memory_listeners
   to execute their log_start() and log_sync() hooks to get the
   first-round dirty bitmap, which is used by the bulk stage of
   ram saving. However, it can't get the dirty bitmap from vfio, as
   @savevm_ram_handlers is registered before @vfio_save_handler.

Moving the switch of vfio dirty_page_tracking into vfio_memory_listener
solves the above problems. Besides, do not require devices to be in
SAVING state for vfio_sync_dirty_bitmap().

[1] https://www.spinics.net/lists/kvm/msg229967.html

Reported-by: Zenghui Yu 
Signed-off-by: Keqian Zhu 
---
 hw/vfio/common.c| 53 +
 hw/vfio/migration.c | 35 --
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6ff1daa763..9128cd7ee1 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -311,7 +311,7 @@ bool vfio_mig_active(void)
 return true;
 }
 
-static bool vfio_devices_all_saving(VFIOContainer *container)
+static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
 {
 VFIOGroup *group;
 VFIODevice *vbasedev;
@@ -329,13 +329,8 @@ static bool vfio_devices_all_saving(VFIOContainer 
*container)
 return false;
 }
 
-if (migration->device_state & VFIO_DEVICE_STATE_SAVING) {
-if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
-&& (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
-return false;
-}
-continue;
-} else {
+if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
+&& (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
 return false;
 }
 }
@@ -987,6 +982,44 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 }
 }
 
+static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
+{
+int ret;
+struct vfio_iommu_type1_dirty_bitmap dirty = {
+.argsz = sizeof(dirty),
+};
+
+if (start) {
+dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
+} else {
+dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
+}
+
+ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, );
+if (ret) {
+error_report("Failed to set dirty tracking flag 0x%x errno: %d",
+ dirty.flags, errno);
+}
+}
+
+static void vfio_listener_log_start(MemoryListener *listener,
+MemoryRegionSection *section,
+int old, int new)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+vfio_set_dirty_page_tracking(container, true);
+}
+
+static void vfio_listener_log_stop(MemoryListener *listener,
+   MemoryRegionSection *section,
+   int old, int new)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+
+vfio_set_dirty_page_tracking(container, false);
+}
+
 static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
  uint64_t size, ram_addr_t ram_addr)
 {
@@ -1128,7 +1161,7 @@ static void vfio_listerner_log_sync(MemoryListener 
*listener,
 return;
 }
 
-if (vfio_devices_all_saving(container)) {
+if (vfio_devices_all_dirty_tracking(container)) {
 vfio_sync_dirty_bitmap(container, section);
 }
 }
@@ -1136,6 +1169,8 @@ static void vfio_listerner_log_sync(MemoryListener 
*listener,
 static const MemoryListener vfio_memory_listener = {
 .region_add = vfio_listener_region_add,
 .region_del = vfio_listener_region_del,
+.log_start = vfio_listener_log_start,
+.log_stop = vfio_listener_log_stop,
 .log_sync = vfio_listerner_log_sync,
 };
 
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 00daa50ed8..c0f646823a 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -395,40 +395,10 @@ static int vfio_load_device_config_state(QEMUFile *f, 
void *opaque)
 return qemu_file_get_error(f);
 }
 
-static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start)
-{
-int ret;
-VFIOMigration *migration = vbasedev->migration;
-VFIOContainer *container = vbasedev->group->container;
-struct vfio_iommu_type1_dirty_bitmap di

[PATCH v2] Docs/RCU: Correct sample code of qatomic_rcu_set

2021-01-05 Thread Keqian Zhu
Correct sample code to avoid confusing readers.

Signed-off-by: Keqian Zhu 
Cc: qemu-triv...@nongnu.org
Reviewed-by: Paolo Bonzini 
Reviewed-by: Peter Xu 
---

v2:
 - Add Cc and R-b.

---
 docs/devel/rcu.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/devel/rcu.txt b/docs/devel/rcu.txt
index cdf002edd8..2e6cc607a1 100644
--- a/docs/devel/rcu.txt
+++ b/docs/devel/rcu.txt
@@ -392,7 +392,7 @@ Instead, we store the size of the array with the array 
itself:
 
 /* Removal phase.  */
 old_array = global_array;
-qatomic_rcu_set(_array->data, new_array);
+qatomic_rcu_set(_array, new_array);
 synchronize_rcu();
 
 /* Reclamation phase.  */
-- 
2.19.1




Re: [PATCH v2 0/2] accel: kvm: Some bugfixes for kvm dirty log

2021-01-05 Thread Keqian Zhu
Friendly ping ...

Hi, please queue this well-reviewed series. Thanks :-)

Keqian

On 2020/12/17 9:49, Keqian Zhu wrote:
> Hi all,
> 
> This series fixes memory waste and adds alignment check for unmatched
> qemu_real_host_page_size and TARGET_PAGE_SIZE.
> 
> Thanks.
> 
> Keqian Zhu (2):
>   accel: kvm: Fix memory waste under mismatch page size
>   accel: kvm: Add aligment assert for kvm_log_clear_one_slot
> 
>  accel/kvm/kvm-all.c | 13 -
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 



Re: [PATCH v2 1/2] ramlist: Make dirty bitmap blocks of ramlist resizable

2020-12-25 Thread Keqian Zhu


[...]

>>> -for (j = old_num_blocks; j < new_num_blocks; j++) {
>>> -new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
>>> +if (extend) {
>>> +for (j = cpy_num_blocks; j < new_num_blocks; j++) {
>>> +new_blocks->blocks[j] = 
>>> bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
>>> +}
>>> +} else {
>>> +for (j = cpy_num_blocks; j < old_num_blocks; j++) {
>>> +/* We are safe to free it, for that it is out-of-use */
>>> +g_free(old_blocks->blocks[j]);
>>
>> This looks unsafe because this code uses Read Copy Update (RCU):
>>
>>   old_blocks = qatomic_rcu_read(_list.dirty_memory[i]);
>>
>> Other threads may still be accessing old_blocks so we cannot modify it
>> immediately. Changes need to be deferred until the next RCU period.
>> g_free_rcu() needs to be used to do this.
>>
> Hi Stefan,
> 
> You are right. I was thinking about the VM life cycle before. We shrink the 
> dirty_memory
> when we are removing unused ramblock. However we can not rely on this.
> 
> I also notice that "Organization into blocks allows dirty memory to grow (but 
> not shrink)
> under RCU". Why "but not shrink"? Any thoughts?
Hi,

After my analysis, it is unsafe to either grow or shrink under RCU.

ram_list.blocks and ram_list.dirty_memory[X] are closely related and
both protected by RCU. For the lockless RCU readers, we cannot promise they
always see a consistent version of the two structures.

For grow, a reader may see the un-grown @dirty_memory and the grown @blocks,
causing an out-of-bounds access.
For shrink, a reader may see the shrunk @dirty_memory and the un-shrunk
@blocks, causing an out-of-bounds access too.

I think it's a design problem: RCU can only protect one structure, not two.
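
A toy standalone sketch of the hazard (plain pointers stand in for the
qatomic_rcu_read()/qatomic_rcu_set() pair; the sizes are made-up examples):

/*
 * Two independently published pointers: a reader that loads them at
 * different moments can pair the old version of one with the new version
 * of the other, so a block index valid for one array can overflow the other.
 */
#include <stddef.h>
#include <stdio.h>

struct blocks { size_t nr_blocks; };
struct dirty  { size_t nr_blocks; };

static struct blocks *g_blocks;
static struct dirty  *g_dirty;

int main(void)
{
    struct blocks old_b = { 4 }, new_b = { 8 };
    struct dirty  old_d = { 4 }, new_d = { 8 };

    g_blocks = &old_b;
    g_dirty  = &old_d;

    struct blocks *b = g_blocks;   /* reader: first snapshot (old) */

    g_blocks = &new_b;             /* writer: grow and publish both */
    g_dirty  = &new_d;

    struct dirty *d = g_dirty;     /* reader: second snapshot (new) */

    printf("blocks view: %zu, dirty_memory view: %zu -> %s\n",
           b->nr_blocks, d->nr_blocks,
           b->nr_blocks == d->nr_blocks ? "consistent" : "inconsistent");
    return 0;
}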

Thanks,
Keqian.
> 
> [...]
>  * Organization into blocks allows dirty memory to grow (but not shrink) under
>  * RCU.  When adding new RAMBlocks requires the dirty memory to grow, a new
>  * DirtyMemoryBlocks array is allocated with pointers to existing blocks kept
>  * the same.  Other threads can safely access existing blocks while dirty
>  * memory is being grown.  When no threads are using the old DirtyMemoryBlocks
>  * anymore it is freed by RCU (but the underlying blocks stay because they are
>  * pointed to from the new DirtyMemoryBlocks).
>  */
> #define DIRTY_MEMORY_BLOCK_SIZE ((ram_addr_t)256 * 1024 * 8)
> typedef struct {
> struct rcu_head rcu;
> unsigned long *blocks[];
> } DirtyMemoryBlocks;
> [...]
> 
> Thanks,
> Keqian
> 
> 
> .
> 



[PATCH] Docs/RCU: Correct sample code of qatomic_rcu_set

2020-12-21 Thread Keqian Zhu
Correct sample code to avoid confusing readers.

Signed-off-by: Keqian Zhu 
---
 docs/devel/rcu.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/devel/rcu.txt b/docs/devel/rcu.txt
index cdf002edd8..2e6cc607a1 100644
--- a/docs/devel/rcu.txt
+++ b/docs/devel/rcu.txt
@@ -392,7 +392,7 @@ Instead, we store the size of the array with the array 
itself:
 
 /* Removal phase.  */
 old_array = global_array;
-qatomic_rcu_set(_array->data, new_array);
+qatomic_rcu_set(_array, new_array);
 synchronize_rcu();
 
 /* Reclamation phase.  */
-- 
2.23.0




Re: [PATCH v2 1/2] ramlist: Make dirty bitmap blocks of ramlist resizable

2020-12-20 Thread Keqian Zhu



On 2020/12/17 18:05, Stefan Hajnoczi wrote:
> On Mon, Nov 30, 2020 at 09:11:03PM +0800, Keqian Zhu wrote:
>> @@ -1839,15 +1841,26 @@ static void dirty_memory_extend(ram_addr_t 
>> old_ram_size,
>>  new_blocks = g_malloc(sizeof(*new_blocks) +
>>sizeof(new_blocks->blocks[0]) * 
>> new_num_blocks);
>>  
>> -if (old_num_blocks) {
>> +if (cpy_num_blocks) {
>>  memcpy(new_blocks->blocks, old_blocks->blocks,
>> -   old_num_blocks * sizeof(old_blocks->blocks[0]));
>> +   cpy_num_blocks * sizeof(old_blocks->blocks[0]));
>>  }
>>  
>> -for (j = old_num_blocks; j < new_num_blocks; j++) {
>> -new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
>> +if (extend) {
>> +for (j = cpy_num_blocks; j < new_num_blocks; j++) {
>> +new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
>> +}
>> +} else {
>> +for (j = cpy_num_blocks; j < old_num_blocks; j++) {
>> +/* We are safe to free it, for that it is out-of-use */
>> +g_free(old_blocks->blocks[j]);
> 
> This looks unsafe because this code uses Read Copy Update (RCU):
> 
>   old_blocks = qatomic_rcu_read(_list.dirty_memory[i]);
> 
> Other threads may still be accessing old_blocks so we cannot modify it
> immediately. Changes need to be deferred until the next RCU period.
> g_free_rcu() needs to be used to do this.
> 
Hi Stefan,

You are right. I was thinking about the VM life cycle before: we shrink the
dirty_memory when we are removing an unused ramblock. However, we cannot rely
on this.

I also notice that "Organization into blocks allows dirty memory to grow (but
not shrink) under RCU". Why "but not shrink"? Any thoughts?

[...]
 * Organization into blocks allows dirty memory to grow (but not shrink) under
 * RCU.  When adding new RAMBlocks requires the dirty memory to grow, a new
 * DirtyMemoryBlocks array is allocated with pointers to existing blocks kept
 * the same.  Other threads can safely access existing blocks while dirty
 * memory is being grown.  When no threads are using the old DirtyMemoryBlocks
 * anymore it is freed by RCU (but the underlying blocks stay because they are
 * pointed to from the new DirtyMemoryBlocks).
 */
#define DIRTY_MEMORY_BLOCK_SIZE ((ram_addr_t)256 * 1024 * 8)
typedef struct {
struct rcu_head rcu;
unsigned long *blocks[];
} DirtyMemoryBlocks;
[...]

Thanks,
Keqian




Ping: [PATCH v2] bugfix: hostmem: Free host_nodes list right after visited

2020-12-18 Thread Keqian Zhu
Hi Peter,

Friendly ping :-) Are you going to queue this?

Thanks,
Keqian

On 2020/12/10 15:52, Keqian Zhu wrote:
> In host_memory_backend_get_host_nodes, we build host_nodes
> list and output it to v (a StringOutputVisitor) but forget
> to free the list. This fixes the memory leak.
> 
> The memory leak stack:
> 
>  Direct leak of 32 byte(s) in 2 object(s) allocated from:
> #0 0xfffda30b3393 in __interceptor_calloc 
> (/usr/lib64/libasan.so.4+0xd3393)
> #1 0xfffda1d28b9b in g_malloc0 (/usr/lib64/libglib-2.0.so.0+0x58b9b)
> #2 0xaaab05ca6e43 in host_memory_backend_get_host_nodes 
> backends/hostmem.c:94
> #3 0xaaab061ddf83 in object_property_get_uint16List qom/object.c:1478
> #4 0xaaab05866513 in query_memdev hw/core/machine-qmp-cmds.c:312
> #5 0xaaab061d980b in do_object_child_foreach qom/object.c:1001
> #6 0xaaab0586779b in qmp_query_memdev hw/core/machine-qmp-cmds.c:328
> #7 0xaaab0615ed3f in qmp_marshal_query_memdev 
> qapi/qapi-commands-machine.c:327
> #8 0xaaab0632d647 in do_qmp_dispatch qapi/qmp-dispatch.c:147
> #9 0xaaab0632d647 in qmp_dispatch qapi/qmp-dispatch.c:190
> #10 0xaaab0610f74b in monitor_qmp_dispatch monitor/qmp.c:120
> #11 0xaaab0611074b in monitor_qmp_bh_dispatcher monitor/qmp.c:209
> #12 0xaaab063caefb in aio_bh_poll util/async.c:117
> #13 0xaaab063d30fb in aio_dispatch util/aio-posix.c:459
> #14 0xaaab063cac8f in aio_ctx_dispatch util/async.c:268
> #15 0xfffda1d22a6b in g_main_context_dispatch 
> (/usr/lib64/libglib-2.0.so.0+0x52a6b)
> #16 0xaaab063d0e97 in glib_pollfds_poll util/main-loop.c:218
> #17 0xaaab063d0e97 in os_host_main_loop_wait util/main-loop.c:241
> #18 0xaaab063d0e97 in main_loop_wait util/main-loop.c:517
> #19 0xaaab05c8bfa7 in main_loop /root/rpmbuild/BUILD/qemu-4.1.0/vl.c:1791
> #20 0xaaab05713bc3 in main /root/rpmbuild/BUILD/qemu-4.1.0/vl.c:4473
> #21 0xfffda0a83ebf in __libc_start_main (/usr/lib64/libc.so.6+0x23ebf)
> #22 0xaaab0571ed5f  (aarch64-softmmu/qemu-system-aarch64+0x88ed5f)
>  SUMMARY: AddressSanitizer: 32 byte(s) leaked in 2 allocation(s).
> 
> Fixes: 4cf1b76bf1e2 (hostmem: add properties for NUMA memory policy)
> Reported-by: Euler Robot 
> Tested-by: Chen Qun 
> Reviewed-by: Igor Mammedov 
> Signed-off-by: Keqian Zhu 
> ---
>  backends/hostmem.c | 1 +
>  1 file changed, 1 insertion(+)
> ---
> 
> v2:
>  - Update commit message.
>  - Add Chen Qun's T.b.
>  - Add Igor's R.b.
> 
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index 4bde00e8e7..9f9ac95edd 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -105,6 +105,7 @@ host_memory_backend_get_host_nodes(Object *obj, Visitor 
> *v, const char *name,
>  
>  ret:
>  visit_type_uint16List(v, name, _nodes, errp);
> +qapi_free_uint16List(host_nodes);
>  }
>  
>  static void
> 



[PATCH v2 1/2] accel: kvm: Fix memory waste under mismatch page size

2020-12-16 Thread Keqian Zhu
When handling the dirty log, we face qemu_real_host_page_size and
TARGET_PAGE_SIZE. The first one is the granule of the KVM dirty
bitmap, and the second one is the granule of the QEMU dirty bitmap.

As qemu_real_host_page_size >= TARGET_PAGE_SIZE (kvm_init()
enforces it), misusing TARGET_PAGE_SIZE to size the kvmslot dirty_bmap
may waste memory. For example, when qemu_real_host_page_size is
64K and TARGET_PAGE_SIZE is 4K, it wastes 93.75% (15/16) of the memory.

Signed-off-by: Keqian Zhu 
Reviewed-by: Andrew Jones 
Reviewed-by: Peter Xu 
---
 accel/kvm/kvm-all.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

---

v2
 - Address Andrew's comment (qemu_real_host_page_size >= TARGET_PAGE_SIZE
   is a rule).
 - Add Andrew and Peter's R-b.

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 389eaace72..f6b16a8df8 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -620,8 +620,12 @@ static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
  * too, in most cases).
  * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
  * a hope that sizeof(long) won't become >8 any time soon.
+ *
+ * Note: the granule of kvm dirty log is qemu_real_host_page_size.
+ * And mem->memory_size is aligned to it (otherwise this mem can't
+ * be registered to KVM).
  */
-hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
+hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size,
 /*HOST_LONG_BITS*/ 64) / 8;
 mem->dirty_bmap = g_malloc0(bitmap_size);
 }
-- 
2.23.0
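
For the record, the arithmetic behind the 93.75% figure in the commit message
above, as a standalone sketch (the 1 GiB memslot size is an assumed example):

/*
 * Compares the old TARGET_PAGE_SIZE-based sizing with the new
 * qemu_real_host_page_size-based sizing of the per-memslot dirty bitmap.
 */
#include <stdio.h>

#define ALIGN_UP(n, m) (((n) + (m) - 1) / (m) * (m))

int main(void)
{
    unsigned long long memory_size  = 1ULL << 30;  /* 1 GiB memslot, example */
    unsigned long target_page_size  = 4096;
    unsigned long real_host_page_sz = 65536;

    /* old: sized by TARGET_PAGE_SIZE */
    unsigned long long old_bits  = memory_size / target_page_size;
    unsigned long long old_bytes = ALIGN_UP(old_bits, 64) / 8;

    /* new: sized by qemu_real_host_page_size (the KVM dirty-log granule) */
    unsigned long long new_bits  = memory_size / real_host_page_sz;
    unsigned long long new_bytes = ALIGN_UP(new_bits, 64) / 8;

    printf("old bitmap: %llu bytes, new bitmap: %llu bytes (%.2f%% saved)\n",
           old_bytes, new_bytes,
           100.0 * (old_bytes - new_bytes) / old_bytes);
    return 0;
}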




[PATCH v2 2/2] accel: kvm: Add aligment assert for kvm_log_clear_one_slot

2020-12-16 Thread Keqian Zhu
The parameters start and size are transferred from the QEMU memory
emulation layer, which can promise that they are TARGET_PAGE_SIZE
aligned. However, KVM needs them to be qemu_real_host_page_size aligned.

Though no caller breaks this alignment requirement currently, we'd
better add an explicit assert to avoid future breakage.

Signed-off-by: Keqian Zhu 
---
 accel/kvm/kvm-all.c | 7 +++
 1 file changed, 7 insertions(+)

---
v2
 - Address Andrew's commment (Use assert instead of return err).

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f6b16a8df8..73b195cc41 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -692,6 +692,10 @@ out:
 #define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT)
 #define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
 
+/*
+ * As the granule of kvm dirty log is qemu_real_host_page_size,
+ * @start and @size are expected and restricted to align to it.
+ */
 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
   uint64_t size)
 {
@@ -701,6 +705,9 @@ static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, 
uint64_t start,
 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
 int ret;
 
+/* Make sure start and size are qemu_real_host_page_size aligned */
+assert(QEMU_IS_ALIGNED(start | size, psize));
+
 /*
  * We need to extend either the start or the size or both to
  * satisfy the KVM interface requirement.  Firstly, do the start
-- 
2.23.0
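
A small aside on the assert form used above: for a power-of-two psize,
checking (start | size) covers both values at once. A standalone sketch
(the sample values are made up):

/*
 * For a power-of-two psize, the low bits of (start | size) are zero if and
 * only if the low bits of both start and size are zero.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define IS_ALIGNED(n, m) (((n) & ((m) - 1)) == 0)

int main(void)
{
    uint64_t psize = 0x10000;                  /* 64K host page, example */
    uint64_t start = 0x200000, size = 0x90000; /* arbitrary sample range */

    int combined = IS_ALIGNED(start | size, psize);
    int separate = IS_ALIGNED(start, psize) && IS_ALIGNED(size, psize);

    assert(combined == separate);
    printf("start|size aligned: %d\n", combined);
    return 0;
}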




[PATCH v2 0/2] accel: kvm: Some bugfixes for kvm dirty log

2020-12-16 Thread Keqian Zhu
Hi all,

This series fixes memory waste and adds alignment check for unmatched
qemu_real_host_page_size and TARGET_PAGE_SIZE.

Thanks.

Keqian Zhu (2):
  accel: kvm: Fix memory waste under mismatch page size
  accel: kvm: Add aligment assert for kvm_log_clear_one_slot

 accel/kvm/kvm-all.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

-- 
2.23.0




Re: [PATCH 1/2] accel: kvm: Fix memory waste under mismatch page size

2020-12-16 Thread Keqian Zhu



On 2020/12/17 4:48, Peter Xu wrote:
> On Wed, Dec 16, 2020 at 04:21:17PM +0800, Keqian Zhu wrote:
>> One more thing, we should consider whether @start and @size is psize aligned 
>> (my second
>> patch). Do you agree to add assert on them directly?
> 
> Yes I think the 2nd patch is okay, but please address Drew's comments.
> 
> Returning -EINVAL is the same as abort() currently - it'll just abort() at
> kvm_log_clear() instead.
OK, I will send v2 soon.

Thanks,
Keqian

> 
> Thanks,
> 



Re: [PATCH 1/2] accel: kvm: Fix memory waste under mismatch page size

2020-12-16 Thread Keqian Zhu
Hi Peter,

On 2020/12/16 1:57, Peter Xu wrote:
> On Tue, Dec 15, 2020 at 03:19:47PM +0800, Keqian Zhu wrote:
>> When handle dirty log, we face qemu_real_host_page_size and
>> TARGET_PAGE_SIZE. The first one is the granule of KVM dirty
>> bitmap, and the second one is the granule of QEMU dirty bitmap.
>>
>> Generally speaking, qemu_real_host_page_size >= TARGET_PAGE_SIZE,
>> so misuse TARGET_PAGE_SIZE to init kvmslot dirty_bmap may waste
>> memory. For example, when qemu_real_host_page_size is 64K and
>> TARGET_PAGE_SIZE is 4K, this bugfix can save 93.75% memory.
>>
>> Signed-off-by: Keqian Zhu 
>> ---
>>  accel/kvm/kvm-all.c | 6 +-
>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index baaa54249d..c5e06288eb 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -620,8 +620,12 @@ static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
>>   * too, in most cases).
>>   * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
>>   * a hope that sizeof(long) won't become >8 any time soon.
>> + *
>> + * Note: the granule of kvm dirty log is qemu_real_host_page_size.
>> + * And mem->memory_size is aligned to it (otherwise this mem can't
>> + * be registered to KVM).
>>   */
>> -hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
>> +hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size,
>>  /*HOST_LONG_BITS*/ 64) / 8;
> 
> Yes I think this is correct.  Thanks.
> 
> So one thing I failed to notice is cpu_physical_memory_set_dirty_lebitmap()
> will "amplify" the host dirty pages into guest ones; seems we're all good 
> then.
> 
> Reviewed-by: Peter Xu 
> 
OK Thanks :-)

One more thing, we should consider whether @start and @size are psize aligned
(my second patch). Do you agree to add asserts on them directly?


Thanks,
Keqian



Re: [PATCH 2/2] accel: kvm: Add alignment check for kvm_log_clear_one_slot

2020-12-16 Thread Keqian Zhu



On 2020/12/16 15:23, Andrew Jones wrote:
> On Tue, Dec 15, 2020 at 12:55:50PM +0100, Andrew Jones wrote:
>> On Tue, Dec 15, 2020 at 03:19:48PM +0800, Keqian Zhu wrote:
>>> The parameters start and size are transfered from QEMU memory
>>> emulation layer. It can promise that they are TARGET_PAGE_SIZE
>>> aligned. However, KVM needs they are qemu_real_page_size aligned.
>>>
>>> Though no caller breaks this aligned requirement currently, we'd
>>> better add an explicit check to avoid future breaking.
>>>
>>> Signed-off-by: Keqian Zhu 
>>> ---
>>>  accel/kvm/kvm-all.c | 5 +
>>>  1 file changed, 5 insertions(+)
>>>
>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>> index c5e06288eb..3d0e3aa872 100644
>>> --- a/accel/kvm/kvm-all.c
>>> +++ b/accel/kvm/kvm-all.c
>>> @@ -701,6 +701,11 @@ static int kvm_log_clear_one_slot(KVMSlot *mem, int 
>>> as_id, uint64_t start,
>>>  unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
>>>  int ret;
>>>  
>>> +/* Make sure start and size are psize aligned */
>>> +if (!QEMU_IS_ALIGNED(start, psize) || !QEMU_IS_ALIGNED(size, psize)) {
>>> +return -EINVAL;
>>> +}
>>> +
>>>  /*
>>>   * We need to extend either the start or the size or both to
>>>   * satisfy the KVM interface requirement.  Firstly, do the start
>>> -- 
>>> 2.23.0
>>>
>>>
>>
>> It's not clear to me that this function has any restrictions on start
>> and size. If it does, then please document those restrictions in the
>> function's header and assert rather than return.
>>
> 
> Also, I see this patch is on its way in
> 
> https://patchwork.ozlabs.org/project/qemu-devel/patch/20201215175445.1272776-27-pbonz...@redhat.com/

Hi drew,

Thanks for informing me. I see that they are not the same patch.
The patch above fixes the 64-pages alignment problem,
while this patch handles the host-page alignment problem.

Thanks,
Keqian
> 
> Thanks,
> drew 
> 
> .
> 



Re: [PATCH 2/2] accel: kvm: Add alignment check for kvm_log_clear_one_slot

2020-12-16 Thread Keqian Zhu



On 2020/12/15 19:55, Andrew Jones wrote:
> On Tue, Dec 15, 2020 at 03:19:48PM +0800, Keqian Zhu wrote:
>> The parameters start and size are transfered from QEMU memory
>> emulation layer. It can promise that they are TARGET_PAGE_SIZE
>> aligned. However, KVM needs they are qemu_real_page_size aligned.
>>
>> Though no caller breaks this aligned requirement currently, we'd
>> better add an explicit check to avoid future breaking.
>>
>> Signed-off-by: Keqian Zhu 
>> ---
>>  accel/kvm/kvm-all.c | 5 +
>>  1 file changed, 5 insertions(+)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index c5e06288eb..3d0e3aa872 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -701,6 +701,11 @@ static int kvm_log_clear_one_slot(KVMSlot *mem, int 
>> as_id, uint64_t start,
>>  unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
>>  int ret;
>>  
>> +/* Make sure start and size are psize aligned */
>> +if (!QEMU_IS_ALIGNED(start, psize) || !QEMU_IS_ALIGNED(size, psize)) {
>> +return -EINVAL;
>> +}
>> +
>>  /*
>>   * We need to extend either the start or the size or both to
>>   * satisfy the KVM interface requirement.  Firstly, do the start
>> -- 
>> 2.23.0
>>
>>
> 
> It's not clear to me that this function has any restrictions on start
> and size. If it does, then please document those restrictions in the
> function's header and assert rather than return.

Hi drew,

The following code implicitly expands the clear area when start_delta is
not psize aligned.

  start_delta /= psize;
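
For example, a quick illustrative sketch (not code from the patch; the 64K host
page and the 4K-aligned request are assumed numbers):

  uint64_t psize = 0x10000;        /* 64K host page */
  uint64_t start_delta = 0x1000;   /* start - mem->start_addr, only 4K aligned */

  start_delta /= psize;            /* integer division truncates to 0 */
  /* The clear request now effectively starts at the slot base, i.e. 60K
     earlier than the caller asked for, so extra pages get cleared. */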

Thanks,
Keqian


> 
> Thanks,
> drew
> 
> .
> 



Re: [PATCH 1/2] accel: kvm: Fix memory waste under mismatch page size

2020-12-16 Thread Keqian Zhu
Hi drew,

On 2020/12/15 19:20, Andrew Jones wrote:
> On Tue, Dec 15, 2020 at 03:19:47PM +0800, Keqian Zhu wrote:
>> When handle dirty log, we face qemu_real_host_page_size and
>> TARGET_PAGE_SIZE. The first one is the granule of KVM dirty
>> bitmap, and the second one is the granule of QEMU dirty bitmap.
>>
>> Generally speaking, qemu_real_host_page_size >= TARGET_PAGE_SIZE,
> 
> Not just generally speaking, but must be. We have
> 
> /*
>  * On systems where the kernel can support different base page
>  * sizes, host page size may be different from TARGET_PAGE_SIZE,
>  * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
>  * page size for the system though.
>  */
> assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);
Yes, I noticed it, but my wording ("Generally speaking") is not appropriate.
Thanks for pointing it out.

> 
> at the top of kvm_init() to enforce it.
> 
> The comment also says TARGET_PAGE_SIZE is assumed to be the minimum,
> which is more of a requirement than assumption. And, that requirement
> implies that we require all memory regions and base addresses to be
> aligned to the maximum possible target page size (in case the target
> actually uses something larger).
> 
> Please remove 'Generally speaking' from the commit message. It
> implies this change is based on an assumption rather than a rule.
> 
> (It'd be nice if we had these guest memory and TARGET_PAGE_SIZE
> requirements better documented and in one place.)
Maybe someone could do this :-)

> 
>> so misuse TARGET_PAGE_SIZE to init kvmslot dirty_bmap may waste
>> memory. For example, when qemu_real_host_page_size is 64K and
>> TARGET_PAGE_SIZE is 4K, this bugfix can save 93.75% memory.
>>
>> Signed-off-by: Keqian Zhu 
>> ---
>>  accel/kvm/kvm-all.c | 6 +-
>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index baaa54249d..c5e06288eb 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -620,8 +620,12 @@ static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
>>   * too, in most cases).
>>   * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
>>   * a hope that sizeof(long) won't become >8 any time soon.
>> + *
>> + * Note: the granule of kvm dirty log is qemu_real_host_page_size.
>> + * And mem->memory_size is aligned to it (otherwise this mem can't
>> + * be registered to KVM).
>>   */
>> -hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
>> +hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size,
>>  /*HOST_LONG_BITS*/ 64) / 8;
>>  mem->dirty_bmap = g_malloc0(bitmap_size);
>>  }
>> -- 
>> 2.23.0
>>
>>
> 
> Besides the commit message
> 
> Reviewed-by: Andrew Jones 
> 
OK, I will correct it and send v2 soon.

Cheers,
Keqian
> 
> Thanks,
> drew
> 
> .
> 



[PATCH 0/2] accel: kvm: Some bugfixes for kvm dirty log

2020-12-14 Thread Keqian Zhu
Keqian Zhu (2):
  accel: kvm: Fix memory waste under mismatch page size
  accel: kvm: Add alignment check for kvm_log_clear_one_slot

 accel/kvm/kvm-all.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

-- 
2.23.0




[PATCH 2/2] accel: kvm: Add alignment check for kvm_log_clear_one_slot

2020-12-14 Thread Keqian Zhu
The parameters start and size are transferred from the QEMU memory
emulation layer, which guarantees that they are TARGET_PAGE_SIZE
aligned. However, KVM requires them to be qemu_real_host_page_size aligned.

Though no caller breaks this alignment requirement currently, we'd
better add an explicit check to guard against future breakage.

Signed-off-by: Keqian Zhu 
---
 accel/kvm/kvm-all.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index c5e06288eb..3d0e3aa872 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -701,6 +701,11 @@ static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, 
uint64_t start,
 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
 int ret;
 
+/* Make sure start and size are psize aligned */
+if (!QEMU_IS_ALIGNED(start, psize) || !QEMU_IS_ALIGNED(size, psize)) {
+return -EINVAL;
+}
+
 /*
  * We need to extend either the start or the size or both to
  * satisfy the KVM interface requirement.  Firstly, do the start
-- 
2.23.0




[PATCH 1/2] accel: kvm: Fix memory waste under mismatch page size

2020-12-14 Thread Keqian Zhu
When handling the dirty log, we deal with both qemu_real_host_page_size
and TARGET_PAGE_SIZE. The first is the granule of the KVM dirty
bitmap, and the second is the granule of the QEMU dirty bitmap.

Generally speaking, qemu_real_host_page_size >= TARGET_PAGE_SIZE,
so misusing TARGET_PAGE_SIZE to size the kvmslot dirty_bmap may waste
memory. For example, when qemu_real_host_page_size is 64K and
TARGET_PAGE_SIZE is 4K, this bugfix can save 93.75% of that memory.
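
As an illustration of where the 93.75% figure comes from (numbers are
illustrative, not from the patch): a 1 GiB memslot sized by TARGET_PAGE_SIZE
needs 1 GiB / 4 KiB = 262144 bits, i.e. a 32 KiB bitmap, while sizing it by
qemu_real_host_page_size needs only 1 GiB / 64 KiB = 16384 bits, i.e. 2 KiB,
which is 1/16 of the former, a 93.75% saving.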

Signed-off-by: Keqian Zhu 
---
 accel/kvm/kvm-all.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index baaa54249d..c5e06288eb 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -620,8 +620,12 @@ static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
  * too, in most cases).
  * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
  * a hope that sizeof(long) won't become >8 any time soon.
+ *
+ * Note: the granule of kvm dirty log is qemu_real_host_page_size.
+ * And mem->memory_size is aligned to it (otherwise this mem can't
+ * be registered to KVM).
  */
-hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
+hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size,
 /*HOST_LONG_BITS*/ 64) / 8;
 mem->dirty_bmap = g_malloc0(bitmap_size);
 }
-- 
2.23.0




[PATCH v2] bugfix: hostmem: Free host_nodes list right after visited

2020-12-09 Thread Keqian Zhu
In host_memory_backend_get_host_nodes(), we build the host_nodes
list and output it to v (a StringOutputVisitor), but forget
to free the list. This fixes the memory leak.

The memory leak stack:

 Direct leak of 32 byte(s) in 2 object(s) allocated from:
#0 0xfffda30b3393 in __interceptor_calloc (/usr/lib64/libasan.so.4+0xd3393)
#1 0xfffda1d28b9b in g_malloc0 (/usr/lib64/libglib-2.0.so.0+0x58b9b)
#2 0xaaab05ca6e43 in host_memory_backend_get_host_nodes 
backends/hostmem.c:94
#3 0xaaab061ddf83 in object_property_get_uint16List qom/object.c:1478
#4 0xaaab05866513 in query_memdev hw/core/machine-qmp-cmds.c:312
#5 0xaaab061d980b in do_object_child_foreach qom/object.c:1001
#6 0xaaab0586779b in qmp_query_memdev hw/core/machine-qmp-cmds.c:328
#7 0xaaab0615ed3f in qmp_marshal_query_memdev 
qapi/qapi-commands-machine.c:327
#8 0xaaab0632d647 in do_qmp_dispatch qapi/qmp-dispatch.c:147
#9 0xaaab0632d647 in qmp_dispatch qapi/qmp-dispatch.c:190
#10 0xaaab0610f74b in monitor_qmp_dispatch monitor/qmp.c:120
#11 0xaaab0611074b in monitor_qmp_bh_dispatcher monitor/qmp.c:209
#12 0xaaab063caefb in aio_bh_poll util/async.c:117
#13 0xaaab063d30fb in aio_dispatch util/aio-posix.c:459
#14 0xaaab063cac8f in aio_ctx_dispatch util/async.c:268
#15 0xfffda1d22a6b in g_main_context_dispatch 
(/usr/lib64/libglib-2.0.so.0+0x52a6b)
#16 0xaaab063d0e97 in glib_pollfds_poll util/main-loop.c:218
#17 0xaaab063d0e97 in os_host_main_loop_wait util/main-loop.c:241
#18 0xaaab063d0e97 in main_loop_wait util/main-loop.c:517
#19 0xaaab05c8bfa7 in main_loop /root/rpmbuild/BUILD/qemu-4.1.0/vl.c:1791
#20 0xaaab05713bc3 in main /root/rpmbuild/BUILD/qemu-4.1.0/vl.c:4473
#21 0xfffda0a83ebf in __libc_start_main (/usr/lib64/libc.so.6+0x23ebf)
#22 0xaaab0571ed5f  (aarch64-softmmu/qemu-system-aarch64+0x88ed5f)
 SUMMARY: AddressSanitizer: 32 byte(s) leaked in 2 allocation(s).

Fixes: 4cf1b76bf1e2 (hostmem: add properties for NUMA memory policy)
Reported-by: Euler Robot 
Tested-by: Chen Qun 
Reviewed-by: Igor Mammedov 
Signed-off-by: Keqian Zhu 
---
 backends/hostmem.c | 1 +
 1 file changed, 1 insertion(+)
---

v2:
 - Update commit message.
 - Add Chen Qun's T.b.
 - Add Igor's R.b.

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 4bde00e8e7..9f9ac95edd 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -105,6 +105,7 @@ host_memory_backend_get_host_nodes(Object *obj, Visitor *v, 
const char *name,
 
 ret:
 visit_type_uint16List(v, name, _nodes, errp);
+qapi_free_uint16List(host_nodes);
 }
 
 static void
-- 
2.23.0




[PATCH] bugfix: hostmem: Free host_nodes list right after visited

2020-12-08 Thread Keqian Zhu
In host_memory_backend_get_host_nodes(), we build the host_nodes
list and output it to v (a StringOutputVisitor), but forget
to free the list. This fixes the memory leak.

The memory leak stack:

==qemu-kvm==209357==ERROR: LeakSanitizer: detected memory leaks
Direct leak of 32 byte(s) in 2 object(s) allocated from:
  #0 0xfffe430e3393 in __interceptor_calloc (/lib64/libasan.so.4+0xd3393)  ??:?
  #1 0xfffe41d58b9b in g_malloc0 (/lib64/libglib-2.0.so.0+0x58b9b)  ??:?
  #2 0xaaac0cdb6e43 (/usr/libexec/qemu-kvm+0xe16e43)  backends/hostmem.c:94
  #3 0xaaac0d2edf83 (/usr/libexec/qemu-kvm+0x134df83) qom/object.c:1478
  #4 0xaaac0c976513 (/usr/libexec/qemu-kvm+0x9d6513)  
hw/core/machine-qmp-cmds.c:312
  #5 0xaaac0d2e980b (/usr/libexec/qemu-kvm+0x134980b) qom/object.c:1001
  #6 0xaaac0c97779b (/usr/libexec/qemu-kvm+0x9d779b)  
hw/core/machine-qmp-cmds.c:328 (discriminator 1)
  #7 0xaaac0d26ed3f (/usr/libexec/qemu-kvm+0x12ced3f) 
qapi/qapi-commands-machine.c:327
  #8 0xaaac0d43d647 (/usr/libexec/qemu-kvm+0x149d647) qapi/qmp-dispatch.c:147
  #9 0xaaac0d21f74b (/usr/libexec/qemu-kvm+0x127f74b) monitor/qmp.c:120
  #10 0xaaac0d22074b (/usr/libexec/qemu-kvm+0x128074b) monitor/qmp.c:209 
(discriminator 4)
  #11 0xaaac0d4daefb (/usr/libexec/qemu-kvm+0x153aefb) util/async.c:117
  #12 0xaaac0d4e30fb (/usr/libexec/qemu-kvm+0x15430fb) util/aio-posix.c:459
  #13 0xaaac0d4dac8f (/usr/libexec/qemu-kvm+0x153ac8f) util/async.c:268
  #14 0xfffe41d52a6b in g_main_context_dispatch 
(/lib64/libglib-2.0.so.0+0x52a6b)  ??:?
  #15 0xaaac0d4e0e97 (/usr/libexec/qemu-kvm+0x1540e97)  util/main-loop.c:218
  #16 0xaaac0cd9bfa7 (/usr/libexec/qemu-kvm+0xdfbfa7)  vl.c:1791
  #17 0xaaac0c823bc3 (/usr/libexec/qemu-kvm+0x883bc3)  vl.c:4473
  #18 0xfffe40ab3ebf in __libc_start_main (/lib64/libc.so.6+0x23ebf)  ??:?
  #19 0xaaac0c82ed5f (/usr/libexec/qemu-kvm+0x88ed5f)  ??:?
SUMMARY: AddressSanitizer: 32 byte(s) leaked in 2 allocation(s).

Fixes: 4cf1b76bf1e2 (hostmem: add properties for NUMA memory policy)
Reported-by: Euler Robot 
Signed-off-by: Keqian Zhu 
---
 backends/hostmem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 4bde00e8e7..9f9ac95edd 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -105,6 +105,7 @@ host_memory_backend_get_host_nodes(Object *obj, Visitor *v, 
const char *name,
 
 ret:
 visit_type_uint16List(v, name, _nodes, errp);
+qapi_free_uint16List(host_nodes);
 }
 
 static void
-- 
2.23.0




[PATCH v2 0/2] bugfix: Decrease dirty bitmap blocks after we remove ramblock

2020-11-30 Thread Keqian Zhu
Keqian Zhu (2):
  ramlist: Make dirty bitmap blocks of ramlist resizable
  ramlist: Resize dirty bitmap blocks after remove ramblock

 softmmu/physmem.c | 37 +
 1 file changed, 29 insertions(+), 8 deletions(-)

-- 
2.23.0




[PATCH v2 2/2] ramlist: Resize dirty bitmap blocks after remove ramblock

2020-11-30 Thread Keqian Zhu
Use the new "dirty_memory_resize" interface to reduce dirty bitmap
blocks after we remove a ramblock from ramlist.

This bug is found by ASAN after executing several qmp commands about
object-add/object-del of memory-backend-ram. After applying this patch,
the memory leak is not reported anymore.

=
==qemu-system-aarch64==1720167==ERROR: LeakSanitizer: detected memory leaks
Direct leak of 2359296 byte(s) in 9 object(s) allocated from:
#0 0xfffeedf3e938 in __interceptor_calloc (/lib64/libasan.so.5+0xee938)
#1 0xf6f1e740 in bitmap_new /qemu/include/qemu/bitmap.h:101
#2 0xf6f1e81c in dirty_memory_extend ../exec.c:2212
#3 0xf6f22524 in ram_block_add ../exec.c:2261
#4 0xf6f22988 in qemu_ram_alloc_internal ../exec.c:2434
#5 0xf6f8ae70 in memory_region_init_ram_shared_nomigrate 
../softmmu/memory.c:1513
#6 0xf66edee0 in ram_backend_memory_alloc ../backends/hostmem-ram.c:30
#7 0xf660d03c in host_memory_backend_memory_complete 
../backends/hostmem.c:333
#8 0xf70f6968 in user_creatable_complete ../qom/object_interfaces.c:23
#9 0xf70f6dac in user_creatable_add_type ../qom/object_interfaces.c:93
#10 0xf70f7030 in user_creatable_add_dict ../qom/object_interfaces.c:134
#11 0xf7340174 in do_qmp_dispatch_bh ../qapi/qmp-dispatch.c:110
#12 0xf732da30 in aio_bh_poll ../util/async.c:164
#13 0xf735c9a8 in aio_dispatch ../util/aio-posix.c:381
#14 0xf732d2ec in aio_ctx_dispatch ../util/async.c:306
#15 0xfffeecb029d8 in g_main_context_dispatch 
(/lib64/libglib-2.0.so.0+0x529d8)
#16 0xf733bb78 in os_host_main_loop_wait ../util/main-loop.c:244
#17 0xf733beac in main_loop_wait ../util/main-loop.c:520
#18 0xf70802a4 in qemu_main_loop ../softmmu/vl.c:1677
#19 0xf655786c in main ../softmmu/main.c:50
#20 0xfffeec043f5c in __libc_start_main (/lib64/libc.so.6+0x23f5c)
#21 0xf656a198  (/qemu/build/qemu-system-aarch64+0x9ba198)
SUMMARY: AddressSanitizer: 2359296 byte(s) leaked in 9 allocation(s).

Reported-by: Euler Robot 
Signed-off-by: Keqian Zhu 


A little concern:
According to the code, my bugfix solves two problems:

1. The reference to the dirty bitmap of a deleted ramblock is lost, because it
   gets overwritten by the dirty bitmap of a newly added ramblock.
2. The dirty bitmaps are not all freed before QEMU exits.

However, ASAN does not report a memory leak for point 2.
Any thoughts or explanations?
---
 softmmu/physmem.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 3e4f29f126..8c5f910677 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -2132,6 +2132,8 @@ static void reclaim_ramblock(RAMBlock *block)
 
 void qemu_ram_free(RAMBlock *block)
 {
+ram_addr_t old_ram_size, new_ram_size;
+
 if (!block) {
 return;
 }
@@ -2141,12 +2143,18 @@ void qemu_ram_free(RAMBlock *block)
 }
 
 qemu_mutex_lock_ramlist();
+
+old_ram_size = last_ram_page();
 QLIST_REMOVE_RCU(block, next);
+new_ram_size = last_ram_page();
+dirty_memory_resize(old_ram_size, new_ram_size);
+
 ram_list.mru_block = NULL;
 /* Write list before version */
 smp_wmb();
 ram_list.version++;
 call_rcu(block, reclaim_ramblock, rcu);
+
 qemu_mutex_unlock_ramlist();
 }
 
-- 
2.23.0




[PATCH v2 1/2] ramlist: Make dirty bitmap blocks of ramlist resizable

2020-11-30 Thread Keqian Zhu
When we remove a ramblock, we should shrink the dirty bitmap blocks
of the ramlist to avoid a memory leak. This patch reworks
dirty_memory_extend() to support both extending and shrinking.

Reported-by: Euler Robot 
Signed-off-by: Keqian Zhu 
---
 softmmu/physmem.c | 29 +
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 3027747c03..3e4f29f126 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -1816,17 +1816,19 @@ void qemu_ram_msync(RAMBlock *block, ram_addr_t start, 
ram_addr_t length)
 }
 
 /* Called with ram_list.mutex held */
-static void dirty_memory_extend(ram_addr_t old_ram_size,
+static void dirty_memory_resize(ram_addr_t old_ram_size,
 ram_addr_t new_ram_size)
 {
 ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
  DIRTY_MEMORY_BLOCK_SIZE);
 ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
  DIRTY_MEMORY_BLOCK_SIZE);
+ram_addr_t cpy_num_blocks = MIN(old_num_blocks, new_num_blocks);
+bool extend = new_num_blocks > old_num_blocks;
 int i;
 
-/* Only need to extend if block count increased */
-if (new_num_blocks <= old_num_blocks) {
+/* Only need to resize if block count changed */
+if (new_num_blocks == old_num_blocks) {
 return;
 }
 
@@ -1839,15 +1841,26 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
 new_blocks = g_malloc(sizeof(*new_blocks) +
   sizeof(new_blocks->blocks[0]) * new_num_blocks);
 
-if (old_num_blocks) {
+if (cpy_num_blocks) {
 memcpy(new_blocks->blocks, old_blocks->blocks,
-   old_num_blocks * sizeof(old_blocks->blocks[0]));
+   cpy_num_blocks * sizeof(old_blocks->blocks[0]));
 }
 
-for (j = old_num_blocks; j < new_num_blocks; j++) {
-new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
+if (extend) {
+for (j = cpy_num_blocks; j < new_num_blocks; j++) {
+new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
+}
+} else {
+for (j = cpy_num_blocks; j < old_num_blocks; j++) {
+/* We are safe to free it, for that it is out-of-use */
+g_free(old_blocks->blocks[j]);
+}
 }
 
+if (!new_num_blocks) {
+g_free(new_blocks);
+new_blocks = NULL;
+}
 qatomic_rcu_set(_list.dirty_memory[i], new_blocks);
 
 if (old_blocks) {
@@ -1894,7 +1907,7 @@ static void ram_block_add(RAMBlock *new_block, Error 
**errp, bool shared)
 new_ram_size = MAX(old_ram_size,
   (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
 if (new_ram_size > old_ram_size) {
-dirty_memory_extend(old_ram_size, new_ram_size);
+dirty_memory_resize(old_ram_size, new_ram_size);
 }
 /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
  * QLIST (which has an RCU-friendly variant) does not have insertion at
-- 
2.23.0




[PATCH] net: Use correct default-path macro for downscript

2020-11-22 Thread Keqian Zhu
Fixes: 63c4db4c2e6d (net: relocate paths to helpers and scripts)
Signed-off-by: Keqian Zhu 
---
 net/tap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/tap.c b/net/tap.c
index c46ff66184..b8e5cca51c 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -951,7 +951,8 @@ free_fail:
 script = default_script = 
get_relocated_path(DEFAULT_NETWORK_SCRIPT);
 }
 if (!downscript) {
-downscript = default_downscript = 
get_relocated_path(DEFAULT_NETWORK_SCRIPT);
+downscript = default_downscript =
+ 
get_relocated_path(DEFAULT_NETWORK_DOWN_SCRIPT);
 }
 
 if (tap->has_ifname) {
-- 
2.23.0




[RFC PATCH 2/2] ramlist: Resize dirty bitmap blocks after remove ramblock

2020-11-20 Thread Keqian Zhu
Use the new "dirty_bitmap_resize" interface to reduce dirty bitmap
blocks after we remove a ramblock from ramlist.

Signed-off-by: Keqian Zhu 
---
 softmmu/physmem.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index f6ff78378e..2a17e0a89a 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -2131,6 +2131,8 @@ static void reclaim_ramblock(RAMBlock *block)
 
 void qemu_ram_free(RAMBlock *block)
 {
+ram_addr_t old_ram_size, new_ram_size;
+
 if (!block) {
 return;
 }
@@ -2140,12 +2142,18 @@ void qemu_ram_free(RAMBlock *block)
 }
 
 qemu_mutex_lock_ramlist();
+
+old_ram_size = last_ram_page();
 QLIST_REMOVE_RCU(block, next);
+new_ram_size = last_ram_page();
+dirty_bitmap_resize(old_ram_size, new_ram_size);
+
 ram_list.mru_block = NULL;
 /* Write list before version */
 smp_wmb();
 ram_list.version++;
 call_rcu(block, reclaim_ramblock, rcu);
+
 qemu_mutex_unlock_ramlist();
 }
 
-- 
2.23.0




[RFC PATCH 1/2] ramlist: Make dirty bitmap blocks of ramlist resizable

2020-11-20 Thread Keqian Zhu
When we remove a ramblock, we should shrink the dirty bitmap blocks
of the ramlist to avoid a memory leak. This patch reworks
dirty_memory_extend() to support both extending and shrinking.

Signed-off-by: Keqian Zhu 
---
 softmmu/physmem.c | 28 
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 3027747c03..f6ff78378e 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -1816,17 +1816,19 @@ void qemu_ram_msync(RAMBlock *block, ram_addr_t start, 
ram_addr_t length)
 }
 
 /* Called with ram_list.mutex held */
-static void dirty_memory_extend(ram_addr_t old_ram_size,
+static void dirty_bitmap_resize(ram_addr_t old_ram_size,
 ram_addr_t new_ram_size)
 {
 ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
  DIRTY_MEMORY_BLOCK_SIZE);
 ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
  DIRTY_MEMORY_BLOCK_SIZE);
+ram_addr_t cpy_num_blocks = MIN(old_num_blocks, new_num_blocks);
+bool extend = new_num_blocks > old_num_blocks;
 int i;
 
-/* Only need to extend if block count increased */
-if (new_num_blocks <= old_num_blocks) {
+/* Only need to resize if block count changed */
+if (new_num_blocks == old_num_blocks) {
 return;
 }
 
@@ -1839,13 +1841,23 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
 new_blocks = g_malloc(sizeof(*new_blocks) +
   sizeof(new_blocks->blocks[0]) * new_num_blocks);
 
-if (old_num_blocks) {
+if (cpy_num_blocks) {
 memcpy(new_blocks->blocks, old_blocks->blocks,
-   old_num_blocks * sizeof(old_blocks->blocks[0]));
+   cpy_num_blocks * sizeof(old_blocks->blocks[0]));
 }
 
-for (j = old_num_blocks; j < new_num_blocks; j++) {
-new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
+if (extend) {
+for (j = old_num_blocks; j < new_num_blocks; j++) {
+new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
+}
+} else {
+for (j = cpy_num_blocks; j < old_num_blocks; j++) {
+/*
+ * We are safe to free it here, for that its RAMblock
+ * is out-of-use.
+ */
+g_free(old_blocks->blocks[j]);
+}
 }
 
 qatomic_rcu_set(_list.dirty_memory[i], new_blocks);
@@ -1894,7 +1906,7 @@ static void ram_block_add(RAMBlock *new_block, Error 
**errp, bool shared)
 new_ram_size = MAX(old_ram_size,
   (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
 if (new_ram_size > old_ram_size) {
-dirty_memory_extend(old_ram_size, new_ram_size);
+dirty_bitmap_resize(old_ram_size, new_ram_size);
 }
 /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
  * QLIST (which has an RCU-friendly variant) does not have insertion at
-- 
2.23.0




[RFC PATCH 0/2] bugfix: Decrease dirty bitmap blocks after we remove ramblock

2020-11-20 Thread Keqian Zhu
Hi all,

I failed to find where we free the dirty bitmap blocks of the ramlist. If this
is a memory leak, I hope this patch series fixes it properly :-).

Thanks,
Keqian.

Keqian Zhu (2):
  ramlist: Make dirty bitmap blocks of ramlist resizable
  ramlist: Resize dirty bitmap blocks after remove ramblock

 softmmu/physmem.c | 36 
 1 file changed, 28 insertions(+), 8 deletions(-)

-- 
2.23.0




[PATCH] bugfix: irq: Avoid covering object refcount of qemu_irq

2020-07-27 Thread Keqian Zhu
Avoid overwriting the object refcount of a qemu_irq, otherwise it may cause
a memory leak.

Signed-off-by: Keqian Zhu 
---
 hw/core/irq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/core/irq.c b/hw/core/irq.c
index fb3045b912..59af4dfc74 100644
--- a/hw/core/irq.c
+++ b/hw/core/irq.c
@@ -125,7 +125,9 @@ void qemu_irq_intercept_in(qemu_irq *gpio_in, 
qemu_irq_handler handler, int n)
 int i;
 qemu_irq *old_irqs = qemu_allocate_irqs(NULL, NULL, n);
 for (i = 0; i < n; i++) {
-*old_irqs[i] = *gpio_in[i];
+old_irqs[i]->handler = gpio_in[i]->handler;
+old_irqs[i]->opaque = gpio_in[i]->opaque;
+
 gpio_in[i]->handler = handler;
 gpio_in[i]->opaque = _irqs[i];
 }
-- 
2.19.1




[PATCH] migration: Assign current_migration as NULL after migration

2020-06-28 Thread Keqian Zhu
In migration_shutdown(), the global variable current_migration is freed but
not set to NULL, which may cause a heap-use-after-free if later code
unexpectedly dereferences it.

Signed-off-by: Keqian Zhu 
---
 migration/migration.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/migration/migration.c b/migration/migration.c
index 481a590f72..ed7332 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -189,6 +189,7 @@ void migration_shutdown(void)
  */
 migrate_fd_cancel(current_migration);
 object_unref(OBJECT(current_migration));
+current_migration = NULL;
 }
 
 /* For outgoing */
-- 
2.19.1




[PATCH v3] migration: Count new_dirty instead of real_dirty

2020-06-21 Thread Keqian Zhu
real_dirty_pages becomes equal to the total RAM size after the dirty log sync
in ram_init_bitmaps(), because the bitmap of each ramblock is initialized to
all set, so the old path counts every page as "real dirty" at the beginning.

This causes a wrong dirty rate and false positive throttling.
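
For example (illustrative numbers only): with 64 GiB of guest RAM and a
one-second sync period, the first sync reports a "dirty rate" of 64 GiB/s,
far above any realistic link bandwidth, so auto-converge throttling kicks in
immediately even though the guest has dirtied nothing yet.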

Signed-off-by: Keqian Zhu 
---
Changelog:

v3:
 - Address Dave's comments.

v2:
 - Use new_dirty_pages instead of accu_dirty_pages.
 - Adjust commit messages.
---
 include/exec/ram_addr.h | 5 +
 migration/ram.c | 8 +---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 7b5c24e928..3ef729a23c 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -442,8 +442,7 @@ static inline void 
cpu_physical_memory_clear_dirty_range(ram_addr_t start,
 static inline
 uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
ram_addr_t start,
-   ram_addr_t length,
-   uint64_t *real_dirty_pages)
+   ram_addr_t length)
 {
 ram_addr_t addr;
 unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
@@ -469,7 +468,6 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 if (src[idx][offset]) {
 unsigned long bits = atomic_xchg([idx][offset], 0);
 unsigned long new_dirty;
-*real_dirty_pages += ctpopl(bits);
 new_dirty = ~dest[k];
 dest[k] |= bits;
 new_dirty &= bits;
@@ -502,7 +500,6 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 start + addr + offset,
 TARGET_PAGE_SIZE,
 DIRTY_MEMORY_MIGRATION)) {
-*real_dirty_pages += 1;
 long k = (start + addr) >> TARGET_PAGE_BITS;
 if (!test_and_set_bit(k, dest)) {
 num_dirty++;
diff --git a/migration/ram.c b/migration/ram.c
index 069b6e30bc..5554a7d2d8 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -859,9 +859,11 @@ static inline bool migration_bitmap_clear_dirty(RAMState 
*rs,
 /* Called with RCU critical section */
 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 {
-rs->migration_dirty_pages +=
-cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
-  >num_dirty_pages_period);
+uint64_t new_dirty_pages =
+cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
+
+rs->migration_dirty_pages += new_dirty_pages;
+rs->num_dirty_pages_period += new_dirty_pages;
 }
 
 /**
-- 
2.19.1




[PATCH v2] migration: Count new_dirty instead of real_dirty

2020-06-15 Thread Keqian Zhu
real_dirty_pages becomes equal to the total RAM size after the dirty log sync
in ram_init_bitmaps(), because the bitmap of each ramblock is initialized to
all set, so the old path counts every page as "real dirty" at the beginning.

This causes a wrong dirty rate and false positive throttling at the end
of the first RAM save iteration.

Signed-off-by: Keqian Zhu 

---
Changelog:

v2:
 - use new_dirty_pages instead of accu_dirty_pages.
 - adjust commit messages.

---
 include/exec/ram_addr.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 7b5c24e928..a95e2e7c25 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -443,7 +443,7 @@ static inline
 uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
ram_addr_t start,
ram_addr_t length,
-   uint64_t *real_dirty_pages)
+   uint64_t *new_dirty_pages)
 {
 ram_addr_t addr;
 unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
@@ -469,7 +469,6 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 if (src[idx][offset]) {
 unsigned long bits = atomic_xchg([idx][offset], 0);
 unsigned long new_dirty;
-*real_dirty_pages += ctpopl(bits);
 new_dirty = ~dest[k];
 dest[k] |= bits;
 new_dirty &= bits;
@@ -502,7 +501,6 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 start + addr + offset,
 TARGET_PAGE_SIZE,
 DIRTY_MEMORY_MIGRATION)) {
-*real_dirty_pages += 1;
 long k = (start + addr) >> TARGET_PAGE_BITS;
 if (!test_and_set_bit(k, dest)) {
 num_dirty++;
@@ -511,6 +509,7 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 }
 }
 
+*new_dirty_pages += num_dirty;
 return num_dirty;
 }
 #endif
-- 
2.19.1




[PATCH] migration: Count new_dirty instead of real_dirty

2020-05-31 Thread Keqian Zhu
The DIRTY_LOG_INITIALLY_ALL_SET feature is on the queue. This fixes the
dirty rate calculation for that feature. After introducing it,
real_dirty_pages is equal to the total memory size at the beginning.
This causes a wrong dirty rate and false positive throttling.

BTW, the "real dirty rate" is neither suitable nor very accurate.

1. Not suitable: we mainly care about the relationship between the
   dirty rate and the network bandwidth, so the net increase of dirty
   pages makes more sense.
2. Not very accurate: with manual dirty log clear, some dirty pages
   will be cleared during each period, so our "real dirty rate" is lower
   than the true "real dirty rate".

Signed-off-by: Keqian Zhu 
---
 include/exec/ram_addr.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 5e59a3d8d7..af9677e291 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -443,7 +443,7 @@ static inline
 uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
ram_addr_t start,
ram_addr_t length,
-   uint64_t *real_dirty_pages)
+   uint64_t *accu_dirty_pages)
 {
 ram_addr_t addr;
 unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
@@ -469,7 +469,6 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 if (src[idx][offset]) {
 unsigned long bits = atomic_xchg([idx][offset], 0);
 unsigned long new_dirty;
-*real_dirty_pages += ctpopl(bits);
 new_dirty = ~dest[k];
 dest[k] |= bits;
 new_dirty &= bits;
@@ -502,7 +501,6 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 start + addr + offset,
 TARGET_PAGE_SIZE,
 DIRTY_MEMORY_MIGRATION)) {
-*real_dirty_pages += 1;
 long k = (start + addr) >> TARGET_PAGE_BITS;
 if (!test_and_set_bit(k, dest)) {
 num_dirty++;
@@ -511,6 +509,7 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 }
 }
 
+*accu_dirty_pages += num_dirty;
 return num_dirty;
 }
 #endif
-- 
2.19.1




[PATCH v3] migration/throttle: Add cpu-throttle-tailslow migration parameter

2020-04-13 Thread Keqian Zhu
At the tail stage of throttling, the guest is very sensitive to the
CPU percentage, while @cpu-throttle-increment is usually excessive
at that stage.

If this parameter is true, we compute the ideal CPU percentage
for the guest, i.e. the one that would make the dirty rate exactly match
the dirty rate threshold. Then we choose the smaller throttle increment
between the one specified by @cpu-throttle-increment and the one
derived from the ideal CPU percentage.

Therefore, it stays compatible with traditional throttling, while
the throttle increment won't be excessive at the tail stage. This may
make migration take longer, and it is disabled by default.
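
The diff of this patch is truncated in this archive, so as a rough sketch only
(the variable names follow the hunk that is still visible below; the exact
arithmetic is a reconstruction, not the authoritative hunk):

    if (pct_tailslow) {
        cpu_now = 100 - throttle_now;
        /* Percentage that would make the dirty rate just meet the
           threshold; throttling harder than this wastes guest CPU. */
        cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                               bytes_dirty_period);
        throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
    } else {
        throttle_inc = pct_increment;
    }
    cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));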

Signed-off-by: Keqian Zhu 
---
Cc: Juan Quintela 
Cc: "Dr. David Alan Gilbert" 
Cc: Eric Blake 
Cc: Markus Armbruster 
---
 migration/migration.c | 13 
 migration/ram.c   | 25 +-
 monitor/hmp-cmds.c|  8 
 qapi/migration.json   | 48 +++
 4 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 187ac0410c..d478a87290 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -785,6 +785,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
 params->has_cpu_throttle_increment = true;
 params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
+params->has_cpu_throttle_tailslow = true;
+params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
 params->has_tls_creds = true;
 params->tls_creds = g_strdup(s->parameters.tls_creds);
 params->has_tls_hostname = true;
@@ -1324,6 +1326,10 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->cpu_throttle_increment = params->cpu_throttle_increment;
 }
 
+if (params->has_cpu_throttle_tailslow) {
+dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
+}
+
 if (params->has_tls_creds) {
 assert(params->tls_creds->type == QTYPE_QSTRING);
 dest->tls_creds = g_strdup(params->tls_creds->u.s);
@@ -1412,6 +1418,10 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
 }
 
+if (params->has_cpu_throttle_tailslow) {
+s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
+}
+
 if (params->has_tls_creds) {
 g_free(s->parameters.tls_creds);
 assert(params->tls_creds->type == QTYPE_QSTRING);
@@ -3594,6 +3604,8 @@ static Property migration_properties[] = {
 DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
   parameters.cpu_throttle_increment,
   DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
+DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
+  parameters.cpu_throttle_tailslow, false),
 DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
   parameters.max_bandwidth, MAX_THROTTLE),
 DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
@@ -3700,6 +3712,7 @@ static void migration_instance_init(Object *obj)
 params->has_throttle_trigger_threshold = true;
 params->has_cpu_throttle_initial = true;
 params->has_cpu_throttle_increment = true;
+params->has_cpu_throttle_tailslow = true;
 params->has_max_bandwidth = true;
 params->has_downtime_limit = true;
 params->has_x_checkpoint_delay = true;
diff --git a/migration/ram.c b/migration/ram.c
index 04f13feb2e..3317c99786 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -616,20 +616,34 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f, 
 RAMBlock *block,
  * able to complete migration. Some workloads dirty memory way too
  * fast and will not effectively converge, even with auto-converge.
  */
-static void mig_throttle_guest_down(void)
+static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
+uint64_t bytes_dirty_threshold)
 {
 MigrationState *s = migrate_get_current();
 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
-uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
+uint64_t pct_increment = s->parameters.cpu_throttle_increment;
+bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 int pct_max = s->parameters.max_cpu_throttle;
 
+uint64_t throttle_now = cpu_throttle_get_percentage();
+uint64_t cpu_now, cpu_ideal, throttle_inc;
+
 /* We have not started throttling yet. Let's start it. */
 if (!cpu_throttle_active()) {
 cpu_throttle_set(pct_initial);
 } else {
 /* Throttling already on, just increase the rate */
- 

[PATCH 1/3] bugfix: Use gicr_typer in arm_gicv3_icc_reset

2020-04-13 Thread Keqian Zhu
The KVM_VGIC_ATTR macro expects gicr_typer as its second parameter,
whose high 32 bits are constructed from mp_affinity. In most cases,
the high 32 bits of mp_affinity are zero, so passing mp_affinity directly
always accesses the ICC_CTLR_EL1 of CPU0.
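
For reference, a paraphrase of the macro (see hw/intc/arm_gicv3_kvm.c for the
authoritative definition; treat this as a sketch from memory):

  #define KVM_VGIC_ATTR(reg, typer) \
      ((typer & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) | (reg))

KVM_DEV_ARM_VGIC_V3_MPIDR_MASK selects bits [63:32], and mp_affinity usually
has nothing in its high 32 bits, so the attribute ends up carrying affinity 0
and KVM resolves it to vCPU0, whereas c->gicr_typer carries the right affinity.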

Signed-off-by: Keqian Zhu 
---
 hw/intc/arm_gicv3_kvm.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c
index 49304ca589..ca43bf87ca 100644
--- a/hw/intc/arm_gicv3_kvm.c
+++ b/hw/intc/arm_gicv3_kvm.c
@@ -658,13 +658,11 @@ static void kvm_arm_gicv3_get(GICv3State *s)
 
 static void arm_gicv3_icc_reset(CPUARMState *env, const ARMCPRegInfo *ri)
 {
-ARMCPU *cpu;
 GICv3State *s;
 GICv3CPUState *c;
 
 c = (GICv3CPUState *)env->gicv3state;
 s = c->gic;
-cpu = ARM_CPU(c->cpu);
 
 c->icc_pmr_el1 = 0;
 c->icc_bpr[GICV3_G0] = GIC_MIN_BPR;
@@ -681,7 +679,7 @@ static void arm_gicv3_icc_reset(CPUARMState *env, const 
ARMCPRegInfo *ri)
 
 /* Initialize to actual HW supported configuration */
 kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS,
-  KVM_VGIC_ATTR(ICC_CTLR_EL1, cpu->mp_affinity),
+  KVM_VGIC_ATTR(ICC_CTLR_EL1, c->gicr_typer),
   >icc_ctlr_el1[GICV3_NS], false, _abort);
 
 c->icc_ctlr_el1[GICV3_S] = c->icc_ctlr_el1[GICV3_NS];
-- 
2.19.1




[PATCH 2/3] intc/gicv3_kvm: use kvm_gicc_access to get ICC_CTLR_EL1

2020-04-13 Thread Keqian Zhu
Replace kvm_device_access with kvm_gicc_access to simplify
code.

Signed-off-by: Keqian Zhu 
---
 hw/intc/arm_gicv3_kvm.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c
index ca43bf87ca..85f6420498 100644
--- a/hw/intc/arm_gicv3_kvm.c
+++ b/hw/intc/arm_gicv3_kvm.c
@@ -678,9 +678,8 @@ static void arm_gicv3_icc_reset(CPUARMState *env, const 
ARMCPRegInfo *ri)
 }
 
 /* Initialize to actual HW supported configuration */
-kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS,
-  KVM_VGIC_ATTR(ICC_CTLR_EL1, c->gicr_typer),
-  >icc_ctlr_el1[GICV3_NS], false, _abort);
+kvm_gicc_access(s, ICC_CTLR_EL1, c->cpu->cpu_index,
+>icc_ctlr_el1[GICV3_NS], false);
 
 c->icc_ctlr_el1[GICV3_S] = c->icc_ctlr_el1[GICV3_NS];
 }
-- 
2.19.1




[PATCH 0/3] Some trivial fixes

2020-04-13 Thread Keqian Zhu
Hi all,

This patch set contains trivial bugfixes and a typo fix.

Thanks,
Keqian

Keqian Zhu (3):
  bugfix: Use gicr_typer in arm_gicv3_icc_reset
  intc/gicv3_kvm: use kvm_gicc_access to get ICC_CTLR_EL1
  Typo: Correct the name of CPU hotplug memory region

 hw/acpi/cpu.c   | 2 +-
 hw/intc/arm_gicv3_kvm.c | 7 ++-
 2 files changed, 3 insertions(+), 6 deletions(-)

-- 
2.19.1




[PATCH 3/3] Typo: Correct the name of CPU hotplug memory region

2020-04-13 Thread Keqian Zhu
Replace "acpi-mem-hotplug" with "acpi-cpu-hotplug"

Signed-off-by: Keqian Zhu 
---
 hw/acpi/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
index e2c957ce00..3d6a500fb7 100644
--- a/hw/acpi/cpu.c
+++ b/hw/acpi/cpu.c
@@ -222,7 +222,7 @@ void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner,
 state->devs[i].arch_id = id_list->cpus[i].arch_id;
 }
 memory_region_init_io(>ctrl_reg, owner, _hotplug_ops, state,
-  "acpi-mem-hotplug", ACPI_CPU_HOTPLUG_REG_LEN);
+  "acpi-cpu-hotplug", ACPI_CPU_HOTPLUG_REG_LEN);
 memory_region_add_subregion(as, base_addr, >ctrl_reg);
 }
 
-- 
2.19.1




[PATCH v2] migration/throttle: Add cpu-throttle-tailslow migration parameter

2020-03-15 Thread Keqian Zhu
At the tail stage of throttling, the guest is very sensitive to the
CPU percentage, while @cpu-throttle-increment is usually excessive
at that stage.

If this parameter is true, we compute the ideal CPU percentage
for the guest, i.e. the one that would make the dirty rate exactly match
the dirty rate threshold. Then we choose the smaller throttle increment
between the one specified by @cpu-throttle-increment and the one
derived from the ideal CPU percentage.

Therefore, it stays compatible with traditional throttling, while
the throttle increment won't be excessive at the tail stage. This may
make migration take longer, and it is disabled by default.

Signed-off-by: Keqian Zhu 
---
Cc: Juan Quintela 
Cc: "Dr. David Alan Gilbert" 
Cc: Eric Blake 
Cc: Markus Armbruster 
---
 migration/migration.c | 13 
 migration/ram.c   | 25 +-
 monitor/hmp-cmds.c|  8 
 qapi/migration.json   | 48 +++
 4 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index c1d88ace7f..cc157cbf90 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -785,6 +785,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
 params->has_cpu_throttle_increment = true;
 params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
+params->has_cpu_throttle_tailslow = true;
+params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
 params->has_tls_creds = true;
 params->tls_creds = g_strdup(s->parameters.tls_creds);
 params->has_tls_hostname = true;
@@ -1323,6 +1325,10 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->cpu_throttle_increment = params->cpu_throttle_increment;
 }
 
+if (params->has_cpu_throttle_tailslow) {
+dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
+}
+
 if (params->has_tls_creds) {
 assert(params->tls_creds->type == QTYPE_QSTRING);
 dest->tls_creds = g_strdup(params->tls_creds->u.s);
@@ -1411,6 +1417,10 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
 }
 
+if (params->has_cpu_throttle_tailslow) {
+s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
+}
+
 if (params->has_tls_creds) {
 g_free(s->parameters.tls_creds);
 assert(params->tls_creds->type == QTYPE_QSTRING);
@@ -3588,6 +3598,8 @@ static Property migration_properties[] = {
 DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
   parameters.cpu_throttle_increment,
   DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
+DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
+  parameters.cpu_throttle_tailslow, false),
 DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
   parameters.max_bandwidth, MAX_THROTTLE),
 DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
@@ -3694,6 +3706,7 @@ static void migration_instance_init(Object *obj)
 params->has_throttle_trigger_threshold = true;
 params->has_cpu_throttle_initial = true;
 params->has_cpu_throttle_increment = true;
+params->has_cpu_throttle_tailslow = true;
 params->has_max_bandwidth = true;
 params->has_downtime_limit = true;
 params->has_x_checkpoint_delay = true;
diff --git a/migration/ram.c b/migration/ram.c
index c12cfdbe26..4b74461306 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -616,20 +616,34 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f, 
 RAMBlock *block,
  * able to complete migration. Some workloads dirty memory way too
  * fast and will not effectively converge, even with auto-converge.
  */
-static void mig_throttle_guest_down(void)
+static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
+uint64_t bytes_dirty_threshold)
 {
 MigrationState *s = migrate_get_current();
 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
-uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
+uint64_t pct_increment = s->parameters.cpu_throttle_increment;
+bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 int pct_max = s->parameters.max_cpu_throttle;
 
+uint64_t throttle_now = cpu_throttle_get_percentage();
+uint64_t cpu_now, cpu_ideal, throttle_inc;
+
 /* We have not started throttling yet. Let's start it. */
 if (!cpu_throttle_active()) {
 cpu_throttle_set(pct_initial);
 } else {
 /* Throttling already on, just increase the rate */
-cpu_thrott

[PATCH v2 2/2] migration: not require length align when choose fast dirty sync path

2020-03-10 Thread Keqian Zhu
In commit aa777e297c84 ("cpu_physical_memory_sync_dirty_bitmap: Another
alignment fix"), the ramblock length is required to be word-page aligned when
we choose the fast dirty sync path. The reason given is that "If the Ramblock
is less than 64 pages in length that long can contain bits representing
two different RAMBlocks, but the code will update the bmap belinging to
the 1st RAMBlock only while having updated the total dirty page count
for both."

That was true before commit 801110ab22be ("find_ram_offset: Align
ram_addr_t allocation on long boundaries"), which aligns ram_addr_t
allocation on long boundaries. So nowadays we won't "update the total
dirty page count for both".

By removing the length alignment constraint in the fast path, we can always
use the fast dirty sync path as long as start_global is word-page aligned.
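
As an illustrative sanity check (reasoning only, not part of the patch): with
4 KiB target pages and 64-bit longs, one bitmap word covers 256 KiB of
ram_addr_t space, and since commit 801110ab22be every RAMBlock offset is
aligned to that boundary, the trailing word of one block can no longer carry
bits that belong to the next block; only the start alignment still matters.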

Signed-off-by: Keqian Zhu 
---
 include/exec/ram_addr.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 8311efb7bc..57b3edf376 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -450,9 +450,8 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
 uint64_t num_dirty = 0;
 unsigned long *dest = rb->bmap;
 
-/* start address and length is aligned at the start of a word? */
-if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) == start_global &&
-!(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) {
+/* start address is aligned at the start of a word? */
+if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) == start_global) {
 int k;
 int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
 unsigned long * const *src;
-- 
2.19.1




[PATCH v2 1/2] memory: Introduce start_global variable in dirty bitmap sync

2020-03-10 Thread Keqian Zhu
In cpu_physical_memory_sync_dirty_bitmap(), use a start_global
variable to make the code clearer. The addr variable is only used
in the slow path, so move it there.

Signed-off-by: Keqian Zhu 
---
 include/exec/ram_addr.h | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 5e59a3d8d7..8311efb7bc 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -445,14 +445,13 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock 
*rb,
ram_addr_t length,
uint64_t *real_dirty_pages)
 {
-ram_addr_t addr;
-unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
+ram_addr_t start_global = start + rb->offset;
+unsigned long word = BIT_WORD(start_global >> TARGET_PAGE_BITS);
 uint64_t num_dirty = 0;
 unsigned long *dest = rb->bmap;
 
 /* start address and length is aligned at the start of a word? */
-if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) ==
- (start + rb->offset) &&
+if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) == start_global &&
 !(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) {
 int k;
 int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
@@ -495,11 +494,11 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock 
*rb,
 memory_region_clear_dirty_bitmap(rb->mr, start, length);
 }
 } else {
-ram_addr_t offset = rb->offset;
+ram_addr_t addr;
 
 for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
 if (cpu_physical_memory_test_and_clear_dirty(
-start + addr + offset,
+start_global + addr,
 TARGET_PAGE_SIZE,
 DIRTY_MEMORY_MIGRATION)) {
 *real_dirty_pages += 1;
-- 
2.19.1




[PATCH v2 0/2] Some optimization in dirty bitmap sync

2020-03-10 Thread Keqian Zhu
This patch series includes a code style change and a performance fix
for dirty bitmap sync.

---
changelogs:

v2:
 - Split the code style change and the performance fix.

Keqian Zhu (2):
  memory: Introduce start_global variable in dirty bitmap sync
  migration: not require length align when choose fast dirty sync path

 include/exec/ram_addr.h | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

-- 
2.19.1




[PATCH] migration: not require length align when choose fast dirty sync path

2020-03-05 Thread Keqian Zhu
In commit aa777e297c840, the ramblock length is required to be word-page
aligned when we choose the fast dirty sync path. The reason given is that "If
the Ramblock is less than 64 pages in length that long can contain bits
representing two different RAMBlocks, but the code will update the
bmap belinging to the 1st RAMBlock only while having updated the total
dirty page count for both."

That was true before commit 801110ab22be1ef2, which aligns ram_addr_t
allocation on long boundaries. So nowadays we won't "update the total dirty
page count for both".

Remove the length alignment constraint so that we can always use the fast
dirty sync path.

Signed-off-by: Keqian Zhu 
---
Cc: Paolo Bonzini 
Cc: "Dr. David Alan Gilbert" 
Cc: qemu-devel@nongnu.org
---
 include/exec/ram_addr.h | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 5e59a3d8d7..40fd89e1cd 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -445,15 +445,13 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock 
*rb,
ram_addr_t length,
uint64_t *real_dirty_pages)
 {
-ram_addr_t addr;
-unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
+ram_addr_t start_global = start + rb->offset;
+unsigned long word = BIT_WORD(start_global >> TARGET_PAGE_BITS);
 uint64_t num_dirty = 0;
 unsigned long *dest = rb->bmap;
 
-/* start address and length is aligned at the start of a word? */
-if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) ==
- (start + rb->offset) &&
-!(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) {
+/* start address is aligned at the start of a word? */
+if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) == start_global) {
 int k;
 int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
 unsigned long * const *src;
@@ -495,11 +493,10 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock 
*rb,
 memory_region_clear_dirty_bitmap(rb->mr, start, length);
 }
 } else {
-ram_addr_t offset = rb->offset;
-
+ram_addr_t addr;
 for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
 if (cpu_physical_memory_test_and_clear_dirty(
-start + addr + offset,
+start_global + addr,
 TARGET_PAGE_SIZE,
 DIRTY_MEMORY_MIGRATION)) {
 *real_dirty_pages += 1;
-- 
2.19.1




[RESEND PATCH v2] migration/throttle: Add throttle-trig-thres migration parameter

2020-02-23 Thread Keqian Zhu
Currently, if bytes_dirty_period is more than 50% of
bytes_xfer_period, we start or increase throttling.

If we make this percentage higher, then we can tolerate a higher
dirty rate during migration, which means less impact on the guest.
The side effect of a higher percentage is a longer migration time.
Making this parameter configurable lets users choose between
prioritizing migration time and prioritizing guest performance.

The default value is 50 and the valid range is 1 to 100.
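
A minimal sketch of the check this parameter tunes (illustrative only; the
variable and helper names here are assumptions, not the exact hunk):

    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    /* Start or increase throttling once the bytes dirtied in a period
       exceed the configured share of the bytes transferred in it. */
    if (bytes_dirty_period > bytes_xfer_period * threshold / 100) {
        mig_throttle_guest_down();
    }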

Signed-off-by: Keqian Zhu 
---
Changelog:

v1->v2
 -Use full name for parameter. Suggested by Eric Blake.
 -Change the upper bound of threshold to 100.
 -Extract the throttle strategy as a function.

---
Cc: Juan Quintela 
Cc: "Dr. David Alan Gilbert" 
Cc: Eric Blake 
Cc: Markus Armbruster 

---
 migration/migration.c | 24 
 migration/ram.c   | 52 +--
 monitor/hmp-cmds.c|  7 ++
 qapi/migration.json   | 16 -
 4 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 8fb68795dc..42d2d556e3 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -78,6 +78,7 @@
 /*0: means nocompress, 1: best speed, ... 9: best compress ratio */
 #define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
 /* Define default autoconverge cpu throttle migration parameters */
+#define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50
 #define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
 #define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10
 #define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99
@@ -778,6 +779,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 params->compress_wait_thread = s->parameters.compress_wait_thread;
 params->has_decompress_threads = true;
 params->decompress_threads = s->parameters.decompress_threads;
+params->has_throttle_trigger_threshold = true;
+params->throttle_trigger_threshold = 
s->parameters.throttle_trigger_threshold;
 params->has_cpu_throttle_initial = true;
 params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
 params->has_cpu_throttle_increment = true;
@@ -1164,6 +1167,15 @@ static bool migrate_params_check(MigrationParameters 
*params, Error **errp)
 return false;
 }
 
+if (params->has_throttle_trigger_threshold &&
+(params->throttle_trigger_threshold < 1 ||
+ params->throttle_trigger_threshold > 100)) {
+error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+   "throttle_trigger_threshold",
+   "an integer in the range of 1 to 100");
+return false;
+}
+
 if (params->has_cpu_throttle_initial &&
 (params->cpu_throttle_initial < 1 ||
  params->cpu_throttle_initial > 99)) {
@@ -1279,6 +1291,10 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->decompress_threads = params->decompress_threads;
 }
 
+if (params->has_throttle_trigger_threshold) {
+dest->throttle_trigger_threshold = params->throttle_trigger_threshold;
+}
+
 if (params->has_cpu_throttle_initial) {
 dest->cpu_throttle_initial = params->cpu_throttle_initial;
 }
@@ -1360,6 +1376,10 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 s->parameters.decompress_threads = params->decompress_threads;
 }
 
+if (params->has_throttle_trigger_threshold) {
+s->parameters.throttle_trigger_threshold = 
params->throttle_trigger_threshold;
+}
+
 if (params->has_cpu_throttle_initial) {
 s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
 }
@@ -3506,6 +3526,9 @@ static Property migration_properties[] = {
 DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
   parameters.decompress_threads,
   DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
+DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
+  parameters.throttle_trigger_threshold,
+  DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD),
 DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
   parameters.cpu_throttle_initial,
   DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
@@ -3606,6 +3629,7 @@ static void migration_instance_init(Object *obj)
 params->has_compress_level = true;
 params->has_compress_threads = true;
 params->has_decompress_threads = true;
+params->has_throttle_trigger_threshold = true;
 params->has_cpu_throttle_initial = true;
 params->has_cpu_throttle_increment = true;
 params->has_max_bandwidth = true;
diff --git a/migration/ram.c b/migration/ram.c
index ed23ed1c7c..3a38253903 100644
--- a/migration/ram.c
+++ b/migra

[PATCH] migration/throttle: Add throttle-trig-thres migration parameter

2020-02-20 Thread Keqian Zhu
Currently, if bytes_dirty_period is more than 50% of
bytes_xfer_period, we start or increase throttling.

If we make this percentage higher, we can tolerate a higher
dirty rate during migration, which means less impact on the guest.
The side effect of a higher percentage is a longer migration time.

We can configure this parameter to choose between migration time
first and guest performance first. The default value is 50.
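
A minimal sketch of the check described above, reusing the variable names from
the ram.c hunk below (which is cut off in this archive); the helper name and
its exact placement are illustrative, not part of the patch:

static bool throttle_should_trigger(MigrationState *s,
                                    uint64_t bytes_dirty_period,
                                    uint64_t bytes_xfer_period)
{
    /* Percentage of the bytes transferred in a sync period that may be
     * re-dirtied before auto-converge starts (or increases) throttling. */
    uint64_t throttle_trig_thres = s->parameters.throttle_trig_thres;
    uint64_t bytes_dirty_thres = bytes_xfer_period * throttle_trig_thres / 100;

    return bytes_dirty_period > bytes_dirty_thres;
}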

Signed-off-by: Keqian Zhu 
---
Cc: Juan Quintela 
Cc: "Dr. David Alan Gilbert" 
Cc: Eric Blake 
Cc: Markus Armbruster 
---
 migration/migration.c | 24 
 migration/ram.c   | 21 +++--
 monitor/hmp-cmds.c|  7 +++
 qapi/migration.json   | 16 +++-
 4 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 8fb68795dc..e6c2451734 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -78,6 +78,7 @@
 /*0: means nocompress, 1: best speed, ... 9: best compress ratio */
 #define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
 /* Define default autoconverge cpu throttle migration parameters */
+#define DEFAULT_MIGRATE_THROTTLE_TRIG_THRES 50
 #define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
 #define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10
 #define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99
@@ -778,6 +779,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 params->compress_wait_thread = s->parameters.compress_wait_thread;
 params->has_decompress_threads = true;
 params->decompress_threads = s->parameters.decompress_threads;
+params->has_throttle_trig_thres = true;
+params->throttle_trig_thres = s->parameters.throttle_trig_thres;
 params->has_cpu_throttle_initial = true;
 params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
 params->has_cpu_throttle_increment = true;
@@ -1164,6 +1167,15 @@ static bool migrate_params_check(MigrationParameters 
*params, Error **errp)
 return false;
 }
 
+if (params->has_throttle_trig_thres &&
+(params->throttle_trig_thres < 1 ||
+ params->throttle_trig_thres > 99)) {
+error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+   "throttle_trig_thres",
+   "an integer in the range of 1 to 99");
+return false;
+}
+
 if (params->has_cpu_throttle_initial &&
 (params->cpu_throttle_initial < 1 ||
  params->cpu_throttle_initial > 99)) {
@@ -1279,6 +1291,10 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->decompress_threads = params->decompress_threads;
 }
 
+if (params->has_throttle_trig_thres) {
+dest->throttle_trig_thres = params->throttle_trig_thres;
+}
+
 if (params->has_cpu_throttle_initial) {
 dest->cpu_throttle_initial = params->cpu_throttle_initial;
 }
@@ -1360,6 +1376,10 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 s->parameters.decompress_threads = params->decompress_threads;
 }
 
+if (params->has_throttle_trig_thres) {
+s->parameters.throttle_trig_thres = params->throttle_trig_thres;
+}
+
 if (params->has_cpu_throttle_initial) {
 s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
 }
@@ -3506,6 +3526,9 @@ static Property migration_properties[] = {
 DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
   parameters.decompress_threads,
   DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
+DEFINE_PROP_UINT8("x-throttle-trig-thres", MigrationState,
+  parameters.throttle_trig_thres,
+  DEFAULT_MIGRATE_THROTTLE_TRIG_THRES),
 DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
   parameters.cpu_throttle_initial,
   DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
@@ -3606,6 +3629,7 @@ static void migration_instance_init(Object *obj)
 params->has_compress_level = true;
 params->has_compress_threads = true;
 params->has_decompress_threads = true;
+params->has_throttle_trig_thres = true;
 params->has_cpu_throttle_initial = true;
 params->has_cpu_throttle_increment = true;
 params->has_max_bandwidth = true;
diff --git a/migration/ram.c b/migration/ram.c
index ed23ed1c7c..28081cb1e1 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -901,6 +901,11 @@ static void migration_bitmap_sync(RAMState *rs)
 RAMBlock *block;
 int64_t end_time;
 uint64_t bytes_xfer_now;
+uint64_t bytes_dirty_period;
+uint64_t bytes_xfer_period;
+uint64_t bytes_dirty_thres;
+uint64_t throttle_trig_thres;
+MigrationState *s = migrate_get_current();
 
 ra

[PATCH] migration/throttle: Make throttle slower at tail stage

2020-02-13 Thread Keqian Zhu
At the tail stage of throttling, the VM is very sensitive to
CPU percentage. We throttle only 30% of the remaining CPU
headroom per step once the throttle percentage reaches 80%.

This doesn't conflict with cpu_throttle_increment.

This may make migration time longer, and is disabled
by default.
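
For instance, with cpu_throttle_increment at its default of 10 and the
throttle already at 85%, the tail-slow path adds only
MIN(ceil((100 - 85) * 0.3), 10) = 5, moving the throttle to 90% instead of
95%; below 80% the behaviour is unchanged.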

Signed-off-by: Keqian Zhu 
---
Cc: Juan Quintela 
Cc: "Dr. David Alan Gilbert" 
Cc: Eric Blake 
Cc: Markus Armbruster 
---
 migration/migration.c | 13 +
 migration/ram.c   | 18 --
 monitor/hmp-cmds.c|  4 
 qapi/migration.json   | 21 +
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 3a21a4686c..37b569cee9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -782,6 +782,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
 params->has_cpu_throttle_increment = true;
 params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
+params->has_cpu_throttle_tailslow = true;
+params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
 params->has_tls_creds = true;
 params->tls_creds = g_strdup(s->parameters.tls_creds);
 params->has_tls_hostname = true;
@@ -1287,6 +1289,10 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->cpu_throttle_increment = params->cpu_throttle_increment;
 }
 
+if (params->has_cpu_throttle_tailslow) {
+dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
+}
+
 if (params->has_tls_creds) {
 assert(params->tls_creds->type == QTYPE_QSTRING);
 dest->tls_creds = g_strdup(params->tls_creds->u.s);
@@ -1368,6 +1374,10 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
 }
 
+if (params->has_cpu_throttle_tailslow) {
+s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
+}
+
 if (params->has_tls_creds) {
 g_free(s->parameters.tls_creds);
 assert(params->tls_creds->type == QTYPE_QSTRING);
@@ -3504,6 +3514,8 @@ static Property migration_properties[] = {
 DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
   parameters.cpu_throttle_increment,
   DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
+DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
+  parameters.cpu_throttle_tailslow, false),
 DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
   parameters.max_bandwidth, MAX_THROTTLE),
 DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
@@ -3600,6 +3612,7 @@ static void migration_instance_init(Object *obj)
 params->has_decompress_threads = true;
 params->has_cpu_throttle_initial = true;
 params->has_cpu_throttle_increment = true;
+params->has_cpu_throttle_tailslow = true;
 params->has_max_bandwidth = true;
 params->has_downtime_limit = true;
 params->has_x_checkpoint_delay = true;
diff --git a/migration/ram.c b/migration/ram.c
index ed23ed1c7c..d86413a5d1 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -29,6 +29,7 @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include <zlib.h>
+#include <math.h>
 #include "qemu/cutils.h"
 #include "qemu/bitops.h"
 #include "qemu/bitmap.h"
@@ -620,15 +621,28 @@ static void mig_throttle_guest_down(void)
 {
 MigrationState *s = migrate_get_current();
 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
-uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
+uint64_t pct_increment = s->parameters.cpu_throttle_increment;
+bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 int pct_max = s->parameters.max_cpu_throttle;
 
+const uint64_t cpu_throttle_now = cpu_throttle_get_percentage();
+uint64_t cpu_throttle_inc;
+
 /* We have not started throttling yet. Let's start it. */
 if (!cpu_throttle_active()) {
 cpu_throttle_set(pct_initial);
 } else {
 /* Throttling already on, just increase the rate */
-cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
+if (cpu_throttle_now >= 80 && pct_tailslow) {
+/* Eat some scale of CPU from remaining */
+cpu_throttle_inc = ceil((100 - cpu_throttle_now) * 0.3);
+if (cpu_throttle_inc > pct_increment) {
+cpu_throttle_inc = pct_increment;
+}
+} else {
+cpu_throttle_inc = pct_increment;
+}
+cpu_throttle_set(MIN(cpu_throttle_now + cpu_throttle_inc,
  p

[PATCH] migration: Optimization about wait-unplug migration state

2020-02-03 Thread Keqian Zhu
qemu_savevm_nr_failover_devices() was originally designed to
return the number of failover devices, but it actually returns
the number of "unplug-pending" failover devices now. Moreover,
what should drive the migration state to wait-unplug is the
number of "unplug-pending" failover devices, not the number of
all failover devices.

We can also see that qemu_savevm_state_guest_unplug_pending()
and qemu_savevm_nr_failover_devices() are almost equivalent at
the code level, so the latter is semantically misleading and
redundant; just delete it.

In qemu_savevm_state_guest_unplug_pending(), once an
unplug-pending failover device is found, we can return true
immediately to save CPU time.
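
For reference, a device takes part in this mechanism through the
dev_unplug_pending hook of its VMStateDescription; the device type and field
names below are hypothetical and only sketch how the loop in
qemu_savevm_state_guest_unplug_pending() finds an "unplug-pending" device:

/* Hypothetical failover-capable device state, for illustration only. */
typedef struct DemoFailoverState {
    bool unplug_requested;   /* guest has been asked to unplug the device */
    bool unplug_done;        /* guest has acknowledged the unplug         */
} DemoFailoverState;

static bool demo_dev_unplug_pending(void *opaque)
{
    DemoFailoverState *s = opaque;

    /* Migration stays in wait-unplug while this returns true. */
    return s->unplug_requested && !s->unplug_done;
}

static const VMStateDescription vmstate_demo_failover = {
    .name = "demo-failover",
    .version_id = 1,
    .minimum_version_id = 1,
    .dev_unplug_pending = demo_dev_unplug_pending,
    .fields = (VMStateField[]) {
        VMSTATE_END_OF_LIST()
    },
};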

Signed-off-by: Keqian Zhu 
---
Cc: jfreim...@redhat.com
Cc: Juan Quintela 
Cc: "Dr. David Alan Gilbert" 
---
 migration/migration.c |  2 +-
 migration/savevm.c| 24 +++-
 migration/savevm.h|  1 -
 3 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 3a21a4686c..deedc968cf 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -,7 +,7 @@ static void *migration_thread(void *opaque)
 
 qemu_savevm_state_setup(s->to_dst_file);
 
-if (qemu_savevm_nr_failover_devices()) {
+if (qemu_savevm_state_guest_unplug_pending()) {
 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
   MIGRATION_STATUS_WAIT_UNPLUG);
 
diff --git a/migration/savevm.c b/migration/savevm.c
index f19cb9ec7a..1d4220ece8 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1140,36 +1140,18 @@ void qemu_savevm_state_header(QEMUFile *f)
 }
 }
 
-int qemu_savevm_nr_failover_devices(void)
+bool qemu_savevm_state_guest_unplug_pending(void)
 {
 SaveStateEntry *se;
-int n = 0;
 
 QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 if (se->vmsd && se->vmsd->dev_unplug_pending &&
 se->vmsd->dev_unplug_pending(se->opaque)) {
-n++;
-}
-}
-
-return n;
-}
-
-bool qemu_savevm_state_guest_unplug_pending(void)
-{
-SaveStateEntry *se;
-int n = 0;
-
-QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
-if (!se->vmsd || !se->vmsd->dev_unplug_pending) {
-continue;
-}
-if (se->vmsd->dev_unplug_pending(se->opaque)) {
-n++;
+return true;
 }
 }
 
-return n > 0;
+return false;
 }
 
 void qemu_savevm_state_setup(QEMUFile *f)
diff --git a/migration/savevm.h b/migration/savevm.h
index c42b9c80ee..ba64a7e271 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -31,7 +31,6 @@
 
 bool qemu_savevm_state_blocked(Error **errp);
 void qemu_savevm_state_setup(QEMUFile *f);
-int qemu_savevm_nr_failover_devices(void);
 bool qemu_savevm_state_guest_unplug_pending(void);
 int qemu_savevm_state_resume_prepare(MigrationState *s);
 void qemu_savevm_state_header(QEMUFile *f);
-- 
2.19.1




[PATCH v3 2/2] hw/arm: Use helper function to trigger hotplug handler plug

2020-01-19 Thread Keqian Zhu
We can use existing helper function to trigger hotplug handler
plug, which makes code clearer.

Reviewed-by: Igor Mammedov 
Signed-off-by: Keqian Zhu 
---
 hw/arm/virt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 39ab5f47e0..656b0081c2 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1934,7 +1934,6 @@ static void virt_memory_pre_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 static void virt_memory_plug(HotplugHandler *hotplug_dev,
  DeviceState *dev, Error **errp)
 {
-HotplugHandlerClass *hhc;
 VirtMachineState *vms = VIRT_MACHINE(hotplug_dev);
 Error *local_err = NULL;
 
@@ -1943,8 +1942,9 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
 goto out;
 }
 
-hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev);
-hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &error_abort);
+hotplug_handler_plug(HOTPLUG_HANDLER(vms->acpi_dev),
+ dev, &error_abort);
+
 out:
 error_propagate(errp, local_err);
 }
-- 
2.19.1




[PATCH v3 1/2] hw/acpi: Remove extra indent in ACPI GED hotplug cb

2020-01-19 Thread Keqian Zhu
There is extra indent in ACPI GED hotplug cb that should be
deleted.

Reviewed-by: Igor Mammedov 
Signed-off-by: Keqian Zhu 
---
 hw/acpi/generic_event_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 9cee90cc70..55eb29d80a 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -175,7 +175,7 @@ static void acpi_ged_device_plug_cb(HotplugHandler 
*hotplug_dev,
 AcpiGedState *s = ACPI_GED(hotplug_dev);
 
 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
+acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
 } else {
 error_setg(errp, "virt: device plug request for unsupported device"
" type: %s", object_get_typename(OBJECT(dev)));
-- 
2.19.1




[PATCH v3 0/2] Adjust some codes about memory hotplug

2020-01-19 Thread Keqian Zhu
This removes extra indent and makes some code refactor related to
memory hotplug.

Changelog:

v2 -> v3
 - Addressed Peter's comments.

v1 -> v2
 - Add Igor's R-b.

Keqian Zhu (2):
  hw/acpi: Remove extra indent in ACPI GED hotplug cb
  hw/arm: Use helper function to trigger hotplug handler plug

 hw/acpi/generic_event_device.c | 2 +-
 hw/arm/virt.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

-- 
2.19.1




[PATCH v2] hw/arm: Adjust some coding styles about memory hotplug

2020-01-16 Thread Keqian Zhu
From: zhukeqian 

There is extra indent in ACPI GED plug cb. And we can use
existing helper function to trigger hotplug handler plug.

Reviewed-by: Igor Mammedov 
Signed-off-by: Keqian Zhu 
---

v1->v2:
 - Add Igor's R-b

Cc: Shameer Kolothum 
Cc: "Michael S. Tsirkin" 
Cc: Igor Mammedov 
Cc: Peter Maydell 
---
 hw/acpi/generic_event_device.c | 2 +-
 hw/arm/virt.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 9cee90cc70..55eb29d80a 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -175,7 +175,7 @@ static void acpi_ged_device_plug_cb(HotplugHandler 
*hotplug_dev,
 AcpiGedState *s = ACPI_GED(hotplug_dev);
 
 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
+acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
 } else {
 error_setg(errp, "virt: device plug request for unsupported device"
" type: %s", object_get_typename(OBJECT(dev)));
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 39ab5f47e0..656b0081c2 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1934,7 +1934,6 @@ static void virt_memory_pre_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 static void virt_memory_plug(HotplugHandler *hotplug_dev,
  DeviceState *dev, Error **errp)
 {
-HotplugHandlerClass *hhc;
 VirtMachineState *vms = VIRT_MACHINE(hotplug_dev);
 Error *local_err = NULL;
 
@@ -1943,8 +1942,9 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
 goto out;
 }
 
-hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev);
-hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &error_abort);
+hotplug_handler_plug(HOTPLUG_HANDLER(vms->acpi_dev),
+ dev, &error_abort);
+
 out:
 error_propagate(errp, local_err);
 }
-- 
2.19.1




[PATCH] hw/arm: Adjust some coding styles about memory hotplug

2020-01-16 Thread Keqian Zhu
From: zhukeqian 

There is extra indent in ACPI GED plug cb. And we can use
existing helper function to trigger hotplug handler plug.

Signed-off-by: Keqian Zhu 
Cc: Shameer Kolothum 
Cc: "Michael S. Tsirkin" 
Cc: Igor Mammedov 
---
 hw/acpi/generic_event_device.c | 2 +-
 hw/arm/virt.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 9cee90cc70..55eb29d80a 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -175,7 +175,7 @@ static void acpi_ged_device_plug_cb(HotplugHandler 
*hotplug_dev,
 AcpiGedState *s = ACPI_GED(hotplug_dev);
 
 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
-acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
+acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
 } else {
 error_setg(errp, "virt: device plug request for unsupported device"
" type: %s", object_get_typename(OBJECT(dev)));
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 39ab5f47e0..656b0081c2 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1934,7 +1934,6 @@ static void virt_memory_pre_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 static void virt_memory_plug(HotplugHandler *hotplug_dev,
  DeviceState *dev, Error **errp)
 {
-HotplugHandlerClass *hhc;
 VirtMachineState *vms = VIRT_MACHINE(hotplug_dev);
 Error *local_err = NULL;
 
@@ -1943,8 +1942,9 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
 goto out;
 }
 
-hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev);
-hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &error_abort);
+hotplug_handler_plug(HOTPLUG_HANDLER(vms->acpi_dev),
+ dev, &error_abort);
+
 out:
 error_propagate(errp, local_err);
 }
-- 
2.19.1