Re: [PATCH v5 10/10] KVM: Dirty ring support

2021-03-22 Thread Keqian Zhu



On 2021/3/23 2:52, Peter Xu wrote:
> On Mon, Mar 22, 2021 at 09:37:19PM +0800, Keqian Zhu wrote:
>>> +/* Should be with all slots_lock held for the address spaces. */
>>> +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
>>> + uint32_t slot_id, uint64_t offset)
>>> +{
>>> +KVMMemoryListener *kml;
>>> +KVMSlot *mem;
>>> +
>>> +if (as_id >= s->nr_as) {
>>> +return;
>>> +}
>>> +
>>> +kml = s->as[as_id].ml;
>>> +mem = &kml->slots[slot_id];
>>> +
>>> +if (!mem->memory_size || offset >= (mem->memory_size / 
>>> TARGET_PAGE_SIZE)) {
>> It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.
> 
> Fixed.
> 
> [...]
> 
>>> +/*
>>> + * Flush all the existing dirty pages to the KVM slot buffers.  When
>>> + * this call returns, we guarantee that all the touched dirty pages
>>> + * before calling this function have been put into the per-kvmslot
>>> + * dirty bitmap.
>>> + *
>>> + * This function must be called with BQL held.
>>> + */
>>> +static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
>> The argument is not used.
> 
> Indeed, removed.
> 
>>
>>> +{
>>> +trace_kvm_dirty_ring_flush(0);
>>> +/*
>>> + * The function needs to be serialized.  Since this function
>>> + * should always be with BQL held, serialization is guaranteed.
>>> + * However, let's be sure of it.
>>> + */
>>> +assert(qemu_mutex_iothread_locked());
>>> +/*
>>> + * First make sure to flush the hardware buffers by kicking all
>>> + * vcpus out in a synchronous way.
>>> + */
>>> +kvm_cpu_synchronize_kick_all();
>> Can we make this function to be architecture specific?
>> It seems that kick out vCPU is an architecture specific way to flush 
>> hardware buffers
>> to dirty ring (for x86 PML).
> 
> I can do that, but I'd say it's kind of an overkill if after all the kernel
> support is not there yet, so I still tend to make it as simple as possible.
OK.

> 
> [...]
> 
>>> +static void *kvm_dirty_ring_reaper_thread(void *data)
>>> +{
>>> +KVMState *s = data;
>>> +struct KVMDirtyRingReaper *r = &s->reaper;
>>> +
>>> +rcu_register_thread();
>>> +
>>> +trace_kvm_dirty_ring_reaper("init");
>>> +
>>> +while (true) {
>>> +r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
>>> +trace_kvm_dirty_ring_reaper("wait");
>>> +/*
>>> + * TODO: provide a smarter timeout rather than a constant?
>>> + */
>>> +sleep(1);
>>> +
>>> +trace_kvm_dirty_ring_reaper("wakeup");
>>> +r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
>>> +
>>> +qemu_mutex_lock_iothread();
>>> +kvm_dirty_ring_reap(s);
>>> +qemu_mutex_unlock_iothread();
>>> +
>>> +r->reaper_iteration++;
>>> +}
>> I don't know when does this iteration exit?
>> And I see that we start this reaper_thread in kvm_init(), maybe it's better 
>> to start it
>> when start dirty log and stop it when stop dirty log.
> 
> Yes we can make it conditional, but note that we can't hook at functions like
> memory_global_dirty_log_start() because that is only for migration purpose.
> 
> Currently QEMU exports the dirty tracking more than that, e.g., to the VGA
> code.  We'll need to try to detect whether there's any existing MR got its
> mr->dirty_log_mask set (besides global_dirty_log being set).  When all of them
> got cleared we'll need to detect too so as to turn the thread off.
> 
> It's just easier to me to run this thread with such a timeout, then when not
> necessary it'll see empty ring and return fast (index comparison for each
> ring).  Not to mention the VGA dirty tracking should be on for most of the VM
> lifecycle, so even if we have that knob this thread will probably be running
> for 99% of the time as long as any MR has its DIRTY_MEMORY_VGA bit set.
Make sense. Thanks for your explanation!

Thanks,
Keqian
> 
> [...]
> 
>>> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
>>> index c68bc3ba8af..2f0991d93f7 100644
>>> --- a/include/hw/core/cpu.h
>>> +++ b/include/hw/core/cpu.h
>>> @@ -323,6 +323,11 @@ struct qemu_work_item;
>>>   * @ignore_memory_transaction_failures: Cached copy of the MachineState
>>>   *flag of the same name: allows the board to suppress calling of the
>>>   *CPU do_transaction_failed hook function.
>>> + * @kvm_dirty_ring_full:
>>> + *   Whether the kvm dirty ring of this vcpu is soft-full.
>>> + * @kvm_dirty_ring_avail:
>>> + *   Semaphore to be posted when the kvm dirty ring of the vcpu is
>>> + *   available again.
>> The doc does not match code.
> 
> Right; fixed.
> 
> Thanks for taking a look, keqian.
> 



Re: [PATCH v5 10/10] KVM: Dirty ring support

2021-03-22 Thread Peter Xu
On Mon, Mar 22, 2021 at 09:37:19PM +0800, Keqian Zhu wrote:
> > +/* Should be with all slots_lock held for the address spaces. */
> > +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
> > + uint32_t slot_id, uint64_t offset)
> > +{
> > +KVMMemoryListener *kml;
> > +KVMSlot *mem;
> > +
> > +if (as_id >= s->nr_as) {
> > +return;
> > +}
> > +
> > +kml = s->as[as_id].ml;
> > +mem = &kml->slots[slot_id];
> > +
> > +if (!mem->memory_size || offset >= (mem->memory_size / 
> > TARGET_PAGE_SIZE)) {
> It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.

Fixed.

[...]

> > +/*
> > + * Flush all the existing dirty pages to the KVM slot buffers.  When
> > + * this call returns, we guarantee that all the touched dirty pages
> > + * before calling this function have been put into the per-kvmslot
> > + * dirty bitmap.
> > + *
> > + * This function must be called with BQL held.
> > + */
> > +static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
> The argument is not used.

Indeed, removed.

> 
> > +{
> > +trace_kvm_dirty_ring_flush(0);
> > +/*
> > + * The function needs to be serialized.  Since this function
> > + * should always be with BQL held, serialization is guaranteed.
> > + * However, let's be sure of it.
> > + */
> > +assert(qemu_mutex_iothread_locked());
> > +/*
> > + * First make sure to flush the hardware buffers by kicking all
> > + * vcpus out in a synchronous way.
> > + */
> > +kvm_cpu_synchronize_kick_all();
> Can we make this function to be architecture specific?
> It seems that kick out vCPU is an architecture specific way to flush hardware 
> buffers
> to dirty ring (for x86 PML).

I can do that, but I'd say it's kind of an overkill if after all the kernel
support is not there yet, so I still tend to make it as simple as possible.

[...]

> > +static void *kvm_dirty_ring_reaper_thread(void *data)
> > +{
> > +KVMState *s = data;
> > +struct KVMDirtyRingReaper *r = &s->reaper;
> > +
> > +rcu_register_thread();
> > +
> > +trace_kvm_dirty_ring_reaper("init");
> > +
> > +while (true) {
> > +r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
> > +trace_kvm_dirty_ring_reaper("wait");
> > +/*
> > + * TODO: provide a smarter timeout rather than a constant?
> > + */
> > +sleep(1);
> > +
> > +trace_kvm_dirty_ring_reaper("wakeup");
> > +r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
> > +
> > +qemu_mutex_lock_iothread();
> > +kvm_dirty_ring_reap(s);
> > +qemu_mutex_unlock_iothread();
> > +
> > +r->reaper_iteration++;
> > +}
> I don't know when does this iteration exit?
> And I see that we start this reaper_thread in kvm_init(), maybe it's better 
> to start it
> when start dirty log and stop it when stop dirty log.

Yes we can make it conditional, but note that we can't hook at functions like
memory_global_dirty_log_start() because that is only for migration purpose.

Currently QEMU exports the dirty tracking more than that, e.g., to the VGA
code.  We'll need to try to detect whether there's any existing MR got its
mr->dirty_log_mask set (besides global_dirty_log being set).  When all of them
got cleared we'll need to detect too so as to turn the thread off.

It's just easier to me to run this thread with such a timeout, then when not
necessary it'll see empty ring and return fast (index comparison for each
ring).  Not to mention the VGA dirty tracking should be on for most of the VM
lifecycle, so even if we have that knob this thread will probably be running
for 99% of the time as long as any MR has its DIRTY_MEMORY_VGA bit set.

[...]

> > diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> > index c68bc3ba8af..2f0991d93f7 100644
> > --- a/include/hw/core/cpu.h
> > +++ b/include/hw/core/cpu.h
> > @@ -323,6 +323,11 @@ struct qemu_work_item;
> >   * @ignore_memory_transaction_failures: Cached copy of the MachineState
> >   *flag of the same name: allows the board to suppress calling of the
> >   *CPU do_transaction_failed hook function.
> > + * @kvm_dirty_ring_full:
> > + *   Whether the kvm dirty ring of this vcpu is soft-full.
> > + * @kvm_dirty_ring_avail:
> > + *   Semaphore to be posted when the kvm dirty ring of the vcpu is
> > + *   available again.
> The doc does not match code.

Right; fixed.

Thanks for taking a look, keqian.

-- 
Peter Xu




Re: [PATCH v5 10/10] KVM: Dirty ring support

2021-03-22 Thread Keqian Zhu



On 2021/3/11 4:33, Peter Xu wrote:
> KVM dirty ring is a new interface to pass over dirty bits from kernel to the
> userspace.  Instead of using a bitmap for each memory region, the dirty ring
> contains an array of dirtied GPAs to fetch (in the form of offset in slots).
> For each vcpu there will be one dirty ring that binds to it.
> 
> kvm_dirty_ring_reap() is the major function to collect dirty rings.  It can be
> called either by a standalone reaper thread that runs in the background,
> collecting dirty pages for the whole VM.  It can also be called directly by 
> any
> thread that has BQL taken.
> 
> Signed-off-by: Peter Xu 
> ---
>  accel/kvm/kvm-all.c| 332 -
>  accel/kvm/trace-events |   7 +
>  include/hw/core/cpu.h  |   8 +
>  3 files changed, 344 insertions(+), 3 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index ae9393266b2..bf2b21f038b 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -15,6 +15,7 @@
>  
>  #include "qemu/osdep.h"
>  #include <sys/ioctl.h>
> +#include <sys/mman.h>
>  
>  #include <linux/kvm.h>
>  
> @@ -80,6 +81,25 @@ struct KVMParkedVcpu {
>  QLIST_ENTRY(KVMParkedVcpu) node;
>  };
>  
> +enum KVMDirtyRingReaperState {
> +KVM_DIRTY_RING_REAPER_NONE = 0,
> +/* The reaper is sleeping */
> +KVM_DIRTY_RING_REAPER_WAIT,
> +/* The reaper is reaping for dirty pages */
> +KVM_DIRTY_RING_REAPER_REAPING,
> +};
> +
> +/*
> + * KVM reaper instance, responsible for collecting the KVM dirty bits
> + * via the dirty ring.
> + */
> +struct KVMDirtyRingReaper {
> +/* The reaper thread */
> +QemuThread reaper_thr;
> +volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
> +volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
> +};
> +
>  struct KVMState
>  {
>  AccelState parent_obj;
> @@ -131,6 +151,7 @@ struct KVMState
>  bool kvm_dirty_ring_enabled;/* Whether KVM dirty ring is enabled */
>  uint64_t kvm_dirty_ring_size;   /* Size of the per-vcpu dirty ring */
>  uint32_t kvm_dirty_gfn_count;   /* Number of dirty GFNs per ring */
> +struct KVMDirtyRingReaper reaper;
>  };
>  
>  KVMState *kvm_state;
> @@ -392,6 +413,13 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
>  goto err;
>  }
>  
> +if (cpu->kvm_dirty_gfns) {
> +ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
> +if (ret < 0) {
> +goto err;
> +}
> +}
> +
>  vcpu = g_malloc0(sizeof(*vcpu));
>  vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
>  vcpu->kvm_fd = cpu->kvm_fd;
> @@ -468,6 +496,19 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>  (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
>  }
>  
> +if (s->kvm_dirty_ring_enabled) {
> +/* Use MAP_SHARED to share pages with the kernel */
> +cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
> +   PROT_READ | PROT_WRITE, MAP_SHARED,
> +   cpu->kvm_fd,
> +   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
> +if (cpu->kvm_dirty_gfns == MAP_FAILED) {
> +ret = -errno;
> +DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
> +goto err;
> +}
> +}
> +
>  ret = kvm_arch_init_vcpu(cpu);
>  if (ret < 0) {
>  error_setg_errno(errp, -ret,
> @@ -586,6 +627,11 @@ static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
>  cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
>  }
>  
> +static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
> +{
> +memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
> +}
> +
>  #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
>  
>  /* Allocate the dirty bitmap for a slot  */
> @@ -642,6 +688,170 @@ static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot 
> *slot)
>  return ret == 0;
>  }
>  
> +/* Should be with all slots_lock held for the address spaces. */
> +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
> + uint32_t slot_id, uint64_t offset)
> +{
> +KVMMemoryListener *kml;
> +KVMSlot *mem;
> +
> +if (as_id >= s->nr_as) {
> +return;
> +}
> +
> +kml = s->as[as_id].ml;
> > +mem = &kml->slots[slot_id];
> +
> +if (!mem->memory_size || offset >= (mem->memory_size / 
> TARGET_PAGE_SIZE)) {
It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.


> +return;
> +}
> +
> +set_bit(offset, mem->dirty_bmap);
> +}
> +
> +static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
> +{
> +return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
> +}
> +
> +static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
> +{
> +gfn->flags = KVM_DIRTY_GFN_F_RESET;
> +}
> +
> +/*
> + * Should be with all slots_lock held for the address spaces.  It returns the
> + * dirty page we've collected on this dirty ring.
> + */
> 

[PATCH v5 10/10] KVM: Dirty ring support

2021-03-10 Thread Peter Xu
KVM dirty ring is a new interface to pass over dirty bits from kernel to the
userspace.  Instead of using a bitmap for each memory region, the dirty ring
contains an array of dirtied GPAs to fetch (in the form of offset in slots).
For each vcpu there will be one dirty ring that binds to it.

kvm_dirty_ring_reap() is the major function to collect dirty rings.  It can be
called either by a standalone reaper thread that runs in the background,
collecting dirty pages for the whole VM.  It can also be called directly by any
thread that has BQL taken.

Signed-off-by: Peter Xu 
---
 accel/kvm/kvm-all.c| 332 -
 accel/kvm/trace-events |   7 +
 include/hw/core/cpu.h  |   8 +
 3 files changed, 344 insertions(+), 3 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ae9393266b2..bf2b21f038b 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -15,6 +15,7 @@
 
 #include "qemu/osdep.h"
 #include <sys/ioctl.h>
+#include <sys/mman.h>
 
 #include <linux/kvm.h>
 
@@ -80,6 +81,25 @@ struct KVMParkedVcpu {
 QLIST_ENTRY(KVMParkedVcpu) node;
 };
 
+enum KVMDirtyRingReaperState {
+KVM_DIRTY_RING_REAPER_NONE = 0,
+/* The reaper is sleeping */
+KVM_DIRTY_RING_REAPER_WAIT,
+/* The reaper is reaping for dirty pages */
+KVM_DIRTY_RING_REAPER_REAPING,
+};
+
+/*
+ * KVM reaper instance, responsible for collecting the KVM dirty bits
+ * via the dirty ring.
+ */
+struct KVMDirtyRingReaper {
+/* The reaper thread */
+QemuThread reaper_thr;
+volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
+volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
+};
+
 struct KVMState
 {
 AccelState parent_obj;
@@ -131,6 +151,7 @@ struct KVMState
 bool kvm_dirty_ring_enabled;/* Whether KVM dirty ring is enabled */
 uint64_t kvm_dirty_ring_size;   /* Size of the per-vcpu dirty ring */
 uint32_t kvm_dirty_gfn_count;   /* Number of dirty GFNs per ring */
+struct KVMDirtyRingReaper reaper;
 };
 
 KVMState *kvm_state;
@@ -392,6 +413,13 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
 goto err;
 }
 
+if (cpu->kvm_dirty_gfns) {
+ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
+if (ret < 0) {
+goto err;
+}
+}
+
 vcpu = g_malloc0(sizeof(*vcpu));
 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
 vcpu->kvm_fd = cpu->kvm_fd;
@@ -468,6 +496,19 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
 }
 
+if (s->kvm_dirty_ring_enabled) {
+/* Use MAP_SHARED to share pages with the kernel */
+cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
+   PROT_READ | PROT_WRITE, MAP_SHARED,
+   cpu->kvm_fd,
+   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
+if (cpu->kvm_dirty_gfns == MAP_FAILED) {
+ret = -errno;
+DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
+goto err;
+}
+}
+
 ret = kvm_arch_init_vcpu(cpu);
 if (ret < 0) {
 error_setg_errno(errp, -ret,
@@ -586,6 +627,11 @@ static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
 cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
 }
 
+static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
+{
+memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
+}
+
 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 
 /* Allocate the dirty bitmap for a slot  */
@@ -642,6 +688,170 @@ static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot 
*slot)
 return ret == 0;
 }
 
+/* Should be with all slots_lock held for the address spaces. */
+static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
+ uint32_t slot_id, uint64_t offset)
+{
+KVMMemoryListener *kml;
+KVMSlot *mem;
+
+if (as_id >= s->nr_as) {
+return;
+}
+
+kml = s->as[as_id].ml;
+mem = &kml->slots[slot_id];
+
+if (!mem->memory_size || offset >= (mem->memory_size / TARGET_PAGE_SIZE)) {
+return;
+}
+
+set_bit(offset, mem->dirty_bmap);
+}
+
+static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
+{
+return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
+}
+
+static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
+{
+gfn->flags = KVM_DIRTY_GFN_F_RESET;
+}
+
+/*
+ * Should be with all slots_lock held for the address spaces.  It returns the
+ * dirty page we've collected on this dirty ring.
+ */
+static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
+{
+struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
+uint32_t gfn_count = s->kvm_dirty_gfn_count;
+uint32_t count = 0, fetch = cpu->kvm_fetch_index;
+
+assert(dirty_gfns && gfn_count);
+trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
+
+while (true) {
+cur = &dirty_gfns[fetch % gfn_count];
+if