On Thu, Nov 26, 2020 at 06:17:31PM +0300, Andrey Gruzdev wrote: > In this particular implementation the same single migration > thread is responsible for both normal linear dirty page > migration and procesing UFFD page fault events. > > Processing write faults includes reading UFFD file descriptor, > finding respective RAM block and saving faulting page to > the migration stream. After page has been saved, write protection > can be removed. Since asynchronous version of qemu_put_buffer() > is expected to be used to save pages, we also have to flush > migraion stream prior to un-protecting saved memory range. > > Write protection is being removed for any previously protected > memory chunk that has hit the migration stream. That's valid > for pages from linear page scan along with write fault pages.
Thanks for working on this version, it looks much cleaner. > > Signed-off-by: Andrey Gruzdev <andrey.gruz...@virtuozzo.com> > --- > migration/ram.c | 155 +++++++++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 147 insertions(+), 8 deletions(-) > > diff --git a/migration/ram.c b/migration/ram.c > index 3adfd1948d..bcdccdaef7 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -1441,6 +1441,76 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t > *offset) > return block; > } > > +#ifdef CONFIG_LINUX > +/** > + * ram_find_block_by_host_address: find RAM block containing host page > + * > + * Returns pointer to RAMBlock if found, NULL otherwise > + * > + * @rs: current RAM state > + * @page_address: host page address > + */ > +static RAMBlock *ram_find_block_by_host_address(RAMState *rs, hwaddr > page_address) Reuse qemu_ram_block_from_host() somehow? > +{ > + RAMBlock *bs = rs->last_seen_block; > + > + do { > + if (page_address >= (hwaddr) bs->host && (page_address + > TARGET_PAGE_SIZE) <= > + ((hwaddr) bs->host + bs->max_length)) { > + return bs; > + } > + > + bs = QLIST_NEXT_RCU(bs, next); > + if (!bs) { > + /* Hit the end of the list */ > + bs = QLIST_FIRST_RCU(&ram_list.blocks); > + } > + } while (bs != rs->last_seen_block); > + > + return NULL; > +} > + > +/** > + * poll_fault_page: try to get next UFFD write fault page and, if pending > fault > + * is found, return RAM block pointer and page offset > + * > + * Returns pointer to the RAMBlock containing faulting page, > + * NULL if no write faults are pending > + * > + * @rs: current RAM state > + * @offset: page offset from the beginning of the block > + */ > +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) > +{ > + struct uffd_msg uffd_msg; > + hwaddr page_address; > + RAMBlock *bs; > + int res; > + > + if (!migrate_background_snapshot()) { > + return NULL; > + } > + > + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); > + if (res <= 0) { > + return NULL; > + } > + > + page_address = uffd_msg.arg.pagefault.address; > + bs = ram_find_block_by_host_address(rs, page_address); > + if (!bs) { > + /* In case we couldn't find respective block, just unprotect > faulting page. */ > + uffd_protect_memory(rs->uffdio_fd, page_address, TARGET_PAGE_SIZE, > false); > + error_report("ram_find_block_by_host_address() failed: > address=0x%0lx", > + page_address); Looks ok to error_report() instead of assert(), but I'll suggest drop the call to uffd_protect_memory() at least. The only reason to not use assert() is because we try our best to avoid crashing the vm, however I really doubt whether uffd_protect_memory() is the right thing to do even if it happens - we may at last try to unprotect some strange pages that we don't even know where it is... > + return NULL; > + } > + > + *offset = (ram_addr_t) (page_address - (hwaddr) bs->host); > + return bs; > +} > +#endif /* CONFIG_LINUX */ > + > /** > * get_queued_page: unqueue a page from the postcopy requests > * > @@ -1480,6 +1550,16 @@ static bool get_queued_page(RAMState *rs, > PageSearchStatus *pss) > > } while (block && !dirty); > > +#ifdef CONFIG_LINUX > + if (!block) { > + /* > + * Poll write faults too if background snapshot is enabled; that's > + * when we have vcpus got blocked by the write protected pages. > + */ > + block = poll_fault_page(rs, &offset); > + } > +#endif /* CONFIG_LINUX */ > + > if (block) { > /* > * As soon as we start servicing pages out of order, then we have > @@ -1753,6 +1833,55 @@ static int ram_save_host_page(RAMState *rs, > PageSearchStatus *pss, > return pages; > } > > +/** > + * ram_save_host_page_pre: ram_save_host_page() pre-notifier > + * > + * @rs: current RAM state > + * @pss: page-search-status structure > + * @opaque: pointer to receive opaque context value > + */ > +static inline > +void ram_save_host_page_pre(RAMState *rs, PageSearchStatus *pss, void > **opaque) > +{ > + *(ram_addr_t *) opaque = pss->page; > +} > + > +/** > + * ram_save_host_page_post: ram_save_host_page() post-notifier > + * > + * @rs: current RAM state > + * @pss: page-search-status structure > + * @opaque: opaque context value > + * @res_override: pointer to the return value of ram_save_host_page(), > + * overwritten in case of an error > + */ > +static void ram_save_host_page_post(RAMState *rs, PageSearchStatus *pss, > + void *opaque, int *res_override) > +{ > + /* Check if page is from UFFD-managed region. */ > + if (pss->block->flags & RAM_UF_WRITEPROTECT) { > +#ifdef CONFIG_LINUX > + ram_addr_t page_from = (ram_addr_t) opaque; > + hwaddr page_address = (hwaddr) pss->block->host + > + ((hwaddr) page_from << TARGET_PAGE_BITS); I feel like most new uses of hwaddr is not correct... As I also commented in the other patch. We should save a lot of castings if switched. > + hwaddr run_length = (hwaddr) (pss->page - page_from + 1) << > TARGET_PAGE_BITS; > + int res; > + > + /* Flush async buffers before un-protect. */ > + qemu_fflush(rs->f); > + /* Un-protect memory range. */ > + res = uffd_protect_memory(rs->uffdio_fd, page_address, run_length, > false); > + /* We don't want to override existing error from > ram_save_host_page(). */ > + if (res < 0 && *res_override >= 0) { > + *res_override = res; What is this used for? If res<0, I thought we should just fail the snapshot. Meanwhile, res_override points to "pages", and then it'll be rewrite to the errno returned by uffd_protect_memory(). Smells strange. Can this ever be triggered anyway? > + } > +#else > + /* Should never happen */ > + qemu_file_set_error(rs->f, -ENOSYS); > +#endif /* CONFIG_LINUX */ > + } > +} > + > /** > * ram_find_and_save_block: finds a dirty page and sends it to f > * > @@ -1779,14 +1908,14 @@ static int ram_find_and_save_block(RAMState *rs, bool > last_stage) > return pages; > } > > + if (!rs->last_seen_block) { > + rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); Why setup the last seen block to be the first if null? > + } > + > pss.block = rs->last_seen_block; > pss.page = rs->last_page; > pss.complete_round = false; > > - if (!pss.block) { > - pss.block = QLIST_FIRST_RCU(&ram_list.blocks); > - } > - > do { > again = true; > found = get_queued_page(rs, &pss); > @@ -1797,7 +1926,11 @@ static int ram_find_and_save_block(RAMState *rs, bool > last_stage) > } > > if (found) { > + void *opaque; > + > + ram_save_host_page_pre(rs, &pss, &opaque); > pages = ram_save_host_page(rs, &pss, last_stage); > + ram_save_host_page_post(rs, &pss, opaque, &pages); So the pre/post idea is kind of an overkill to me... How about we do the unprotect in ram_save_host_page() in the simple way, like: ram_save_host_page() start_addr = pss->page; do { ... (migrate pages) ... } while (...); if (background_snapshot_enabled()) { unprotect pages within start_addr to pss->page; } ... > } > } while (!pages && again); > > @@ -3864,9 +3997,12 @@ fail: > rs->uffdio_fd = -1; > return -1; > #else > + /* > + * Should never happen since we prohibit 'background-snapshot' > + * capability on non-Linux hosts. Yeah, yeah. So let's drop these irrelevant changes? :) > + */ > rs->uffdio_fd = -1; > - error_setg(&migrate_get_current()->error, > - "Background-snapshot not supported on non-Linux hosts"); > + error_setg(&migrate_get_current()->error, QERR_UNDEFINED_ERROR); > return -1; > #endif /* CONFIG_LINUX */ > } > @@ -3903,8 +4039,11 @@ void ram_write_tracking_stop(void) > uffd_close_fd(rs->uffdio_fd); > rs->uffdio_fd = -1; > #else > - error_setg(&migrate_get_current()->error, > - "Background-snapshot not supported on non-Linux hosts"); > + /* > + * Should never happen since we prohibit 'background-snapshot' > + * capability on non-Linux hosts. > + */ > + error_setg(&migrate_get_current()->error, QERR_UNDEFINED_ERROR); Same here. Thanks, > #endif /* CONFIG_LINUX */ > } > > -- > 2.25.1 > -- Peter Xu