* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote: > On 2015/4/15 1:04, Dr. David Alan Gilbert (git) wrote: > >From: "Dr. David Alan Gilbert" <dgilb...@redhat.com> > > > >userfaultfd is a Linux syscall that gives an fd that receives a stream > >of notifications of accesses to pages registered with it and allows > >the program to acknowledge those stalls and tell the accessing > >thread to carry on. > > > >Signed-off-by: Dr. David Alan Gilbert <dgilb...@redhat.com> > >--- > > include/migration/migration.h | 4 + > > migration/postcopy-ram.c | 165 > > +++++++++++++++++++++++++++++++++++++++--- > > trace-events | 9 +++ > > 3 files changed, 169 insertions(+), 9 deletions(-) > > > >diff --git a/include/migration/migration.h b/include/migration/migration.h > >index db06fd2..4d6f33a 100644 > >--- a/include/migration/migration.h > >+++ b/include/migration/migration.h > >@@ -75,11 +75,15 @@ struct MigrationIncomingState { > > */ > > QemuEvent main_thread_load_event; > > > >+ bool have_fault_thread; > > QemuThread fault_thread; > > QemuSemaphore fault_thread_sem; > > > > /* For the kernel to send us notifications */ > > int userfault_fd; > >+ /* To tell the fault_thread to quit */ > >+ int userfault_quit_fd; > >+ > > QEMUFile *return_path; > > QemuMutex rp_mutex; /* We send replies from multiple threads */ > > void *postcopy_tmp_page; > >diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > >index 33aadbc..b2dc3b7 100644 > >--- a/migration/postcopy-ram.c > >+++ b/migration/postcopy-ram.c > >@@ -49,6 +49,8 @@ struct PostcopyDiscardState { > > */ > > #if defined(__linux__) > > > >+#include <poll.h> > >+#include <sys/eventfd.h> > > #include <sys/mman.h> > > #include <sys/ioctl.h> > > #include <sys/syscall.h> > >@@ -273,15 +275,41 @@ int postcopy_ram_incoming_init(MigrationIncomingState > >*mis, size_t ram_pages) > > */ > > int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) > > { > >- /* TODO: Join the fault thread once we're sure it will exit */ > >- if (qemu_ram_foreach_block(cleanup_area, mis)) { > >- return -1; > >+ trace_postcopy_ram_incoming_cleanup_entry(); > >+ > >+ if (mis->have_fault_thread) { > >+ uint64_t tmp64; > >+ > >+ if (qemu_ram_foreach_block(cleanup_area, mis)) { > >+ return -1; > >+ } > >+ /* > >+ * Tell the fault_thread to exit, it's an eventfd that should > >+ * currently be at 0, we're going to inc it to 1 > >+ */ > >+ tmp64 = 1; > >+ if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) { > >+ trace_postcopy_ram_incoming_cleanup_join(); > >+ qemu_thread_join(&mis->fault_thread); > >+ } else { > >+ /* Not much we can do here, but may as well report it */ > >+ error_report("%s: incing userfault_quit_fd: %s", __func__, > >+ strerror(errno)); > >+ } > >+ trace_postcopy_ram_incoming_cleanup_closeuf(); > >+ close(mis->userfault_fd); > >+ close(mis->userfault_quit_fd); > >+ mis->have_fault_thread = false; > > } > > > >+ postcopy_state_set(mis, POSTCOPY_INCOMING_END); > >+ migrate_send_rp_shut(mis, qemu_file_get_error(mis->file) != 0); > >+ > > if (mis->postcopy_tmp_page) { > > munmap(mis->postcopy_tmp_page, getpagesize()); > > mis->postcopy_tmp_page = NULL; > > } > >+ trace_postcopy_ram_incoming_cleanup_exit(); > > return 0; > > } > > > >@@ -320,31 +348,150 @@ static int ram_block_enable_notify(const char > >*block_name, void *host_addr, > > static void *postcopy_ram_fault_thread(void *opaque) > > { > > MigrationIncomingState *mis = (MigrationIncomingState *)opaque; > >- > >- fprintf(stderr, "postcopy_ram_fault_thread\n"); > >- /* TODO: In later patch */ > >+ uint64_t hostaddr; /* The kernel always gives us 64 bit, not a pointer > >*/ > >+ int ret; > >+ size_t hostpagesize = getpagesize(); > >+ RAMBlock *rb = NULL; > >+ RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */ > >+ uint8_t *local_tmp_page; > >+ > >+ trace_postcopy_ram_fault_thread_entry(); > > qemu_sem_post(&mis->fault_thread_sem); > >- while (1) { > >- /* TODO: In later patch */ > >+ > >+ local_tmp_page = mmap(NULL, getpagesize(), > >+ PROT_READ | PROT_WRITE, MAP_PRIVATE | > >MAP_ANONYMOUS, > >+ -1, 0); > >+ if (!local_tmp_page) { > >+ error_report("%s mapping local tmp page: %s", __func__, > >+ strerror(errno)); > >+ return NULL; > > } > >+ if (madvise(local_tmp_page, getpagesize(), MADV_DONTFORK)) { > > What's this 'local_tmp_page' used for ? I don't find where it is used in this > function.
Yes, that was old code; it's already gone. > Besides, there is a helper function qemu_madvise() in qemu, maybe you should > use it :) Thanks; I could do for this one but I've got to be careful; qemu_madvise is mainly for portability, and this code is Linux specific anyway, and some of the QEMU_MADV_ flags aren't valid for all of the OSs anyway. Dave > > Thanks. > > >+ munmap(local_tmp_page, getpagesize()); > >+ error_report("%s postcopy local page DONTFORK: %s", __func__, > >+ strerror(errno)); > >+ return NULL; > >+ } > >+ > >+ while (true) { > >+ ram_addr_t rb_offset; > >+ ram_addr_t in_raspace; > >+ struct pollfd pfd[2]; > >+ > >+ /* > >+ * We're mainly waiting for the kernel to give us a faulting HVA, > >+ * however we can be told to quit via userfault_quit_fd which is > >+ * an eventfd > >+ */ > >+ pfd[0].fd = mis->userfault_fd; > >+ pfd[0].events = POLLIN; > >+ pfd[0].revents = 0; > >+ pfd[1].fd = mis->userfault_quit_fd; > >+ pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ > >+ pfd[1].revents = 0; > >+ > >+ if (poll(pfd, 2, -1 /* Wait forever */) == -1) { > >+ error_report("%s: userfault poll: %s", __func__, > >strerror(errno)); > >+ break; > >+ } > >+ > >+ if (pfd[1].revents) { > >+ trace_postcopy_ram_fault_thread_quit(); > >+ break; > >+ } > >+ > >+ ret = read(mis->userfault_fd, &hostaddr, sizeof(hostaddr)); > >+ if (ret != sizeof(hostaddr)) { > >+ if (errno == EAGAIN) { > >+ /* > >+ * if a wake up happens on the other thread just after > >+ * the poll, there is nothing to read. > >+ */ > >+ continue; > >+ } > >+ if (ret < 0) { > >+ error_report("%s: Failed to read full userfault hostaddr: > >%s", > >+ __func__, strerror(errno)); > >+ break; > >+ } else { > >+ error_report("%s: Read %d bytes from userfaultfd expected > >%zd", > >+ __func__, ret, sizeof(hostaddr)); > >+ break; /* Lost alignment, don't know what we'd read next */ > >+ } > >+ } > > > >+ rb = qemu_ram_block_from_host((void *)(uintptr_t)hostaddr, true, > >+ &in_raspace, &rb_offset); > >+ if (!rb) { > >+ error_report("postcopy_ram_fault_thread: Fault outside guest: %" > >+ PRIx64, hostaddr); > >+ break; > >+ } > >+ > >+ trace_postcopy_ram_fault_thread_request(hostaddr, > >+ qemu_ram_get_idstr(rb), > >+ rb_offset); > >+ > >+ /* > >+ * Send the request to the source - we want to request one > >+ * of our host page sizes (which is >= TPS) > >+ */ > >+ if (rb != last_rb) { > >+ last_rb = rb; > >+ migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > >+ rb_offset, hostpagesize); > >+ } else { > >+ /* Save some space */ > >+ migrate_send_rp_req_pages(mis, NULL, > >+ rb_offset, hostpagesize); > >+ } > >+ } > >+ munmap(local_tmp_page, getpagesize()); > >+ trace_postcopy_ram_fault_thread_exit(); > > return NULL; > > } > > > > int postcopy_ram_enable_notify(MigrationIncomingState *mis) > > { > >- /* Create the fault handler thread and wait for it to be ready */ > >+ /* Open the fd for the kernel to give us userfaults */ > >+ mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); > >+ if (mis->userfault_fd == -1) { > >+ error_report("%s: Failed to open userfault fd: %s", __func__, > >+ strerror(errno)); > >+ return -1; > >+ } > >+ > >+ /* > >+ * Although the host check already tested the API, we need to > >+ * do the check again as an ABI handshake on the new fd. > >+ */ > >+ if (!ufd_version_check(mis->userfault_fd)) { > >+ return -1; > >+ } > >+ > >+ /* Now an eventfd we use to tell the fault-thread to quit */ > >+ mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC); > >+ if (mis->userfault_quit_fd == -1) { > >+ error_report("%s: Opening userfault_quit_fd: %s", __func__, > >+ strerror(errno)); > >+ close(mis->userfault_fd); > >+ return -1; > >+ } > >+ > > qemu_sem_init(&mis->fault_thread_sem, 0); > > qemu_thread_create(&mis->fault_thread, "postcopy/fault", > > postcopy_ram_fault_thread, mis, > > QEMU_THREAD_JOINABLE); > > qemu_sem_wait(&mis->fault_thread_sem); > > qemu_sem_destroy(&mis->fault_thread_sem); > >+ mis->have_fault_thread = true; > > > > /* Mark so that we get notified of accesses to unwritten areas */ > > if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) { > > return -1; > > } > > > >+ trace_postcopy_ram_enable_notify(); > >+ > > return 0; > > } > > > >diff --git a/trace-events b/trace-events > >index b65740c..72a65fa 100644 > >--- a/trace-events > >+++ b/trace-events > >@@ -1500,6 +1500,15 @@ postcopy_cleanup_area(const char *ramblock, void > >*host_addr, size_t offset, size > > postcopy_ram_discard_range(void *start, void *end) "%p,%p" > > postcopy_init_area(const char *ramblock, void *host_addr, size_t offset, > > size_t length) "%s: %p offset=%zx length=%zx" > > postcopy_place_page(void *host_addr, bool all_zero) "host=%p all_zero=%d" > >+postcopy_ram_enable_notify(void) "" > >+postcopy_ram_fault_thread_entry(void) "" > >+postcopy_ram_fault_thread_exit(void) "" > >+postcopy_ram_fault_thread_quit(void) "" > >+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, > >size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx" > >+postcopy_ram_incoming_cleanup_closeuf(void) "" > >+postcopy_ram_incoming_cleanup_entry(void) "" > >+postcopy_ram_incoming_cleanup_exit(void) "" > >+postcopy_ram_incoming_cleanup_join(void) "" > > > > # kvm-all.c > > kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" > > > > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK