Isaku Yamahata <yamah...@valinux.co.jp> wrote:
> This patch implements postcopy live migration for incoming part
>
>  vl.c                                               |    8 +-
>  13 files changed, 1409 insertions(+), 21 deletions(-)
>  copy linux-headers/linux/umem.h => migration-postcopy-stub.c (55%)

Ouch, git got really confused.


> +void postcopy_incoming_prepare(void)
> +{
> +    RAMBlock *block;
> +
> +    if (!incoming_postcopy) {
> +        return;
> +    }

We are already testing the negation of this condition before calling the
function, so isn't the check unnecessary in one of the two places?

> +
> +    state.state = 0;
> +    state.host_page_size = getpagesize();
> +    state.host_page_shift = ffs(state.host_page_size) - 1;
> +    state.version_id = RAM_SAVE_VERSION_ID; /* = save version of
> +                                               ram_save_live() */
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        block->umem = umem_new(block->host, block->length);
> +        block->flags |= RAM_POSTCOPY_UMEM_MASK;
> +    }
> +}
> +
> +static int postcopy_incoming_ram_load_get64(QEMUFile *f,
> +                                            ram_addr_t *addr, int *flags)
> +{
> +    *addr = qemu_get_be64(f);
> +    *flags = *addr & ~TARGET_PAGE_MASK;
> +    *addr &= TARGET_PAGE_MASK;
> +    return qemu_file_get_error(f);
> +}
> +
> +int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    ram_addr_t addr;
> +    int flags;
> +    int error;
> +
> +    DPRINTF("incoming ram load\n");
> +    /*
> +     * RAM_SAVE_FLAGS_EOS or
> +     * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS
> +     * see postcopy_outgoing_ram_save_live()
> +     */
> +
> +    if (version_id != RAM_SAVE_VERSION_ID) {
> +        DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n",
> +                version_id, RAM_SAVE_VERSION_ID);
> +        return -EINVAL;
> +    }
> +    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
> +    DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags);
> +    if (error) {
> +        DPRINTF("error %d\n", error);
> +        return error;
> +    }
> +    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
> +        DPRINTF("EOS\n");
> +        return 0;
> +    }
> +
> +    if (flags != RAM_SAVE_FLAG_MEM_SIZE) {
> +        DPRINTF("-EINVAL flags 0x%x\n", flags);
> +        return -EINVAL;
> +    }
> +    error = ram_load_mem_size(f, addr);
> +    if (error) {
> +        DPRINTF("addr 0x%lx error %d\n", addr, error);
> +        return error;
> +    }
> +
> +    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
> +    if (error) {
> +        DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error);
> +        return error;
> +    }
> +    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
> +        DPRINTF("done\n");
> +        return 0;
> +    }
> +    DPRINTF("-EINVAL\n");
> +    return -EINVAL;
> +}
> +
> +static void postcopy_incoming_pipe_and_fork_umemd(int mig_read_fd,
> +                                                  QEMUFile *mig_read)
> +{
> +    int fds[2];
> +    RAMBlock *block;
> +
> +    DPRINTF("fork\n");
> +
> +    /* socketpair(AF_UNIX)? */
> +
> +    if (qemu_pipe(fds) == -1) {
> +        perror("qemu_pipe");
> +        abort();
> +    }
> +    state.from_umemd_fd = fds[0];
> +    umemd.to_qemu_fd = fds[1];
> +
> +    if (qemu_pipe(fds) == -1) {
> +        perror("qemu_pipe");
> +        abort();
> +    }
> +    umemd.from_qemu_fd = fds[0];
> +    state.to_umemd_fd = fds[1];
> +
> +    pid_t child = fork();
> +    if (child < 0) {
> +        perror("fork");
> +        abort();
> +    }
> +
> +    if (child == 0) {
> +        int mig_write_fd;
> +
> +        fd_close(&state.to_umemd_fd);
> +        fd_close(&state.from_umemd_fd);
> +        umemd.host_page_size = state.host_page_size;
> +        umemd.host_page_shift = state.host_page_shift;
> +
> +        umemd.nr_host_pages_per_target_page =
> +            TARGET_PAGE_SIZE / umemd.host_page_size;
> +        umemd.nr_target_pages_per_host_page =
> +            umemd.host_page_size / TARGET_PAGE_SIZE;
> +
> +        umemd.target_to_host_page_shift =
> +            ffs(umemd.nr_host_pages_per_target_page) - 1;
> +        umemd.host_to_target_page_shift =
> +            ffs(umemd.nr_target_pages_per_host_page) - 1;
> +
> +        umemd.state = 0;
> +        umemd.version_id = state.version_id;
> +        umemd.mig_read_fd = mig_read_fd;
> +        umemd.mig_read = mig_read;
> +
> +        mig_write_fd = dup(mig_read_fd);
> +        if (mig_write_fd < 0) {
> +            perror("could not dup for writable socket \n");
> +            abort();
> +        }
> +        umemd.mig_write_fd = mig_write_fd;
> +        umemd.mig_write = qemu_fopen_nonblock(mig_write_fd);
> +
> +        postcopy_incoming_umemd(); /* noreturn */
> +    }
> +
> +    DPRINTF("qemu pid: %d daemon pid: %d\n", getpid(), child);
> +    fd_close(&umemd.to_qemu_fd);
> +    fd_close(&umemd.from_qemu_fd);
> +    state.faulted_pages = g_malloc(umem_pages_size(MAX_FAULTED_PAGES));
> +    state.faulted_pages->nr = 0;
> +
> +    /* close all UMem.shmem_fd */
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        umem_close_shmem(block->umem);
> +    }
> +    umem_qemu_wait_for_daemon(state.from_umemd_fd);
> +}
> +
> +void postcopy_incoming_fork_umemd(QEMUFile *mig_read)
> +{
> +    int fd = qemu_file_fd(mig_read);
> +    assert((fcntl(fd, F_GETFL) & O_ACCMODE) == O_RDWR);
> +
> +    socket_set_nonblock(fd);
> +    postcopy_incoming_pipe_and_fork_umemd(fd, mig_read);
> +    /* now socket is disowned. So tell umem server that it's safe to use it 
> */
> +    postcopy_incoming_qemu_ready();
> +}
> +
> +static void postcopy_incoming_qemu_recv_quit(void)
> +{
> +    RAMBlock *block;
> +    if (state.state & PIS_STATE_QUIT_RECEIVED) {
> +        return;
> +    }
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        if (block->umem != NULL) {
> +            umem_destroy(block->umem);
> +            block->umem = NULL;
> +            block->flags &= ~RAM_POSTCOPY_UMEM_MASK;
> +        }
> +    }
> +
> +    DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n");
> +    state.state |= PIS_STATE_QUIT_RECEIVED;
> +    qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL);
> +    qemu_fclose(state.from_umemd);
> +    state.from_umemd = NULL;
> +    fd_close(&state.from_umemd_fd);
> +}
> +
> +static void postcopy_incoming_qemu_fflush_to_umemd_handler(void *opaque)
> +{
> +    assert(state.to_umemd != NULL);
> +
> +    nonblock_fflush(state.to_umemd);
> +    if (nonblock_pending_size(state.to_umemd) > 0) {
> +        return;
> +    }
> +
> +    qemu_set_fd_handler(state.to_umemd->fd, NULL, NULL, NULL);
> +    if (state.state & PIS_STATE_QUIT_QUEUED) {
> +        DPRINTF("|= PIS_STATE_QUIT_SENT\n");
> +        state.state |= PIS_STATE_QUIT_SENT;
> +        qemu_fclose(state.to_umemd->file);
> +        state.to_umemd = NULL;
> +        fd_close(&state.to_umemd_fd);
> +        g_free(state.faulted_pages);
> +        state.faulted_pages = NULL;
> +    }
> +}
> +
> +static void postcopy_incoming_qemu_fflush_to_umemd(void)
> +{
> +    qemu_set_fd_handler(state.to_umemd->fd, NULL,
> +                        postcopy_incoming_qemu_fflush_to_umemd_handler, 
> NULL);
> +    postcopy_incoming_qemu_fflush_to_umemd_handler(NULL);
> +}
> +
> +static void postcopy_incoming_qemu_queue_quit(void)
> +{
> +    if (state.state & PIS_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +
> +    DPRINTF("|= PIS_STATE_QUIT_QUEUED\n");
> +    umem_qemu_quit(state.to_umemd->file);
> +    state.state |= PIS_STATE_QUIT_QUEUED;
> +}
> +
> +static void postcopy_incoming_qemu_send_pages_present(void)
> +{
> +    if (state.faulted_pages->nr > 0) {
> +        umem_qemu_send_pages_present(state.to_umemd->file,
> +                                     state.faulted_pages);
> +        state.faulted_pages->nr = 0;
> +    }
> +}
> +
> +static void postcopy_incoming_qemu_faulted_pages(
> +    const struct umem_pages *pages)
> +{
> +    assert(pages->nr <= MAX_FAULTED_PAGES);
> +    assert(state.faulted_pages != NULL);
> +
> +    if (state.faulted_pages->nr + pages->nr > MAX_FAULTED_PAGES) {
> +        postcopy_incoming_qemu_send_pages_present();
> +    }
> +    memcpy(&state.faulted_pages->pgoffs[state.faulted_pages->nr],
> +           &pages->pgoffs[0], sizeof(pages->pgoffs[0]) * pages->nr);
> +    state.faulted_pages->nr += pages->nr;
> +}
> +
> +static void postcopy_incoming_qemu_cleanup_umem(void);
> +
> +static int postcopy_incoming_qemu_handle_req_one(void)
> +{
> +    int offset = 0;
> +    int ret;
> +    uint8_t cmd;
> +
> +    ret = qemu_peek_buffer(state.from_umemd, &cmd, sizeof(cmd), offset);
> +    offset += sizeof(cmd);
> +    if (ret != sizeof(cmd)) {
> +        return -EAGAIN;
> +    }
> +    DPRINTF("cmd %c\n", cmd);
> +
> +    switch (cmd) {
> +    case UMEM_DAEMON_QUIT:
> +        postcopy_incoming_qemu_recv_quit();
> +        postcopy_incoming_qemu_queue_quit();
> +        postcopy_incoming_qemu_cleanup_umem();
> +        break;
> +    case UMEM_DAEMON_TRIGGER_PAGE_FAULT: {
> +        struct umem_pages *pages =
> +            umem_qemu_trigger_page_fault(state.from_umemd, &offset);
> +        if (pages == NULL) {
> +            return -EAGAIN;
> +        }
> +        if (state.to_umemd_fd >= 0 && !(state.state & 
> PIS_STATE_QUIT_QUEUED)) {
> +            postcopy_incoming_qemu_faulted_pages(pages);
> +            g_free(pages);
> +        }
> +        break;
> +    }
> +    case UMEM_DAEMON_ERROR:
> +        /* umem daemon hit troubles, so it warned us to stop vm execution */
> +        vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */
> +        break;
> +    default:
> +        abort();
> +        break;
> +    }
> +
> +    if (state.from_umemd != NULL) {
> +        qemu_file_skip(state.from_umemd, offset);
> +    }
> +    return 0;
> +}
> +
> +static void postcopy_incoming_qemu_handle_req(void *opaque)
> +{
> +    do {
> +        int ret = postcopy_incoming_qemu_handle_req_one();
> +        if (ret == -EAGAIN) {
> +            break;
> +        }
> +    } while (state.from_umemd != NULL &&
> +             qemu_pending_size(state.from_umemd) > 0);
> +
> +    if (state.to_umemd != NULL) {
> +        if (state.faulted_pages->nr > 0) {
> +            postcopy_incoming_qemu_send_pages_present();
> +        }
> +        postcopy_incoming_qemu_fflush_to_umemd();
> +    }
> +}
> +
> +void postcopy_incoming_qemu_ready(void)
> +{
> +    umem_qemu_ready(state.to_umemd_fd);
> +
> +    state.from_umemd = qemu_fopen_fd(state.from_umemd_fd);
> +    state.to_umemd = qemu_fopen_nonblock(state.to_umemd_fd);
> +    qemu_set_fd_handler(state.from_umemd_fd,
> +                        postcopy_incoming_qemu_handle_req, NULL, NULL);
> +}
> +
> +static void postcopy_incoming_qemu_cleanup_umem(void)
> +{
> +    /* when qemu will quit before completing postcopy, tell umem daemon
> +       to tear down umem device and exit. */
> +    if (state.to_umemd_fd >= 0) {
> +        postcopy_incoming_qemu_queue_quit();
> +        postcopy_incoming_qemu_fflush_to_umemd();
> +    }
> +}
> +
> +void postcopy_incoming_qemu_cleanup(void)
> +{
> +    postcopy_incoming_qemu_cleanup_umem();
> +    if (state.to_umemd != NULL) {
> +        nonblock_wait_for_flush(state.to_umemd);
> +    }
> +}
> +
> +void postcopy_incoming_qemu_pages_unmapped(ram_addr_t addr, ram_addr_t size)
> +{
> +    uint64_t nr = DIV_ROUND_UP(size, state.host_page_size);
> +    size_t len = umem_pages_size(nr);
> +    ram_addr_t end = addr + size;
> +    struct umem_pages *pages;
> +    int i;
> +
> +    if (state.to_umemd_fd < 0 || state.state & PIS_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +    pages = g_malloc(len);
> +    pages->nr = nr;
> +    for (i = 0; addr < end; addr += state.host_page_size, i++) {
> +        pages->pgoffs[i] = addr >> state.host_page_shift;
> +    }
> +    umem_qemu_send_pages_unmapped(state.to_umemd->file, pages);
> +    g_free(pages);
> +    assert(state.to_umemd != NULL);
> +    postcopy_incoming_qemu_fflush_to_umemd();
> +}
> +
> +/**************************************************************************
> + * incoming umem daemon
> + */
> +
> +static void postcopy_incoming_umem_recv_quit(void)
> +{
> +    if (umemd.state & UMEM_STATE_QUIT_RECEIVED) {
> +        return;
> +    }
> +    DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n");
> +    umemd.state |= UMEM_STATE_QUIT_RECEIVED;
> +    qemu_fclose(umemd.from_qemu);
> +    umemd.from_qemu = NULL;
> +    fd_close(&umemd.from_qemu_fd);
> +}
> +
> +static void postcopy_incoming_umem_queue_quit(void)
> +{
> +    if (umemd.state & UMEM_STATE_QUIT_QUEUED) {
> +        return;
> +    }
> +    DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n");
> +    umem_daemon_quit(umemd.to_qemu->file);
> +    umemd.state |= UMEM_STATE_QUIT_QUEUED;
> +}
> +
> +static void postcopy_incoming_umem_send_eoc_req(void)
> +{
> +    struct qemu_umem_req req;
> +
> +    if (umemd.state & UMEM_STATE_EOC_SENT) {
> +        return;
> +    }
> +
> +    DPRINTF("|= UMEM_STATE_EOC_SENT\n");
> +    req.cmd = QEMU_UMEM_REQ_EOC;
> +    postcopy_incoming_send_req(umemd.mig_write->file, &req);
> +    umemd.state |= UMEM_STATE_EOC_SENT;
> +    qemu_fclose(umemd.mig_write->file);
> +    umemd.mig_write = NULL;
> +    fd_close(&umemd.mig_write_fd);
> +}
> +
> +static void postcopy_incoming_umem_send_page_req(RAMBlock *block)
> +{
> +    struct qemu_umem_req req;
> +    int bit;
> +    uint64_t target_pgoff;
> +    int i;
> +
> +    umemd.page_request->nr = MAX_REQUESTS;
> +    umem_get_page_request(block->umem, umemd.page_request);
> +    DPRINTF("id %s nr %"PRId64" offs 0x%"PRIx64" 0x%"PRIx64"\n",
> +            block->idstr, (uint64_t)umemd.page_request->nr,
> +            (uint64_t)umemd.page_request->pgoffs[0],
> +            (uint64_t)umemd.page_request->pgoffs[1]);
> +
> +    if (umemd.last_block_write != block) {
> +        req.cmd = QEMU_UMEM_REQ_ON_DEMAND;
> +        req.idstr = block->idstr;
> +    } else {
> +        req.cmd = QEMU_UMEM_REQ_ON_DEMAND_CONT;
> +    }
> +
> +    req.nr = 0;
> +    req.pgoffs = umemd.target_pgoffs;
> +    if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
> +        for (i = 0; i < umemd.page_request->nr; i++) {
> +            target_pgoff = umemd.page_request->pgoffs[i] >>
> +                umemd.host_to_target_page_shift;
> +            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
> +
> +            if (!test_and_set_bit(bit, umemd.phys_requested)) {
> +                req.pgoffs[req.nr] = target_pgoff;
> +                req.nr++;
> +            }
> +        }
> +    } else {
> +        for (i = 0; i < umemd.page_request->nr; i++) {
> +            int j;
> +            target_pgoff = umemd.page_request->pgoffs[i] <<
> +                umemd.host_to_target_page_shift;
> +            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
> +
> +            for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) {
> +                if (!test_and_set_bit(bit + j, umemd.phys_requested)) {
> +                    req.pgoffs[req.nr] = target_pgoff + j;
> +                    req.nr++;
> +                }
> +            }
> +        }
> +    }
> +
> +    DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
> +            block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]);
> +    if (req.nr > 0 && umemd.mig_write != NULL) {
> +        postcopy_incoming_send_req(umemd.mig_write->file, &req);
> +        umemd.last_block_write = block;
> +    }
> +}
> +
> +static void postcopy_incoming_umem_send_pages_present(void)
> +{
> +    if (umemd.present_request->nr > 0) {
> +        umem_daemon_send_pages_present(umemd.to_qemu->file,
> +                                       umemd.present_request);
> +        umemd.present_request->nr = 0;
> +    }
> +}
> +
> +static void postcopy_incoming_umem_pages_present_one(
> +    uint32_t nr, const uint64_t *pgoffs, uint64_t ramblock_pgoffset)
> +{
> +    uint32_t i;
> +    assert(nr <= MAX_PRESENT_REQUESTS);
> +
> +    if (umemd.present_request->nr + nr > MAX_PRESENT_REQUESTS) {
> +        postcopy_incoming_umem_send_pages_present();
> +    }
> +
> +    for (i = 0; i < nr; i++) {
> +        umemd.present_request->pgoffs[umemd.present_request->nr + i] =
> +            pgoffs[i] + ramblock_pgoffset;
> +    }
> +    umemd.present_request->nr += nr;
> +}
> +
> +static void postcopy_incoming_umem_pages_present(
> +    const struct umem_pages *page_cached, uint64_t ramblock_pgoffset)
> +{
> +    uint32_t left = page_cached->nr;
> +    uint32_t offset = 0;
> +
> +    while (left > 0) {
> +        uint32_t nr = MIN(left, MAX_PRESENT_REQUESTS);
> +        postcopy_incoming_umem_pages_present_one(
> +            nr, &page_cached->pgoffs[offset], ramblock_pgoffset);
> +
> +        left -= nr;
> +        offset += nr;
> +    }
> +}
> +
> +static int postcopy_incoming_umem_ram_load(void)
> +{
> +    ram_addr_t offset;
> +    int flags;
> +
> +    int ret;
> +    size_t skip = 0;
> +    uint64_t be64;
> +    RAMBlock *block;
> +
> +    void *shmem;
> +    int error;
> +    int i;
> +    int bit;
> +
> +    if (umemd.version_id != RAM_SAVE_VERSION_ID) {
> +        return -EINVAL;
> +    }
> +
> +    ret = qemu_peek_buffer(umemd.mig_read, (uint8_t*)&be64, sizeof(be64),
> +                           skip);
> +    skip += ret;
> +    if (ret != sizeof(be64)) {
> +        return -EAGAIN;
> +    }
> +    offset = be64_to_cpu(be64);
> +
> +    flags = offset & ~TARGET_PAGE_MASK;
> +    offset &= TARGET_PAGE_MASK;
> +
> +    assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE));
> +
> +    if (flags & RAM_SAVE_FLAG_EOS) {
> +        DPRINTF("RAM_SAVE_FLAG_EOS\n");
> +        postcopy_incoming_umem_send_eoc_req();
> +
> +        qemu_fclose(umemd.mig_read);
> +        umemd.mig_read = NULL;
> +        fd_close(&umemd.mig_read_fd);
> +        umemd.state |= UMEM_STATE_EOS_RECEIVED;
> +
> +        postcopy_incoming_umem_queue_quit();
> +        DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n");
> +        return 0;
> +    }
> +
> +    block = NULL;
> +    if (flags & RAM_SAVE_FLAG_CONTINUE) {
> +        block = umemd.last_block_read;
> +    } else {
> +        uint8_t len;
> +        char id[256];
> +
> +        ret = qemu_peek_buffer(umemd.mig_read, &len, sizeof(len), skip);
> +        skip += ret;
> +        if (ret != sizeof(len)) {
> +            return -EAGAIN;
> +        }
> +        ret = qemu_peek_buffer(umemd.mig_read, (uint8_t*)id, len, skip);
> +        skip += ret;
> +        if (ret != len) {
> +            return -EAGAIN;
> +        }
> +        block = ram_find_block(id, len);
> +    }
> +
> +    if (block == NULL) {
> +        return -EINVAL;
> +    }
> +    umemd.last_block_read = block;
> +    shmem = block->host + offset;
> +
> +    if (flags & RAM_SAVE_FLAG_COMPRESS) {
> +        uint8_t ch;
> +        ret = qemu_peek_buffer(umemd.mig_read, &ch, sizeof(ch), skip);
> +        skip += ret;
> +        if (ret != sizeof(ch)) {
> +            return -EAGAIN;
> +        }
> +        memset(shmem, ch, TARGET_PAGE_SIZE);
> +    } else if (flags & RAM_SAVE_FLAG_PAGE) {
> +        ret = qemu_peek_buffer(umemd.mig_read, shmem, TARGET_PAGE_SIZE, 
> skip);
> +        skip += ret;
> +        if (ret != TARGET_PAGE_SIZE){
> +            return -EAGAIN;
> +        }
> +    }
> +    qemu_file_skip(umemd.mig_read, skip);
> +
> +    error = qemu_file_get_error(umemd.mig_read);
> +    if (error) {
> +        DPRINTF("error %d\n", error);
> +        return error;
> +    }
> +
> +    qemu_madvise(shmem, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
> +
> +    umemd.page_cached->nr = 0;
> +    bit = (umemd.last_block_read->offset + offset) >> TARGET_PAGE_BITS;
> +    if (!test_and_set_bit(bit, umemd.phys_received)) {
> +        if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
> +            uint64_t pgoff = offset >> umemd.host_page_shift;
> +            for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) {
> +                umemd.page_cached->pgoffs[umemd.page_cached->nr] = pgoff + i;
> +                umemd.page_cached->nr++;
> +            }
> +        } else {
> +            bool mark_cache = true;
> +            for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) {
> +                if (!test_bit(bit + i, umemd.phys_received)) {
> +                    mark_cache = false;
> +                    break;
> +                }
> +            }
> +            if (mark_cache) {
> +                umemd.page_cached->pgoffs[0] = offset >> 
> umemd.host_page_shift;
> +                umemd.page_cached->nr = 1;
> +            }
> +        }
> +    }
> +
> +    if (umemd.page_cached->nr > 0) {
> +        umem_mark_page_cached(umemd.last_block_read->umem, 
> umemd.page_cached);
> +
> +        if (!(umemd.state & UMEM_STATE_QUIT_QUEUED) && umemd.to_qemu_fd >=0 
> &&
> +            (incoming_postcopy_flags & INCOMING_FLAGS_FAULT_REQUEST)) {
> +            uint64_t ramblock_pgoffset;
> +
> +            ramblock_pgoffset =
> +                umemd.last_block_read->offset >> umemd.host_page_shift;
> +            postcopy_incoming_umem_pages_present(umemd.page_cached,
> +                                                 ramblock_pgoffset);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static bool postcopy_incoming_umem_check_umem_done(void)
> +{
> +    bool all_done = true;
> +    RAMBlock *block;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        UMem *umem = block->umem;
> +        if (umem != NULL && umem->nsets == umem->nbits) {
> +            umem_unmap_shmem(umem);
> +            umem_destroy(umem);
> +            block->umem = NULL;
> +        }
> +        if (block->umem != NULL) {
> +            all_done = false;
> +        }
> +    }
> +    return all_done;
> +}
> +
> +static bool postcopy_incoming_umem_page_faulted(const struct umem_pages 
> *pages)
> +{
> +    int i;
> +
> +    for (i = 0; i < pages->nr; i++) {
> +        ram_addr_t addr = pages->pgoffs[i] << umemd.host_page_shift;
> +        RAMBlock *block = qemu_get_ram_block(addr);
> +        addr -= block->offset;
> +        umem_remove_shmem(block->umem, addr, umemd.host_page_size);
> +    }
> +    return postcopy_incoming_umem_check_umem_done();
> +}
> +
> +static bool
> +postcopy_incoming_umem_page_unmapped(const struct umem_pages *pages)
> +{
> +    RAMBlock *block;
> +    ram_addr_t addr;
> +    int i;
> +
> +    struct qemu_umem_req req = {
> +        .cmd = QEMU_UMEM_REQ_REMOVE,
> +        .nr = 0,
> +        .pgoffs = (uint64_t*)pages->pgoffs,
> +    };
> +
> +    addr = pages->pgoffs[0] << umemd.host_page_shift;
> +    block = qemu_get_ram_block(addr);
> +
> +    for (i = 0; i < pages->nr; i++)  {
> +        int pgoff;
> +
> +        addr = pages->pgoffs[i] << umemd.host_page_shift;
> +        pgoff = addr >> TARGET_PAGE_BITS;
> +        if (!test_bit(pgoff, umemd.phys_received) &&
> +            !test_bit(pgoff, umemd.phys_requested)) {
> +            req.pgoffs[req.nr] = pgoff;
> +            req.nr++;
> +        }
> +        set_bit(pgoff, umemd.phys_received);
> +        set_bit(pgoff, umemd.phys_requested);
> +
> +        umem_remove_shmem(block->umem,
> +                          addr - block->offset, umemd.host_page_size);
> +    }
> +    if (req.nr > 0 && umemd.mig_write != NULL) {
> +        req.idstr = block->idstr;
> +        postcopy_incoming_send_req(umemd.mig_write->file, &req);
> +    }
> +
> +    return postcopy_incoming_umem_check_umem_done();
> +}
> +
> +static void postcopy_incoming_umem_done(void)
> +{
> +    postcopy_incoming_umem_send_eoc_req();
> +    postcopy_incoming_umem_queue_quit();
> +}
> +
> +static int postcopy_incoming_umem_handle_qemu(void)
> +{
> +    int ret;
> +    int offset = 0;
> +    uint8_t cmd;
> +
> +    ret = qemu_peek_buffer(umemd.from_qemu, &cmd, sizeof(cmd), offset);
> +    offset += sizeof(cmd);
> +    if (ret != sizeof(cmd)) {
> +        return -EAGAIN;
> +    }
> +    DPRINTF("cmd %c\n", cmd);
> +    switch (cmd) {
> +    case UMEM_QEMU_QUIT:
> +        postcopy_incoming_umem_recv_quit();
> +        postcopy_incoming_umem_done();
> +        break;
> +    case UMEM_QEMU_PAGE_FAULTED: {
> +        struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
> +                                                   &offset);
> +        if (pages == NULL) {
> +            return -EAGAIN;
> +        }
> +        if (postcopy_incoming_umem_page_faulted(pages)){
> +            postcopy_incoming_umem_done();
> +        }
> +        g_free(pages);
> +        break;
> +    }
> +    case UMEM_QEMU_PAGE_UNMAPPED: {
> +        struct umem_pages *pages = umem_recv_pages(umemd.from_qemu,
> +                                                   &offset);
> +        if (pages == NULL) {
> +            return -EAGAIN;
> +        }
> +        if (postcopy_incoming_umem_page_unmapped(pages)){
> +            postcopy_incoming_umem_done();
> +        }
> +        g_free(pages);
> +        break;
> +    }
> +    default:
> +        abort();
> +        break;
> +    }
> +    if (umemd.from_qemu != NULL) {
> +        qemu_file_skip(umemd.from_qemu, offset);
> +    }
> +    return 0;
> +}
> +
> +static void set_fd(int fd, fd_set *fds, int *nfds)
> +{
> +    FD_SET(fd, fds);
> +    if (fd > *nfds) {
> +        *nfds = fd;
> +    }
> +}
> +
> +static int postcopy_incoming_umemd_main_loop(void)
> +{
> +    fd_set writefds;
> +    fd_set readfds;
> +    int nfds;
> +    RAMBlock *block;
> +    int ret;
> +
> +    int pending_size;
> +    bool get_page_request;
> +
> +    nfds = -1;
> +    FD_ZERO(&writefds);
> +    FD_ZERO(&readfds);
> +
> +    if (umemd.mig_write != NULL) {
> +        pending_size = nonblock_pending_size(umemd.mig_write);
> +        if (pending_size > 0) {
> +            set_fd(umemd.mig_write_fd, &writefds, &nfds);
> +        }
> +    } else {
> +        pending_size = 0;
> +    }
> +
> +#define PENDING_SIZE_MAX (MAX_REQUESTS * sizeof(uint64_t) * 2)
> +    /* If page request to the migration source is accumulated,
> +       suspend getting page fault request. */
> +    get_page_request = (pending_size <= PENDING_SIZE_MAX);
> +
> +    if (get_page_request) {
> +        QLIST_FOREACH(block, &ram_list.blocks, next) {
> +            if (block->umem != NULL) {
> +                set_fd(block->umem->fd, &readfds, &nfds);
> +            }
> +        }
> +    }
> +
> +    if (umemd.mig_read_fd >= 0) {
> +        set_fd(umemd.mig_read_fd, &readfds, &nfds);
> +    }
> +
> +    if (umemd.to_qemu != NULL &&
> +        nonblock_pending_size(umemd.to_qemu) > 0) {
> +        set_fd(umemd.to_qemu_fd, &writefds, &nfds);
> +    }
> +    if (umemd.from_qemu_fd >= 0) {
> +        set_fd(umemd.from_qemu_fd, &readfds, &nfds);
> +    }
> +
> +    ret = select(nfds + 1, &readfds, &writefds, NULL, NULL);
> +    if (ret == -1) {
> +        if (errno == EINTR) {
> +            return 0;
> +        }
> +        return ret;
> +    }
> +
> +    if (umemd.mig_write_fd >= 0 && FD_ISSET(umemd.mig_write_fd, &writefds)) {
> +        nonblock_fflush(umemd.mig_write);
> +    }
> +    if (umemd.to_qemu_fd >= 0 && FD_ISSET(umemd.to_qemu_fd, &writefds)) {
> +        nonblock_fflush(umemd.to_qemu);
> +    }
> +    if (get_page_request) {
> +        QLIST_FOREACH(block, &ram_list.blocks, next) {
> +            if (block->umem != NULL && FD_ISSET(block->umem->fd, &readfds)) {
> +                postcopy_incoming_umem_send_page_req(block);
> +            }
> +        }
> +    }
> +    if (umemd.mig_read_fd >= 0 && FD_ISSET(umemd.mig_read_fd, &readfds)) {
> +        do {
> +            ret = postcopy_incoming_umem_ram_load();
> +            if (ret == -EAGAIN) {
> +                break;
> +            }
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        } while (umemd.mig_read != NULL &&
> +                 qemu_pending_size(umemd.mig_read) > 0);
> +    }
> +    if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds)) {
> +        do {
> +            ret = postcopy_incoming_umem_handle_qemu();
> +            if (ret == -EAGAIN) {
> +                break;
> +            }
> +        } while (umemd.from_qemu != NULL &&
> +                 qemu_pending_size(umemd.from_qemu) > 0);
> +    }
> +
> +    if (umemd.mig_write != NULL) {
> +        nonblock_fflush(umemd.mig_write);
> +    }
> +    if (umemd.to_qemu != NULL) {
> +        if (!(umemd.state & UMEM_STATE_QUIT_QUEUED)) {
> +            postcopy_incoming_umem_send_pages_present();
> +        }
> +        nonblock_fflush(umemd.to_qemu);
> +        if ((umemd.state & UMEM_STATE_QUIT_QUEUED) &&
> +            nonblock_pending_size(umemd.to_qemu) == 0) {
> +            DPRINTF("|= UMEM_STATE_QUIT_SENT\n");
> +            qemu_fclose(umemd.to_qemu->file);
> +            umemd.to_qemu = NULL;
> +            fd_close(&umemd.to_qemu_fd);
> +            umemd.state |= UMEM_STATE_QUIT_SENT;
> +        }
> +    }
> +
> +    return (umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK;
> +}
> +
> +static void postcopy_incoming_umemd(void)
> +{
> +    ram_addr_t last_ram_offset;
> +    int nbits;
> +    RAMBlock *block;
> +    int ret;
> +
> +    qemu_daemon(1, 1);
> +    signal(SIGPIPE, SIG_IGN);
> +    DPRINTF("daemon pid: %d\n", getpid());
> +
> +    umemd.page_request = g_malloc(umem_pages_size(MAX_REQUESTS));
> +
> +    umemd.page_cached = g_malloc(
> +        umem_pages_size(MAX_REQUESTS *
> +                        (TARGET_PAGE_SIZE >= umemd.host_page_size ?
> +                         1: umemd.nr_host_pages_per_target_page)));
> +
> +    umemd.target_pgoffs =
> +        g_new(uint64_t, MAX_REQUESTS *
> +              MAX(umemd.nr_host_pages_per_target_page,
> +                  umemd.nr_target_pages_per_host_page));
> +    umemd.present_request = g_malloc(umem_pages_size(MAX_PRESENT_REQUESTS));
> +    umemd.present_request->nr = 0;
> +
> +    last_ram_offset = qemu_last_ram_offset();
> +    nbits = last_ram_offset >> TARGET_PAGE_BITS;
> +    umemd.phys_requested = g_new0(unsigned long, BITS_TO_LONGS(nbits));
> +    umemd.phys_received = g_new0(unsigned long, BITS_TO_LONGS(nbits));
> +    umemd.last_block_read = NULL;
> +    umemd.last_block_write = NULL;
> +
> +    QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        UMem *umem = block->umem;
> +        umem->umem = NULL;      /* umem mapping area has VM_DONT_COPY flag,
> +                                   so we lost those mappings by fork */
> +        block->host = umem_map_shmem(umem);
> +        umem_close_shmem(umem);
> +    }
> +    umem_daemon_ready(umemd.to_qemu_fd);
> +    umemd.to_qemu = qemu_fopen_nonblock(umemd.to_qemu_fd);
> +
> +    /* wait for qemu to disown migration_fd */
> +    umem_daemon_wait_for_qemu(umemd.from_qemu_fd);
> +    umemd.from_qemu = qemu_fopen_fd(umemd.from_qemu_fd);
> +
> +    DPRINTF("entering umemd main loop\n");
> +    for (;;) {
> +        ret = postcopy_incoming_umemd_main_loop();
> +        if (ret != 0) {
> +            break;
> +        }
> +    }
> +    DPRINTF("exiting umemd main loop\n");
> +
> +    /* This daemon forked from qemu and the parent qemu is still running.
> +     * Cleanups of linked libraries like SDL should not be triggered,
> +     * otherwise the parent qemu may use resources which was already freed.
> +     */
> +    fflush(stdout);
> +    fflush(stderr);
> +    _exit(ret < 0? EXIT_FAILURE: 0);
> +}


> index 057c810..598ad4c 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -17,6 +17,7 @@ typedef struct DeviceState DeviceState;
>  
>  struct Monitor;
>  typedef struct Monitor Monitor;
> +typedef struct UMem UMem;

Does this belong in the umem patch?

> diff --git a/savevm.c b/savevm.c
> index bd4b5bf..74b15e7 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -1938,6 +1938,7 @@ int qemu_loadvm_state(QEMUFile *f)
>      uint8_t section_type;
>      unsigned int v;
>      int ret;
> +    QEMUFile *orig_f = NULL;

This is hacky, to say the least.  But I don't have a good suggestion right now :-(

> +        case QEMU_VM_POSTCOPY:
> +            if (incoming_postcopy) {
> +                /* VMStateDescription:pre/post_load and
> +                 * cpu_sychronize_all_post_init() may fault on guest RAM.
> +                 * (MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME)
> +                 * postcopy daemon needs to be forked before the fault.
> +                 */
> +                uint32_t size = qemu_get_be32(f);
> +                uint8_t *buf = g_malloc(size);

This comes from the network.  Checking that the value is "reasonable"
looks like a good idea (yes, I know that migration is especially bad
about testing values that come from the network).


> @@ -2032,11 +2069,17 @@ int qemu_loadvm_state(QEMUFile *f)
>          }
>      }
>  
> +    fprintf(stderr, "%s:%d QEMU_VM_EOF\n", __func__, __LINE__);

DPRINTF or removal?

Later, Juan.

Reply via email to