On Tue, May 09, 2023 at 06:12:55AM -0400, Dave Voutila wrote:
> tech@,
>
> The diff below adds a new ioctl for vmm(4) that allows an emulated
> device process to request that vmm(4) enter a shared mapping into its
> vmspace so it can access guest memory without relying on a shared
> mapping backed by a named file.
>
> As with the vm creation ioctl (VMM_IOC_CREATE), the caller must hold
> the "vmm" and "proc" pledge(2) promises. This allows the emulated
> devices to do this setup early and drop back down to just "stdio"
> before any device emulation occurs.
>
> Feel free to skip to the diff (the regress change shows how it works in
> a simplified case) or continue reading for the reasoning behind this
> change. I'm sharing this primarily for testers and for feedback from
> other devs while mlarkin@ reviews.
>
> To test:
>
> 1. apply diff
> 2. build and install new kernel
> 3. copy or symlink new vmm.h into /usr/include/dev/vmm/
> 4. build and reinstall vmd (no changes for vmctl needed)
>
> You should see no change during vm usage; however, you should no longer
> see /tmp being consumed while unmounting things like NFS mounts or usb
> disks. Read on for details.
>
> ...
>
> vmd(8) began emulating virtio network and block devices in subprocesses
> with a commit I made at the recent hackathon in Morocco. It relies on
> creating shared memory mappings using shm_mkstemp(3) and passing file
> descriptors to the fork/exec'd child processes.
>
> I've since received reports that using named mappings for shared memory
> has two negative impacts:
>
> 1. increased overhead during vm teardown, often making systems
> unresponsive (this is my conclusion based on only minimal evidence)
>
> 2. unmounting any device on the host while a vm is running causes some
> guest memory to be flushed to disk (the backing file is already
> unlinked, so not visible to other processes).
>
> (2) can cause /tmp to fill up or introduce failure conditions I'm not
> sure we can recover from in vmd. It also has implications for other
> services on the host.
>
> I don't own a fireproof suit that fits...so I'm not about to wade into
> the VFS & UVM layers to figure out if (1) or (2) can be mitigated or
> fixed on their own.
>
> One idea was to implement what FreeBSD borrowed from Linux in their
> forever quest to become LinuxBSD: memfd_create(2) [1].
>
> I took one look and decided this was not the time for me to be trying to
> land a new syscall primarily for vmd (and some ports) and went another
> route.
>
> [1] https://man7.org/linux/man-pages/man2/memfd_create.2.html
>
> -dv
>
This does fix the unexpected shm issues. Thanks!
Diff reads ok to me, go for it when you are happy with the test results.
-ml
>
> diff refs/heads/master refs/heads/vmm-mapshare
> commit - cec1ace2d4d21c85f4c8bacc2dd971721bf6b694
> commit + 8f533c371094c044b0127d468be5feaaf775811b
> blob - f221b58f75c4eb01a3a04ae45c7cdb066b11361a
> blob + 0e6f5ff858c51bd9707c657b154b0df1f8944c3b
> --- regress/sys/arch/amd64/vmm/vcpu.c
> +++ regress/sys/arch/amd64/vmm/vcpu.c
> @@ -83,6 +83,7 @@ main(int argc, char **argv)
> struct vm_resetcpu_params vresetp;
> struct vm_run_params vrunp;
> struct vm_terminate_params vtp;
> + struct vm_sharemem_params vsp;
>
> struct vm_mem_range *vmr;
> int fd, ret = 1;
> @@ -127,8 +128,9 @@ main(int argc, char **argv)
> ((uint8_t*)p)[j + 1] = PCKBC_AUX;
> }
> vmr->vmr_va = (vaddr_t)p;
> - printf("mapped region %zu: { gpa: 0x%08lx, size: %lu }\n",
> - i, vmr->vmr_gpa, vmr->vmr_size);
> + printf("created mapped region %zu: { gpa: 0x%08lx, size: %lu,"
> + " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size,
> + vmr->vmr_va);
> }
>
> if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1)
> @@ -136,8 +138,55 @@ main(int argc, char **argv)
> printf("created vm %d named \"%s\"\n", vcp.vcp_id, vcp.vcp_name);
>
> /*
> - * 2. Check that our VM exists.
> + * 2. Check we can create shared memory mappings.
> */
> + memset(&vsp, 0, sizeof(vsp));
> + vsp.vsp_nmemranges = vcp.vcp_nmemranges;
> + memcpy(&vsp.vsp_memranges, &vcp.vcp_memranges,
> + sizeof(vsp.vsp_memranges));
> + vsp.vsp_vm_id = vcp.vcp_id;
> +
> + /* Find some new va ranges... */
> + for (i = 0; i < vsp.vsp_nmemranges; i++) {
> + vmr = &vsp.vsp_memranges[i];
> + p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
> + MAP_PRIVATE | MAP_ANON, -1, 0);
> + if (p == MAP_FAILED)
> + err(1, "mmap");
> + vmr->vmr_va = (vaddr_t)p;
> + }
> +
> + /* Release our mappings so vmm can replace them. */
> + for (i = 0; i < vsp.vsp_nmemranges; i++) {
> + vmr = &vsp.vsp_memranges[i];
> + munmap((void*)vmr->vmr_va, vmr->vmr_size);
> + }
> +
> + /* Perform the shared mapping. */
> + if (ioctl(fd, VMM_IOC_SHAREMEM, &vsp) == -1)
> + err(1, "VMM_IOC_SHAREMEM");
> + printf("created shared memory mappings\n");
> +
> + /* We should see our reset vector instructions in the new mappings. */
> + for (i = 0; i < vsp.vsp_nmemranges; i++) {
> + vmr = &vsp.vsp_memranges[i];
> + p = (void*)vmr->vmr_va;
> +
> + for (j = 0; j < vmr->vmr_size; j += 2) {
> + if (((uint8_t*)p)[j + 0] != 0xE4)
> + errx(1, "bad byte");
> + if (((uint8_t*)p)[j + 1] != PCKBC_AUX)
> + errx(1, "bad byte");
> + }
> + printf("checked shared region %zu: { gpa: 0x%08lx, size: %lu,"
> + " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size,
> + vmr->vmr_va);
> + }
> + printf("validated shared memory mappings\n");
> +
> + /*
> + * 3. Check that our VM exists.
> + */
> memset(&vip, 0, sizeof(vip));
> vip.vip_size = 0;
> info = NULL;
> @@ -189,7 +238,7 @@ main(int argc, char **argv)
> ours = NULL;
>
> /*
> - * 3. Reset our VCPU and initialize register state.
> + * 4. Reset our VCPU and initialize register state.
> */
> memset(&vresetp, 0, sizeof(vresetp));
> vresetp.vrp_vm_id = vcp.vcp_id;
> @@ -205,7 +254,7 @@ main(int argc, char **argv)
> vresetp.vrp_vm_id);
>
> /*
> - * 4. Run the vcpu, expecting an immediate exit for IO assist.
> + * 5. Run the vcpu, expecting an immediate exit for IO assist.
> */
> exit = malloc(sizeof(*exit));
> if (exit == NULL) {
> @@ -258,7 +307,7 @@ out:
>
> out:
> /*
> - * 5. Terminate our VM and clean up.
> + * 6. Terminate our VM and clean up.
> */
> memset(&vtp, 0, sizeof(vtp));
> vtp.vtp_vm_id = vcp.vcp_id;
> @@ -277,13 +326,23 @@ out:
> vmr = &vcp.vcp_memranges[i];
> if (vmr->vmr_va) {
> if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) {
> - warn("failed to unmap region %zu at 0x%08lx",
> - i, vmr->vmr_va);
> +				warn("failed to unmap original region %zu @ hva "
> + "0x%lx", i, vmr->vmr_va);
> ret = 1;
> } else
> - printf("unmapped region %zu @ gpa 0x%08lx\n",
> - i, vmr->vmr_gpa);
> +				printf("unmapped original region %zu @ hva "
> + "0x%lx\n", i, vmr->vmr_va);
> }
> + vmr = &vsp.vsp_memranges[i];
> + if (vmr->vmr_va) {
> + if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) {
> + warn("failed to unmap shared region %zu @ hva "
> + "0x%lx", i, vmr->vmr_va);
> + ret = 1;
> + } else
> + printf("unmapped shared region %zu @ hva "
> + "0x%lx\n", i, vmr->vmr_va);
> + }
> }
>
> return (ret);
> blob - d46b3431081b6d2e7e1adab884ec21b0aaa9761a
> blob + 3daee7dad431ed200cbd734bc0f8b35bebd54216
> --- sys/dev/vmm/vmm.c
> +++ sys/dev/vmm/vmm.c
> @@ -262,6 +262,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag
> case VMM_IOC_WRITEVMPARAMS:
> ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 1);
> break;
> + case VMM_IOC_SHAREMEM:
> + ret = vm_share_mem((struct vm_sharemem_params *)data, p);
> + break;
> default:
> ret = vmmioctl_machdep(dev, cmd, data, flag, p);
> break;
> @@ -286,6 +289,7 @@ pledge_ioctl_vmm(struct proc *p, long com)
> switch (com) {
> case VMM_IOC_CREATE:
> case VMM_IOC_INFO:
> + case VMM_IOC_SHAREMEM:
> /* The "parent" process in vmd forks and manages VMs */
> if (p->p_p->ps_pledge & PLEDGE_PROC)
> return (0);
> @@ -780,3 +784,82 @@ vcpu_must_stop(struct vcpu *vcpu)
> return (1);
> return (0);
> }
> +
> +/*
> + * vm_share_mem
> + *
> + * Share a uvm mapping for the vm guest memory ranges into the calling
> + * process.
> + *
> + * Return values:
> + * 0: if successful
> + * ENOENT: if the vm cannot be found by vm_find
> + * EPERM: if the vm cannot be accessed by the current process
> + * EINVAL: if the provided memory ranges fail checks
> + * ENOMEM: if uvm_share fails to find available memory in the
> + *    destination map
> + */
> +int
> +vm_share_mem(struct vm_sharemem_params *vsp, struct proc *p)
> +{
> + int ret = EINVAL;
> + size_t i, n;
> + struct vm *vm;
> + struct vm_mem_range *src, *dst;
> +
> + ret = vm_find(vsp->vsp_vm_id, &vm);
> + if (ret)
> +		return (ret);
> +	ret = EINVAL;
> +
> + /* Check we have the expected number of ranges. */
> + if (vm->vm_nmemranges != vsp->vsp_nmemranges)
> + goto out;
> + n = vm->vm_nmemranges;
> +
> + /* Check their types, sizes, and gpa's (implying page alignment). */
> + for (i = 0; i < n; i++) {
> + src = &vm->vm_memranges[i];
> + dst = &vsp->vsp_memranges[i];
> +
> + /*
> + * The vm memranges were already checked during creation, so
> + * compare to them to confirm validity of mapping request.
> + */
> + if (src->vmr_type != dst->vmr_type)
> + goto out;
> + if (src->vmr_gpa != dst->vmr_gpa)
> + goto out;
> + if (src->vmr_size != dst->vmr_size)
> + goto out;
> +
> + /* Check our intended destination is page-aligned. */
> + if (dst->vmr_va & PAGE_MASK)
> + goto out;
> + }
> +
> + /*
> + * Share each range individually with the calling process. We do
> + * not need PROC_EXEC as the emulated devices do not need to execute
> + * instructions from guest memory.
> + */
> + for (i = 0; i < n; i++) {
> + src = &vm->vm_memranges[i];
> + dst = &vsp->vsp_memranges[i];
> +
> + /* Skip MMIO range. */
> + if (src->vmr_type == VM_MEM_MMIO)
> + continue;
> +
> + DPRINTF("sharing gpa=0x%lx for pid %d @ va=0x%lx\n",
> + src->vmr_gpa, p->p_p->ps_pid, dst->vmr_va);
> + ret = uvm_share(&p->p_vmspace->vm_map, dst->vmr_va,
> + PROT_READ | PROT_WRITE, vm->vm_map, src->vmr_gpa,
> + src->vmr_size);
> + if (ret) {
> + printf("%s: uvm_share failed (%d)\n", __func__, ret);
> +			goto out;
> + }
> + }
> + ret = 0;
> +out:
> + refcnt_rele_wake(&vm->vm_refcnt);
> + return (ret);
> +}
> blob - d2355d42b44b51044901de9d0adc0239586f37b8
> blob + 7b3b0d77ad550165b6e53f4de66723366d689b23
> --- sys/dev/vmm/vmm.h
> +++ sys/dev/vmm/vmm.h
> @@ -76,6 +76,13 @@ struct vm_resetcpu_params {
> struct vcpu_reg_state vrp_init_state;
> };
>
> +struct vm_sharemem_params {
> + /* Input parameters to VMM_IOC_SHAREMEM */
> + uint32_t vsp_vm_id;
> + size_t vsp_nmemranges;
> + struct vm_mem_range vsp_memranges[VMM_MAX_MEM_RANGES];
> +};
> +
> /* IOCTL definitions */
> #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
> #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
> @@ -88,8 +95,8 @@ struct vm_resetcpu_params {
> #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params)
> /* Set VM params */
> #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params)
> +#define VMM_IOC_SHAREMEM _IOW('V', 11, struct vm_sharemem_params)
>
> -
> #ifdef _KERNEL
>
> /* #define VMM_DEBUG */
> @@ -194,6 +201,7 @@ int vcpu_must_stop(struct vcpu *);
> int vm_terminate(struct vm_terminate_params *);
> int vm_resetcpu(struct vm_resetcpu_params *);
> int vcpu_must_stop(struct vcpu *);
> +int vm_share_mem(struct vm_sharemem_params *, struct proc *);
>
> #endif /* _KERNEL */
> #endif /* DEV_VMM_H */
> blob - 9373a135aa87755f0e34b821af4ab8c0f4970421
> blob + dd0efc2fd71bb0ac91d2e389a30e779d8d6c6c0d
> --- usr.sbin/vmd/vioblk.c
> +++ usr.sbin/vmd/vioblk.c
> @@ -58,7 +58,7 @@ vioblk_main(int fd)
> }
>
> __dead void
> -vioblk_main(int fd)
> +vioblk_main(int fd, int fd_vmm)
> {
> struct virtio_dev dev;
> struct vioblk_dev *vioblk;
> @@ -71,8 +71,11 @@ vioblk_main(int fd)
>
> log_procinit("vioblk");
>
> - /* stdio - needed for read/write to disk fds and channels to the vm. */
> - if (pledge("stdio", NULL) == -1)
> + /*
> + * stdio - needed for read/write to disk fds and channels to the vm.
> + * vmm + proc - needed to create shared vm mappings.
> + */
> + if (pledge("stdio vmm proc", NULL) == -1)
> fatal("pledge");
>
> /* Receive our virtio_dev, mostly preconfigured. */
> @@ -92,8 +95,9 @@ vioblk_main(int fd)
> vioblk = &dev.vioblk;
>
> log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, "
> - "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd,
> - dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer);
> + "async fd = %d, sz = %lld maxfer = %d, vmm fd = %d", __func__,
> + vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, vioblk->sz,
> + vioblk->max_xfer, fd_vmm);
>
> /* Receive our vm information from the vm process. */
> memset(&vm, 0, sizeof(vm));
> @@ -108,12 +112,19 @@ vioblk_main(int fd)
> setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx);
>
> /* Now that we have our vm information, we can remap memory. */
> - ret = remap_guest_mem(&vm);
> + ret = remap_guest_mem(&vm, fd_vmm);
> if (ret) {
> log_warnx("failed to remap guest memory");
> goto fail;
> }
>
> + /*
> + * We no longer need /dev/vmm access.
> + */
> + close_fd(fd_vmm);
> + if (pledge("stdio", NULL) == -1)
> + fatal("pledge2");
> +
> /* Initialize the virtio block abstractions. */
> type = vm.vm_params.vmc_disktypes[vioblk->idx];
> switch (type) {
> blob - 6ce905fdccfa7befb49353c23628227fdc74c486
> blob + d75dc06b9bc0133ace3e74de85abba3b62f539dc
> --- usr.sbin/vmd/vionet.c
> +++ usr.sbin/vmd/vionet.c
> @@ -61,7 +61,7 @@ vionet_main(int fd)
> static void handle_sync_io(int, short, void *);
>
> __dead void
> -vionet_main(int fd)
> +vionet_main(int fd, int fd_vmm)
> {
> struct virtio_dev dev;
> struct vionet_dev *vionet = NULL;
> @@ -73,8 +73,11 @@ vionet_main(int fd)
>
> log_procinit("vionet");
>
> - /* stdio - needed for read/write to tap fd and channels to the vm. */
> - if (pledge("stdio", NULL) == -1)
> + /*
> +	 * stdio - needed for read/write to tap fd and channels to the vm.
> + * vmm + proc - needed to create shared vm mappings.
> + */
> + if (pledge("stdio vmm proc", NULL) == -1)
> fatal("pledge");
>
> /* Receive our vionet_dev, mostly preconfigured. */
> @@ -92,8 +95,9 @@ vionet_main(int fd)
> dev.sync_fd = fd;
> vionet = &dev.vionet;
>
> - log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d",
> - __func__, vionet->data_fd, dev.sync_fd, dev.async_fd);
> + log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d"
> + ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
> + dev.async_fd, fd_vmm);
>
> /* Receive our vm information from the vm process. */
> memset(&vm, 0, sizeof(vm));
> @@ -108,10 +112,19 @@ vionet_main(int fd)
> setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx);
>
> /* Now that we have our vm information, we can remap memory. */
> - ret = remap_guest_mem(&vm);
> - if (ret)
> + ret = remap_guest_mem(&vm, fd_vmm);
> + if (ret) {
> +		log_warnx("failed to remap guest memory");
> goto fail;
> + }
>
> + /*
> + * We no longer need /dev/vmm access.
> + */
> + close_fd(fd_vmm);
> + if (pledge("stdio", NULL) == -1)
> + fatal("pledge2");
> +
> /* If we're restoring hardware, re-initialize virtqueue hva's. */
> if (vm.vm_state & VM_STATE_RECEIVED) {
> struct virtio_vq_info *vq_info;
> blob - 92e77b8f83431c2ff52824e35149477909612653
> blob + e3f6d1371ab7795df6e9d4b370ffd1a7c5afbde6
> --- usr.sbin/vmd/virtio.c
> +++ usr.sbin/vmd/virtio.c
> @@ -1297,7 +1297,7 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
> static int
> virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
> {
> - char *nargv[8], num[32], t[2];
> + char *nargv[10], num[32], vmm_fd[32], t[2];
> pid_t dev_pid;
> int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0;
> size_t i, j, data_fds_sz, sz = 0;
> @@ -1483,6 +1483,8 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
> memset(&nargv, 0, sizeof(nargv));
> memset(num, 0, sizeof(num));
> snprintf(num, sizeof(num), "%d", sync_fds[1]);
> + memset(vmm_fd, 0, sizeof(vmm_fd));
> + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
>
> t[0] = dev->dev_type;
> t[1] = '\0';
> @@ -1492,13 +1494,15 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
> nargv[2] = num;
> nargv[3] = "-t";
> nargv[4] = t;
> - nargv[5] = "-n";
> + nargv[5] = "-i";
> + nargv[6] = vmm_fd;
> + nargv[7] = "-n";
>
> if (env->vmd_verbose) {
> - nargv[6] = "-v";
> - nargv[7] = NULL;
> + nargv[8] = "-v";
> + nargv[9] = NULL;
> } else
> - nargv[6] = NULL;
> + nargv[8] = NULL;
>
> /* Control resumes in vmd.c:main(). */
> execvp(nargv[0], nargv);
> blob - d42abb5a834cb5f6d9777d8b23f92a6a1c5930f2
> blob + 48ec88c37db24290e069bf0ab9249df764b2c424
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -218,9 +218,10 @@ static const struct vcpu_reg_state vcpu_init_flat16 =
> * Primary entrypoint for launching a vm. Does not return.
> *
> * fd: file descriptor for communicating with vmm process.
> + * vmm_fd: file descriptor for communicating with the vmm(4) device.
> */
> void
> -vm_main(int fd)
> +vm_main(int fd, int vmm_fd)
> {
> struct vm_create_params *vcp = NULL;
> struct vmd_vm vm;
> @@ -241,9 +242,8 @@ vm_main(int fd)
> * vmm - for the vmm ioctls and operations.
> * proc exec - fork/exec for launching devices.
> * recvfd - for vm send/recv and sending fd to devices.
> - * tmppath/rpath - for shm_mkstemp, ftruncate, unlink
> */
> - if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1)
> + if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
> fatal("pledge");
>
> /* Receive our vm configuration. */
> @@ -254,13 +254,6 @@ vm_main(int fd)
> _exit(EIO);
> }
>
> - /* Receive the /dev/vmm fd number. */
> - sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd));
> - if (sz != sizeof(env->vmd_fd)) {
> - log_warnx("failed to receive /dev/vmm fd");
> - _exit(EIO);
> - }
> -
> /* Update process with the vm name. */
> vcp = &vm.vm_params.vmc_params;
> setproctitle("%s", vcp->vcp_name);
> @@ -1099,63 +1092,34 @@ alloc_guest_mem(struct vmd_vm *vm)
> alloc_guest_mem(struct vmd_vm *vm)
> {
> void *p;
> - char *tmp;
> - int fd, ret = 0;
> + int ret = 0;
> size_t i, j;
> struct vm_create_params *vcp = &vm->vm_params.vmc_params;
> struct vm_mem_range *vmr;
>
> - tmp = calloc(32, sizeof(char));
> - if (tmp == NULL) {
> - ret = errno;
> - log_warn("%s: calloc", __func__);
> - return (ret);
> - }
> - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
> -
> - vm->vm_nmemfds = vcp->vcp_nmemranges;
> -
> for (i = 0; i < vcp->vcp_nmemranges; i++) {
> vmr = &vcp->vcp_memranges[i];
>
> - fd = shm_mkstemp(tmp);
> - if (fd < 0) {
> - ret = errno;
> - log_warn("%s: shm_mkstemp", __func__);
> - return (ret);
> - }
> - if (ftruncate(fd, vmr->vmr_size) == -1) {
> - ret = errno;
> - log_warn("%s: ftruncate", __func__);
> - goto out;
> - }
> - if (fcntl(fd, F_SETFD, 0) == -1) {
> - ret = errno;
> - log_warn("%s: fcntl", __func__);
> - goto out;
> - }
> - if (shm_unlink(tmp) == -1) {
> - ret = errno;
> - log_warn("%s: shm_unlink", __func__);
> - goto out;
> - }
> - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
> -
> + /*
> + * We only need R/W as userland. vmm(4) will use R/W/X in its
> + * mapping.
> + *
> + * We must use MAP_SHARED so emulated devices will be able
> + * to generate shared mappings.
> + */
> p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
> - MAP_SHARED | MAP_CONCEAL, fd, 0);
> + MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
> if (p == MAP_FAILED) {
> ret = errno;
> for (j = 0; j < i; j++) {
> vmr = &vcp->vcp_memranges[j];
> munmap((void *)vmr->vmr_va, vmr->vmr_size);
> }
> - goto out;
> + return (ret);
> }
> - vm->vm_memfds[i] = fd;
> vmr->vmr_va = (vaddr_t)p;
> }
> -out:
> - free(tmp);
> +
> return (ret);
> }
>
> @@ -2552,10 +2516,11 @@ remap_guest_mem(struct vmd_vm *vm)
> * Returns 0 on success, non-zero in event of failure.
> */
> int
> -remap_guest_mem(struct vmd_vm *vm)
> +remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
> {
> struct vm_create_params *vcp;
> struct vm_mem_range *vmr;
> + struct vm_sharemem_params vsp;
> size_t i, j;
> void *p = NULL;
> int ret;
> @@ -2566,23 +2531,32 @@ remap_guest_mem(struct vmd_vm *vm)
> vcp = &vm->vm_params.vmc_params;
>
> /*
> - * We've execve'd, so we need to re-map the guest VM memory. Iterate
> - * over all possible vm_mem_range entries so we can initialize all
> - * file descriptors to a value.
> + * Initialize our VM shared memory request using our original
> + * creation parameters. We'll overwrite the va's after mmap(2).
> */
> + memset(&vsp, 0, sizeof(vsp));
> + vsp.vsp_nmemranges = vcp->vcp_nmemranges;
> + vsp.vsp_vm_id = vcp->vcp_id;
> + memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
> + sizeof(vsp.vsp_memranges));
> +
> + /*
> + * Use mmap(2) to identify virtual address space for our mappings.
> + */
> for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
> - if (i < vcp->vcp_nmemranges) {
> - vmr = &vcp->vcp_memranges[i];
> - /* Skip ranges we know we don't need right now. */
> + if (i < vsp.vsp_nmemranges) {
> + vmr = &vsp.vsp_memranges[i];
> +
> + /* Ignore any MMIO ranges. */
> if (vmr->vmr_type == VM_MEM_MMIO) {
> - log_debug("%s: skipping range i=%ld, type=%d",
> - __func__, i, vmr->vmr_type);
> - vm->vm_memfds[i] = -1;
> + vmr->vmr_va = 0;
> + vcp->vcp_memranges[i].vmr_va = 0;
> continue;
> }
> - /* Re-mmap the memrange. */
> - p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
> - MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0);
> +
> + /* Make initial mappings for the memrange. */
> + p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
> + 0);
> if (p == MAP_FAILED) {
> ret = errno;
> log_warn("%s: mmap", __func__);
> @@ -2594,11 +2568,29 @@ remap_guest_mem(struct vmd_vm *vm)
> return (ret);
> }
> vmr->vmr_va = (vaddr_t)p;
> - } else {
> - /* Initialize with an invalid fd. */
> - vm->vm_memfds[i] = -1;
> + vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
> }
> }
>
> + /*
> + * munmap(2) now that we have va's and ranges that don't overlap. vmm
> + * will use the va's and sizes to recreate the mappings for us.
> + */
> + for (i = 0; i < vsp.vsp_nmemranges; i++) {
> + vmr = &vsp.vsp_memranges[i];
> + if (vmr->vmr_type == VM_MEM_MMIO)
> + continue;
> + if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1)
> + fatal("%s: munmap", __func__);
> + }
> +
> + /*
> + * Ask vmm to enter the shared mappings for us. They'll point
> + * to the same host physical memory, but will have a randomized
> + * virtual address for the calling process.
> + */
> + if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
> + return (errno);
> +
> return (0);
> }
> blob - c06fe974877c17710a81cd69bed541f908e76ef4
> blob + 0d0a66533b05a80aafba31d415e97c0fb3a6be88
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -774,7 +774,8 @@ main(int argc, char **argv)
> struct privsep *ps;
> int ch;
> enum privsep_procid proc_id = PROC_PARENT;
> - int proc_instance = 0, vm_launch = 0, vm_fd = -1;
> + int proc_instance = 0, vm_launch = 0;
> + int vmm_fd = -1, vm_fd = -1;
> const char *errp, *title = NULL;
> int argc0 = argc;
> char dev_type = '\0';
> @@ -784,7 +785,7 @@ main(int argc, char **argv)
> if ((env = calloc(1, sizeof(*env))) == NULL)
> fatal("calloc: env");
>
> - while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) {
> + while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) {
> switch (ch) {
> case 'D':
> if (cmdline_symset(optarg) < 0)
> @@ -838,6 +839,11 @@ main(int argc, char **argv)
> default: fatalx("invalid device type");
> }
> break;
> + case 'i':
> + vmm_fd = strtonum(optarg, 0, 128, &errp);
> + if (errp)
> + fatalx("invalid vmm fd");
> + break;
> default:
> usage();
> }
> @@ -866,7 +872,7 @@ main(int argc, char **argv)
>
> ps = &env->vmd_ps;
> ps->ps_env = env;
> - env->vmd_fd = -1;
> + env->vmd_fd = vmm_fd;
>
> if (config_init(env) == -1)
> fatal("failed to initialize configuration");
> @@ -882,14 +888,14 @@ main(int argc, char **argv)
> * If we're launching a new vm or its device, we short out here.
> */
> if (vm_launch == VMD_LAUNCH_VM) {
> - vm_main(vm_fd);
> + vm_main(vm_fd, vmm_fd);
> /* NOTREACHED */
> } else if (vm_launch == VMD_LAUNCH_DEV) {
> if (dev_type == VMD_DEVTYPE_NET) {
> - vionet_main(vm_fd);
> + vionet_main(vm_fd, vmm_fd);
> /* NOTREACHED */
> } else if (dev_type == VMD_DEVTYPE_DISK) {
> - vioblk_main(vm_fd);
> + vioblk_main(vm_fd, vmm_fd);
> /* NOTREACHED */
> }
> fatalx("unsupported device type '%c'", dev_type);
> blob - 68de0544706a5864aec480590191b33904864053
> blob + 61b0cff0c62c9ed752a2128bea8b12bc34f918d7
> --- usr.sbin/vmd/vmd.h
> +++ usr.sbin/vmd/vmd.h
> @@ -329,9 +329,6 @@ struct vmd_vm {
> struct timeval vm_start_tv;
> int vm_start_limit;
>
> - int vm_memfds[VMM_MAX_MEM_RANGES];
> - size_t vm_nmemfds;
> -
> TAILQ_ENTRY(vmd_vm) vm_entry;
> };
> TAILQ_HEAD(vmlist, vmd_vm);
> @@ -488,7 +485,7 @@ void vm_main(int);
> int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));
>
> /* vm.c */
> -void vm_main(int);
> +void vm_main(int, int);
> void mutex_lock(pthread_mutex_t *);
> void mutex_unlock(pthread_mutex_t *);
> int read_mem(paddr_t, void *buf, size_t);
> @@ -499,7 +496,7 @@ int remap_guest_mem(struct vmd_vm *);
> enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *);
> int write_mem(paddr_t, const void *buf, size_t);
> void* hvaddr_mem(paddr_t, size_t);
> -int remap_guest_mem(struct vmd_vm *);
> +int remap_guest_mem(struct vmd_vm *, int);
>
> /* config.c */
> int config_init(struct vmd *);
> @@ -527,9 +524,9 @@ __dead void vionet_main(int);
> int virtio_get_base(int, char *, size_t, int, const char *);
>
> /* vionet.c */
> -__dead void vionet_main(int);
> +__dead void vionet_main(int, int);
>
> /* vioblk.c */
> -__dead void vioblk_main(int);
> +__dead void vioblk_main(int, int);
>
> #endif /* VMD_H */
> blob - 35119673dc31b82aec55e6dd8ef12eff3ef2beef
> blob + 7f6fe9040ad8b0c6774255a7d02f96322ea5e421
> --- usr.sbin/vmd/vmm.c
> +++ usr.sbin/vmd/vmm.c
> @@ -627,7 +627,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
> {
> struct vm_create_params *vcp;
> struct vmd_vm *vm;
> - char *nargv[6], num[32];
> + char *nargv[8], num[32], vmm_fd[32];
> int fd, ret = EINVAL;
> int fds[2];
> pid_t vm_pid;
> @@ -701,16 +701,6 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
> if (ret == EIO)
> goto err;
>
> - /* Send the fd number for /dev/vmm. */
> - sz = atomicio(vwrite, fds[0], &env->vmd_fd,
> - sizeof(env->vmd_fd));
> - if (sz != sizeof(env->vmd_fd)) {
> - log_warnx("%s: failed to send /dev/vmm fd for vm '%s'",
> - __func__, vcp->vcp_name);
> - ret = EIO;
> - goto err;
> - }
> -
> /* Read back the kernel-generated vm id from the child */
> sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id));
> if (sz != sizeof(vcp->vcp_id)) {
> @@ -773,17 +763,21 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
> memset(&nargv, 0, sizeof(nargv));
> memset(num, 0, sizeof(num));
> snprintf(num, sizeof(num), "%d", fds[1]);
> + memset(vmm_fd, 0, sizeof(vmm_fd));
> + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
>
> nargv[0] = env->argv0;
> nargv[1] = "-V";
> nargv[2] = num;
> nargv[3] = "-n";
> + nargv[4] = "-i";
> + nargv[5] = vmm_fd;
>
> if (env->vmd_verbose) {
> - nargv[4] = "-v";
> - nargv[5] = NULL;
> + nargv[6] = "-v";
> + nargv[7] = NULL;
> } else
> - nargv[4] = NULL;
> + nargv[6] = NULL;
>
> /* Control resumes in vmd main(). */
> execvp(nargv[0], nargv);