Hi,

here is a diff for vmm, and I'd like to ask people who are interested
in our hypervisor to test it. If you are already experimenting with
vmm, just do what you always do with vmm when running with this
diff :-)

How to apply the diff:
$ cd /usr/src
$ patch < vmm.diff
$ doas make includes

Then rebuild and install the kernel, vmd(8), and vmctl(8).
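
In case you haven't done this before, it goes roughly like this
(assuming a GENERIC.MP kernel; adjust the config name to your setup):

$ cd /usr/src/sys/arch/amd64/conf
$ config GENERIC.MP
$ cd ../compile/GENERIC.MP
$ make clean && make depend && make
$ doas make install

$ cd /usr/src/usr.sbin/vmd
$ make obj && make depend && make && doas make install

$ cd /usr/src/usr.sbin/vmctl
$ make obj && make depend && make && doas make install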

This diff will not go in all at once. The first piece that should be
committed is an addition to uvm. I'll post that one separately in this
thread and ask for reviews.

Some details about the diff:

Currently, when vmd(8) needs to access the memory of the guest VM
(for example to copy the kernel image, or to read and write network
packets), it has to issue readpage and writepage ioctls. Memory
accesses larger than one page require several such ioctls, one per
page.
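
For illustration, the current write path looks roughly like this,
condensed from the write_mem() code that this diff removes (MIN()
used for brevity); every page touched costs an ioctl round trip
into the kernel:

	struct vm_writepage_params vwp;
	char *p = buf;
	size_t n, left = len;
	paddr_t gpa;

	/* one VMM_IOC_WRITEPAGE ioctl per page touched */
	for (gpa = dst; gpa < dst + len;
	    gpa = (gpa & ~PAGE_MASK) + PAGE_SIZE) {
		n = MIN(left, PAGE_SIZE - (gpa & PAGE_MASK));
		vwp.vwp_vm_id = current_vm->vm_params.vcp_id;
		vwp.vwp_paddr = gpa;
		vwp.vwp_data = p;
		vwp.vwp_len = n;
		if (ioctl(env->vmd_fd, VMM_IOC_WRITEPAGE, &vwp) < 0)
			return (errno);
		left -= n;
		p += n;
	}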

A nicer way is to make the guest memory accessible to the userland
vmd(8) via shared memory. That allows us to do memory accesses
through plain pointers, with ordinary loads and stores.

This works by having vmd(8) allocate the guest memory via mmap()
and pass the resulting userspace pointers to vmm(4). vmm(4) then
maps the guest physical addresses onto the mmap'd ranges.
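
Condensed, the two halves fit together like this (pieced together
from the diff below; error handling omitted, and 'p' in the vmm(4)
part is the struct proc of the vmd(8) process doing the create
ioctl):

	/* vmd(8): back each guest physical range with anonymous memory */
	void *addr = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	vmr->vmr_va = (vaddr_t)addr;

	/* vmm(4): alias the guest physical range onto that allocation */
	uvm_share(vm->vm_map, vmr->vmr_gpa,
	    PROT_READ | PROT_WRITE | PROT_EXEC,
	    &p->p_vmspace->vm_map, vmr->vmr_va, vmr->vmr_size);

	/* vmd(8), afterwards: guest memory access is a plain memcpy() */
	memcpy((char *)vmr->vmr_va + off, buf, n);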

It's similar to what KVM does, and has a few benefits:
- the RAM size of a VM is limited by the maximum data size of vmd(8),
  and the kernel enforces this limit. No need to invent a separate
  resource limitation mechanism for vmm(4).
- memory accesses between guest and host are faster, obviously.

This diff:
- adds a new uvm subroutine, uvm_share(), that establishes a
  shared mapping of a memory range. vmm(4) calls this to
  map guest physical ranges to the mmap'd regions allocated
  by userspace
- makes vmm(4) use uvm_share() instead of allocating virtual memory
  ranges for the guest itself
- makes vmd(8) allocate the memory for the guest VM
- makes vmd(8) use memcpy() instead of the read/writepage ioctls
- removes the vm_readpage and vm_writepage ioctls from vmm(4)

Again, the uvm parts should go in first, and the vmm bits will
be finished afterwards.

Index: sys/arch/amd64/amd64/vmm.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
retrieving revision 1.48
diff -u -p -r1.48 vmm.c
--- sys/arch/amd64/amd64/vmm.c  6 Apr 2016 06:15:06 -0000       1.48
+++ sys/arch/amd64/amd64/vmm.c  6 Apr 2016 17:46:19 -0000
@@ -110,8 +110,6 @@ int vm_create(struct vm_create_params *,
 int vm_run(struct vm_run_params *);
 int vm_terminate(struct vm_terminate_params *);
 int vm_get_info(struct vm_info_params *);
-int vm_writepage(struct vm_writepage_params *);
-int vm_readpage(struct vm_readpage_params *);
 int vm_resetcpu(struct vm_resetcpu_params *);
 int vm_intr_pending(struct vm_intr_params *);
 int vcpu_reset_regs(struct vcpu *, struct vcpu_init_state *);
@@ -126,9 +124,9 @@ int vcpu_run_svm(struct vcpu *, uint8_t)
 void vcpu_deinit(struct vcpu *);
 void vcpu_deinit_vmx(struct vcpu *);
 void vcpu_deinit_svm(struct vcpu *);
-int vm_impl_init(struct vm *);
-int vm_impl_init_vmx(struct vm *);
-int vm_impl_init_svm(struct vm *);
+int vm_impl_init(struct vm *, struct proc *);
+int vm_impl_init_vmx(struct vm *, struct proc *);
+int vm_impl_init_svm(struct vm *, struct proc *);
 void vm_impl_deinit(struct vm *);
 void vm_impl_deinit_vmx(struct vm *);
 void vm_impl_deinit_svm(struct vm *);
@@ -344,12 +342,6 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t 
        case VMM_IOC_TERM:
                ret = vm_terminate((struct vm_terminate_params *)data);
                break;
-       case VMM_IOC_WRITEPAGE:
-               ret = vm_writepage((struct vm_writepage_params *)data);
-               break;
-       case VMM_IOC_READPAGE:
-               ret = vm_readpage((struct vm_readpage_params *)data);
-               break;
        case VMM_IOC_RESETCPU:
                ret = vm_resetcpu((struct vm_resetcpu_params *)data);
                break;
@@ -383,8 +375,6 @@ pledge_ioctl_vmm(struct proc *p, long co
        case VMM_IOC_TERM:
                /* XXX VM processes should only terminate themselves */
        case VMM_IOC_RUN:
-       case VMM_IOC_WRITEPAGE:
-       case VMM_IOC_READPAGE:
        case VMM_IOC_RESETCPU:
                return (0);
        }
@@ -404,89 +394,6 @@ vmmclose(dev_t dev, int flag, int mode, 
 }
 
 /*
- * vm_readpage
- *
- * Reads a region (PAGE_SIZE max) of guest physical memory using the parameters
- * defined in 'vrp'.
- *
- * Returns 0 if successful, or various error codes on failure:
- *  ENOENT if the VM id contained in 'vrp' refers to an unknown VM
- *  EINVAL if the memory region described by vrp is not regular memory
- *  EFAULT if the memory region described by vrp has not yet been faulted in
- *      by the guest
- */
-int
-vm_readpage(struct vm_readpage_params *vrp)
-{
-       struct vm *vm;
-       paddr_t host_pa;
-       void *kva;
-       vaddr_t vr_page;
-
-       /* Find the desired VM */
-       rw_enter_read(&vmm_softc->vm_lock);
-       SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
-               if (vm->vm_id == vrp->vrp_vm_id)
-                       break;
-       }
-
-       /* Not found? exit. */
-       if (vm == NULL) {
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (ENOENT);
-       }
-
-       /* Check that the data to be read is within a page */
-       if (vrp->vrp_len > (PAGE_SIZE - (vrp->vrp_paddr & PAGE_MASK))) {
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (EINVAL);
-       }
-
-       /* Calculate page containing vrp->vrp_paddr */
-       vr_page = vrp->vrp_paddr & ~PAGE_MASK;
-
-       /* If not regular memory, exit. */
-       if (vmm_get_guest_memtype(vm, vr_page) != VMM_MEM_TYPE_REGULAR) {
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (EINVAL);
-       }
-
-       /* Find the phys page where this guest page exists in real memory */
-       if (!pmap_extract(vm->vm_map->pmap, vr_page, &host_pa)) {
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (EFAULT);
-       }
-
-       /* Allocate temporary KVA for the guest page */
-       kva = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
-       if (!kva) {
-               DPRINTF("vm_readpage: can't alloc kva\n");
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (EFAULT);
-       }
-
-       /* Enter the mapping in the kernel pmap and copyout */
-       pmap_kenter_pa((vaddr_t)kva, host_pa, PROT_READ);
-
-       if (copyout(kva + ((vaddr_t)vrp->vrp_paddr & PAGE_MASK),
-           vrp->vrp_data, vrp->vrp_len) == EFAULT) {
-               DPRINTF("vm_readpage: can't copyout\n");
-               pmap_kremove((vaddr_t)kva, PAGE_SIZE);
-               km_free(kva, PAGE_SIZE, &kv_any, &kp_none);
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (EFAULT);
-       }
-
-       /* Cleanup and exit */
-       pmap_kremove((vaddr_t)kva, PAGE_SIZE);
-       km_free(kva, PAGE_SIZE, &kv_any, &kp_none);
-
-       rw_exit_read(&vmm_softc->vm_lock);
-
-       return (0);
-}
-
-/*
  * vm_resetcpu
  *
  * Resets the vcpu defined in 'vrp' to power-on-init register state
@@ -620,112 +527,6 @@ vm_intr_pending(struct vm_intr_params *v
 }
 
 /*
- * vm_writepage
- *
- * Writes a region (PAGE_SIZE max) of guest physical memory using the parameters
- * defined in 'vrp'.
- *
- * Returns 0 if successful, or various error codes on failure:
- *  ENOENT if the VM id contained in 'vrp' refers to an unknown VM
- *  EINVAL if the memory region described by vrp is not regular memory
- *  EFAULT if the source data in vrp contains an invalid address
- *  ENOMEM if a memory allocation error occurs
- */
-int
-vm_writepage(struct vm_writepage_params *vwp)
-{
-       char *pagedata;
-       struct vm *vm;
-       paddr_t host_pa;
-       void *kva;
-       int ret;
-       vaddr_t vw_page, dst;
-
-       /* Find the desired VM */
-       rw_enter_read(&vmm_softc->vm_lock);
-       SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
-               if (vm->vm_id == vwp->vwp_vm_id)
-                       break;
-       }
-
-       /* Not found? exit. */
-       if (vm == NULL) {
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (ENOENT);
-       }
-
-       /* Check that the data to be written is within a page */
-       if (vwp->vwp_len > (PAGE_SIZE - (vwp->vwp_paddr & PAGE_MASK))) {
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (EINVAL);
-       }
-
-       /* Calculate page containing vwp->vwp_paddr */
-       vw_page = vwp->vwp_paddr & ~PAGE_MASK;
-
-       /* If not regular memory, exit. */
-       if (vmm_get_guest_memtype(vm, vw_page) != VMM_MEM_TYPE_REGULAR) {
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (EINVAL);
-       }
-
-       /* Allocate temporary region to copyin into */
-       pagedata = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
-       if (pagedata == NULL) {
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (ENOMEM);
-       }
-
-       /* Copy supplied data to kernel */
-       if (copyin(vwp->vwp_data, pagedata, vwp->vwp_len) == EFAULT) {
-               free(pagedata, M_DEVBUF, PAGE_SIZE);
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (EFAULT);
-       }
-
-       /* Find the phys page where this guest page exists in real memory */
-       if (!pmap_extract(vm->vm_map->pmap, vw_page, &host_pa)) {
-               /* page not present */
-               ret = uvm_fault(vm->vm_map, vw_page,
-                   VM_FAULT_INVALID, PROT_READ | PROT_WRITE | PROT_EXEC);
-               if (ret) {
-                       free(pagedata, M_DEVBUF, PAGE_SIZE);
-                       rw_exit_read(&vmm_softc->vm_lock);
-                       return (EFAULT);
-               }
-
-               if (!pmap_extract(vm->vm_map->pmap, vw_page, &host_pa)) {
-                       panic("vm_writepage: still not mapped GPA 0x%llx\n",
-                           (uint64_t)vwp->vwp_paddr);
-               }
-       }
-
-       /* Allocate kva for guest page */
-       kva = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
-       if (kva == NULL) {
-               DPRINTF("vm_writepage: can't alloc kva\n");
-               free(pagedata, M_DEVBUF, PAGE_SIZE);
-               rw_exit_read(&vmm_softc->vm_lock);
-               return (ENOMEM);
-       }
-
-       /* Enter mapping and copy data */
-       pmap_kenter_pa((vaddr_t)kva, host_pa, PROT_READ | PROT_WRITE);
-       dst = (vaddr_t)kva + ((vaddr_t)vwp->vwp_paddr & PAGE_MASK);
-       memcpy((void *)dst, pagedata, vwp->vwp_len);
-
-       /* Cleanup */
-       pmap_kremove((vaddr_t)kva, PAGE_SIZE);
-       km_free(kva, PAGE_SIZE, &kv_any, &kp_none);
-
-       free(pagedata, M_DEVBUF, PAGE_SIZE);
-
-       rw_exit_read(&vmm_softc->vm_lock);
-
-       return (0);
-}
-
-/*
  * vmm_start
  *
  * Starts VMM mode on the system
@@ -951,7 +752,7 @@ vm_create_check_mem_ranges(struct vm_cre
                vmr = &vcp->vcp_memranges[i];
 
                /* Only page-aligned addresses and sizes are permitted */
-               if ((vmr->vmr_gpa & PAGE_MASK) ||
+               if ((vmr->vmr_gpa & PAGE_MASK) || (vmr->vmr_va & PAGE_MASK) ||
                    (vmr->vmr_size & PAGE_MASK) || vmr->vmr_size == 0)
                        return (0);
 
@@ -960,6 +761,17 @@ vm_create_check_mem_ranges(struct vm_cre
                    vmr->vmr_size > maxgpa - vmr->vmr_gpa)
                        return (0);
 
+               /*
+                * Make sure that all virtual addresses are within the address
+                * space of the process and that they do not wrap around.
+                * Calling uvm_share() when creating the VM will take care of
+                * further checks.
+                */
+               if (vmr->vmr_va < VM_MIN_ADDRESS ||
+                   vmr->vmr_va >= VM_MAXUSER_ADDRESS ||
+                   vmr->vmr_size >= VM_MAXUSER_ADDRESS - vmr->vmr_va)
+                       return (0);
+
                /* Specifying ranges within the PCI MMIO space is forbidden */
                disjunct_range = (vmr->vmr_gpa > VMM_PCI_MMIO_BAR_END) ||
                    (vmr->vmr_gpa + vmr->vmr_size <= VMM_PCI_MMIO_BAR_BASE);
@@ -1025,7 +837,7 @@ vm_create(struct vm_create_params *vcp, 
        vm->vm_memory_size = memsize;
        strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN);
 
-       if (vm_impl_init(vm)) {
+       if (vm_impl_init(vm, p)) {
                printf("failed to init arch-specific features for vm 0x%p\n",
                    vm);
                vm_teardown(vm);
@@ -1079,11 +891,10 @@ vm_create(struct vm_create_params *vcp, 
  * Intel VMX specific VM initialization routine
  */
 int
-vm_impl_init_vmx(struct vm *vm)
+vm_impl_init_vmx(struct vm *vm, struct proc *p)
 {
        int i, ret;
        vaddr_t mingpa, maxgpa;
-       vaddr_t startp;
        struct pmap *pmap;
        struct vm_mem_range *vmr;
 
@@ -1119,15 +930,11 @@ vm_impl_init_vmx(struct vm *vm)
        DPRINTF("vm_impl_init_vmx: created vm_map @ %p\n", vm->vm_map);
        for (i = 0; i < vm->vm_nmemranges; i++) {
                vmr = &vm->vm_memranges[i];
-               startp = vmr->vmr_gpa;
-               ret = uvm_mapanon(vm->vm_map, &startp, vmr->vmr_size, 0,
-                   UVM_MAPFLAG(PROT_READ | PROT_WRITE | PROT_EXEC,
+               ret = uvm_share(vm->vm_map, vmr->vmr_gpa,
                    PROT_READ | PROT_WRITE | PROT_EXEC,
-                   MAP_INHERIT_NONE,
-                   MADV_NORMAL,
-                   UVM_FLAG_FIXED | UVM_FLAG_COPYONW));
+                   &p->p_vmspace->vm_map, vmr->vmr_va, vmr->vmr_size);
                if (ret) {
-                       printf("vm_impl_init_vmx: uvm_mapanon failed (%d)\n",
+                       printf("vm_impl_init_vmx: uvm_share failed (%d)\n",
                            ret);
                        /* uvm_map_deallocate calls pmap_destroy for us */
                        uvm_map_deallocate(vm->vm_map);
@@ -1155,7 +962,7 @@ vm_impl_init_vmx(struct vm *vm)
  * AMD SVM specific VM initialization routine
  */
 int
-vm_impl_init_svm(struct vm *vm)
+vm_impl_init_svm(struct vm *vm, struct proc *p)
 {
        /* XXX removed due to rot */
        return (-1);
@@ -1167,14 +974,14 @@ vm_impl_init_svm(struct vm *vm)
  * Calls the architecture-specific VM init routine
  */
 int
-vm_impl_init(struct vm *vm)
+vm_impl_init(struct vm *vm, struct proc *p)
 {
        if (vmm_softc->mode == VMM_MODE_VMX ||
            vmm_softc->mode == VMM_MODE_EPT)
-               return vm_impl_init_vmx(vm);
+               return vm_impl_init_vmx(vm, p);
        else if (vmm_softc->mode == VMM_MODE_SVM ||
                 vmm_softc->mode == VMM_MODE_RVI)
-               return vm_impl_init_svm(vm);
+               return vm_impl_init_svm(vm, p);
        else
                panic("unknown vmm mode\n");
 }
Index: sys/arch/amd64/include/vmmvar.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
retrieving revision 1.12
diff -u -p -r1.12 vmmvar.h
--- sys/arch/amd64/include/vmmvar.h     6 Apr 2016 06:15:06 -0000       1.12
+++ sys/arch/amd64/include/vmmvar.h     6 Apr 2016 17:46:19 -0000
@@ -176,6 +176,7 @@ struct vcpu_init_state {
 
 struct vm_mem_range {
        paddr_t vmr_gpa;
+       vaddr_t vmr_va;
        size_t  vmr_size;
 };
 
@@ -234,24 +235,6 @@ struct vm_terminate_params {
        uint32_t                vtp_vm_id;
 };
 
-struct vm_writepage_params {
-       /* Input parameters to VMM_IOC_WRITEPAGE */
-       uint32_t                vwp_vm_id; /* VM ID */
-       paddr_t                 vwp_paddr; /* Phys Addr */
-       char                    *vwp_data; /* Page Data */
-       uint32_t                vwp_len;   /* Length */
-};
-
-struct vm_readpage_params {
-       /* Input parameters to VMM_IOC_READPAGE */
-       uint32_t                vrp_vm_id; /* VM ID */
-       paddr_t                 vrp_paddr; /* Phys Addr */
-       uint32_t                vrp_len;   /* Length */
-
-       /* Output parameters from VMM_IOC_READPAGE */
-       char                    *vrp_data; /* Page Data */
-};
-
 struct vm_resetcpu_params {
        /* Input parameters to VMM_IOC_RESETCPU */
        uint32_t                vrp_vm_id;
@@ -271,10 +254,8 @@ struct vm_intr_params {
 #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
 #define VMM_IOC_INFO _IOWR('V', 3, struct vm_info_params) /* Get VM Info */
 #define VMM_IOC_TERM _IOW('V', 4, struct vm_terminate_params) /* Terminate VM */
-#define VMM_IOC_WRITEPAGE _IOW('V', 5, struct vm_writepage_params) /* Wr Pg */
-#define VMM_IOC_READPAGE _IOW('V', 6, struct vm_readpage_params) /* Rd Pg */
-#define VMM_IOC_RESETCPU _IOW('V', 7, struct vm_resetcpu_params) /* Reset */
-#define VMM_IOC_INTR _IOW('V', 8, struct vm_intr_params) /* Intr pending */
+#define VMM_IOC_RESETCPU _IOW('V', 5, struct vm_resetcpu_params) /* Reset */
+#define VMM_IOC_INTR _IOW('V', 6, struct vm_intr_params) /* Intr pending */
 
 #ifdef _KERNEL
 
Index: sys/uvm/uvm_extern.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.138
diff -u -p -r1.138 uvm_extern.h
--- sys/uvm/uvm_extern.h        4 Apr 2016 16:34:16 -0000       1.138
+++ sys/uvm/uvm_extern.h        6 Apr 2016 17:46:21 -0000
@@ -428,6 +428,8 @@ void                        uvmspace_exec(struct proc *, vadd
 struct vmspace         *uvmspace_fork(struct process *);
 void                   uvmspace_free(struct vmspace *);
 struct vmspace         *uvmspace_share(struct process *);
+int                    uvm_share(vm_map_t, vaddr_t, vm_prot_t,
+                           vm_map_t, vaddr_t, vsize_t);
 void                   uvm_meter(void);
 int                    uvm_sysctl(int *, u_int, void *, size_t *, 
                            void *, size_t, struct proc *);
Index: sys/uvm/uvm_map.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.211
diff -u -p -r1.211 uvm_map.c
--- sys/uvm/uvm_map.c   4 Apr 2016 16:34:16 -0000       1.211
+++ sys/uvm/uvm_map.c   6 Apr 2016 17:46:22 -0000
@@ -182,8 +182,12 @@ int                         uvm_mapent_bias(struct vm_map*, s
  * uvm_vmspace_fork helper functions.
  */
 struct vm_map_entry    *uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
-                           vsize_t, struct vm_map_entry*,
-                           struct uvm_map_deadq*, int, int);
+                           vsize_t, vm_prot_t, vm_prot_t,
+                           struct vm_map_entry*, struct uvm_map_deadq*, int,
+                           int);
+struct vm_map_entry    *uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t,
+                           vsize_t, vm_prot_t, vm_prot_t, struct vm_map*,
+                           struct vm_map_entry*, struct uvm_map_deadq*);
 struct vm_map_entry    *uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
                            struct vm_map*, struct vm_map_entry*,
                            struct uvm_map_deadq*);
@@ -3364,6 +3368,98 @@ uvmspace_free(struct vmspace *vm)
 }
 
 /*
+ * uvm_share: Map the address range [srcaddr, srcaddr + sz) in
+ * srcmap to the address range [dstaddr, dstaddr + sz) in
+ * dstmap.
+ *
+ * The whole address range in srcmap must be backed by an object
+ * (no holes).
+ *
+ * If successful, the address ranges share memory and the destination
+ * address range uses the protection flags in prot.
+ *
+ * This routine assumes that sz is a multiple of PAGE_SIZE and
+ * that dstaddr and srcaddr are page-aligned.
+ */
+int
+uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot,
+    struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz)
+{
+       int ret = 0;
+       vaddr_t unmap_end;
+       vaddr_t dstva;
+       vsize_t off, len, n = sz;
+       struct vm_map_entry *first = NULL, *last = NULL;
+       struct vm_map_entry *src_entry, *psrc_entry = NULL;
+       struct uvm_map_deadq dead;
+
+       if (srcaddr >= srcmap->max_offset || sz > srcmap->max_offset - srcaddr)
+               return EINVAL;
+
+       TAILQ_INIT(&dead);
+       vm_map_lock(dstmap);
+       vm_map_lock_read(srcmap);
+
+       if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, sz)) {
+               ret = ENOMEM;
+               goto exit_unlock;
+       }
+       if (!uvm_map_lookup_entry(srcmap, srcaddr, &src_entry)) {
+               ret = EINVAL;
+               goto exit_unlock;
+       }
+
+       unmap_end = dstaddr;
+       for (; src_entry != NULL;
+           psrc_entry = src_entry,
+           src_entry = RB_NEXT(uvm_map_addr, &srcmap->addr, src_entry)) {
+               /* hole in address space, bail out */
+               if (psrc_entry != NULL && psrc_entry->end != src_entry->start)
+                       break;
+               if (src_entry->start >= srcaddr + sz)
+                       break;
+
+               if (UVM_ET_ISSUBMAP(src_entry))
+                       panic("uvm_share: encountered a submap (illegal)");
+               if (!UVM_ET_ISCOPYONWRITE(src_entry) &&
+                   UVM_ET_ISNEEDSCOPY(src_entry))
+                       panic("uvm_share: non-copy_on_write map entries "
+                           "marked needs_copy (illegal)");
+
+               dstva = dstaddr;
+               if (src_entry->start > srcaddr) {
+                       dstva += src_entry->start - srcaddr;
+                       off = 0;
+               } else
+                       off = srcaddr - src_entry->start;
+
+               if (n < src_entry->end - src_entry->start)
+                       len = n;
+               else
+                       len = src_entry->end - src_entry->start;
+               n -= len;
+
+               if (uvm_mapent_share(dstmap, dstva, len, off, prot, prot,
+                   srcmap, src_entry, &dead) == NULL)
+                       break;
+
+               unmap_end = dstva + len;
+               if (n == 0)
+                       goto exit_unlock;
+       }
+
+       ret = EINVAL;
+       uvm_unmap_remove(dstmap, dstaddr, unmap_end, &dead, FALSE, TRUE);
+
+exit_unlock:
+       vm_map_unlock_read(srcmap);
+       vm_map_unlock(dstmap);
+       uvm_unmap_detach(&dead, 0);
+
+       return ret;
+}
+
+/*
  * Clone map entry into other map.
  *
  * Mapping will be placed at dstaddr, for the same length.
@@ -3372,7 +3468,8 @@ uvmspace_free(struct vmspace *vm)
  */
 struct vm_map_entry *
 uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
-    vsize_t off, struct vm_map_entry *old_entry, struct uvm_map_deadq *dead,
+    vsize_t off, vm_prot_t prot, vm_prot_t maxprot,
+    struct vm_map_entry *old_entry, struct uvm_map_deadq *dead,
     int mapent_flags, int amap_share_flags)
 {
        struct vm_map_entry *new_entry, *first, *last;
@@ -3394,8 +3491,8 @@ uvm_mapent_clone(struct vm_map *dstmap, 
        new_entry->offset = old_entry->offset;
        new_entry->aref = old_entry->aref;
        new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED;
-       new_entry->protection = old_entry->protection;
-       new_entry->max_protection = old_entry->max_protection;
+       new_entry->protection = prot;
+       new_entry->max_protection = maxprot;
        new_entry->inheritance = old_entry->inheritance;
        new_entry->advice = old_entry->advice;
 
@@ -3417,34 +3514,48 @@ uvm_mapent_clone(struct vm_map *dstmap, 
        return new_entry;
 }
 
-/*
- * share the mapping: this means we want the old and
- * new entries to share amaps and backing objects.
- */
 struct vm_map_entry *
-uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
-    struct vm_map *old_map,
+uvm_mapent_share(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
+    vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map *old_map,
     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
 {
-       struct vm_map_entry *new_entry;
-
        /*
-        * if the old_entry needs a new amap (due to prev fork)
-        * then we need to allocate it now so that we have
-        * something we own to share with the new_entry.   [in
-        * other words, we need to clear needs_copy]
+        * If old_entry refers to a copy-on-write region that has not yet been
+        * written to (needs_copy flag is set), then we need to allocate a new
+        * amap for old_entry.
+        *
+        * If we do not do this, and the process owning old_entry does a copy-on
+        * write later, old_entry and new_entry will refer to different memory
+        * regions, and the memory between the processes is no longer shared.
+        *
+        * [in other words, we need to clear needs_copy]
         */
 
        if (UVM_ET_ISNEEDSCOPY(old_entry)) {
                /* get our own amap, clears needs_copy */
                amap_copy(old_map, old_entry, M_WAITOK, FALSE,
-                   0, 0); 
+                   0, 0);
                /* XXXCDC: WAITOK??? */
        }
 
-       new_entry = uvm_mapent_clone(new_map, old_entry->start,
-           old_entry->end - old_entry->start, 0, old_entry,
-           dead, 0, AMAP_SHARED);
+       return uvm_mapent_clone(dstmap, dstaddr, dstlen, off,
+           prot, maxprot, old_entry, dead, 0, AMAP_SHARED);
+}
+
+/*
+ * share the mapping: this means we want the old and
+ * new entries to share amaps and backing objects.
+ */
+struct vm_map_entry *
+uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
+    struct vm_map *old_map,
+    struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
+{
+       struct vm_map_entry *new_entry;
+
+       new_entry = uvm_mapent_share(new_map, old_entry->start,
+           old_entry->end - old_entry->start, 0, old_entry->protection,
+           old_entry->max_protection, old_map, old_entry, dead);
 
        /* 
         * pmap_copy the mappings: this routine is optional
@@ -3474,8 +3585,8 @@ uvm_mapent_forkcopy(struct vmspace *new_
        boolean_t                protect_child;
 
        new_entry = uvm_mapent_clone(new_map, old_entry->start,
-           old_entry->end - old_entry->start, 0, old_entry,
-           dead, 0, 0);
+           old_entry->end - old_entry->start, 0, old_entry->protection,
+           old_entry->max_protection, old_entry, dead, 0, 0);
 
        new_entry->etype |=
            (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
@@ -3615,8 +3726,8 @@ uvm_mapent_forkzero(struct vmspace *new_
        struct vm_map_entry *new_entry;
 
        new_entry = uvm_mapent_clone(new_map, old_entry->start,
-           old_entry->end - old_entry->start, 0, old_entry,
-           dead, 0, 0);
+           old_entry->end - old_entry->start, 0, old_entry->protection,
+           old_entry->max_protection, old_entry, dead, 0, 0);
 
        new_entry->etype |=
            (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
@@ -4116,6 +4227,7 @@ uvm_map_extract(struct vm_map *srcmap, v
 
                newentry = uvm_mapent_clone(kernel_map,
                    cp_start - start + dstaddr, cp_len, cp_off,
+                   entry->protection, entry->max_protection,
                    entry, &dead, flags, AMAP_SHARED | AMAP_REFALL);
                if (newentry == NULL) {
                        error = ENOMEM;
Index: usr.sbin/vmd/vmm.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vmm.c,v
retrieving revision 1.25
diff -u -p -r1.25 vmm.c
--- usr.sbin/vmd/vmm.c  5 Apr 2016 09:33:05 -0000       1.25
+++ usr.sbin/vmd/vmm.c  6 Apr 2016 17:46:23 -0000
@@ -121,6 +121,7 @@ void *vcpu_run_loop(void *);
 int vcpu_exit(struct vm_run_params *);
 int vcpu_reset(uint32_t, uint32_t, struct vcpu_init_state *);
 void create_memory_map(struct vm_create_params *);
+int alloc_guest_mem(struct vm_create_params *);
 int vmm_create_vm(struct vm_create_params *);
 void init_emulated_hw(struct vm_create_params *, int *, int *);
 void vcpu_exit_inout(struct vm_run_params *);
@@ -139,6 +140,9 @@ void vcpu_process_com_scr(union vm_exit 
 int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
 void vmm_run(struct privsep *, struct privsep_proc *, void *);
 
+static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
+    size_t);
+
 int con_fd;
 struct vmd_vm *current_vm;
 
@@ -475,6 +479,12 @@ start_vm(struct imsg *imsg, uint32_t *id
                log_procinit(vcp->vcp_name);
 
                create_memory_map(vcp);
+               ret = alloc_guest_mem(vcp);
+               if (ret) {
+                       errno = ret;
+                       fatal("could not allocate guest memory - exiting");
+               }
+
                ret = vmm_create_vm(vcp);
                current_vm = vm;
 
@@ -657,20 +667,20 @@ start_client_vmd(void)
 void
 create_memory_map(struct vm_create_params *vcp)
 {
-       size_t mem_mb;
-       uint64_t mem_bytes, len;
+       size_t len, mem_bytes, mem_mb;
 
        mem_mb = vcp->vcp_memranges[0].vmr_size;
        vcp->vcp_nmemranges = 0;
        if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
                return;
 
-       mem_bytes = (uint64_t)mem_mb * 1024 * 1024;
+       mem_bytes = mem_mb * 1024 * 1024;
 
        /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
+       len = LOWMEM_KB * 1024;
        vcp->vcp_memranges[0].vmr_gpa = 0x0;
-       vcp->vcp_memranges[0].vmr_size = LOWMEM_KB * 1024;
-       mem_bytes -= LOWMEM_KB * 1024;
+       vcp->vcp_memranges[0].vmr_size = len;
+       mem_bytes -= len;
 
        /*
         * Second memory region: LOWMEM_KB - 1MB.
@@ -706,6 +716,55 @@ create_memory_map(struct vm_create_param
 }
 
 /*
+ * alloc_guest_mem
+ *
+ * Allocates memory for the guest.
+ * Instead of doing a single allocation with one mmap(), we allocate memory
+ * separately for every range for the following reasons:
+ * - ASLR for the individual ranges
+ * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
+ *   map the single mmap'd userspace memory to the individual guest physical
+ *   memory ranges, the underlying amap of the single mmap'd range would have
+ *   to allocate per-page reference counters. The reason is that the
+ *   individual guest physical ranges would reference the single mmap'd region
+ *   only partially. However, if every guest physical range has its own
+ *   corresponding mmap'd userspace allocation, there are no partial
+ *   references: every guest physical range fully references an mmap'd
+ *   range => no per-page reference counters have to be allocated.
+ *
+ * Return values:
+ *  0: success
+ *  !0: failure - errno indicating the source of the failure
+ */
+int
+alloc_guest_mem(struct vm_create_params *vcp)
+{
+       void *p;
+       int ret;
+       size_t i, j;
+       struct vm_mem_range *vmr;
+
+       for (i = 0; i < vcp->vcp_nmemranges; i++) {
+               vmr = &vcp->vcp_memranges[i];
+               p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANON, -1, 0);
+               if (p == MAP_FAILED) {
+                       ret = errno;
+                       for (j = 0; j < i; j++) {
+                               vmr = &vcp->vcp_memranges[j];
+                               munmap((void *)vmr->vmr_va, vmr->vmr_size);
+                       }
+
+                       return (ret);
+               }
+
+               vmr->vmr_va = (vaddr_t)p;
+       }
+
+       return (0);
+}
+
+/*
  * vmm_create_vm
  *
  * Requests vmm(4) to create a new VM using the supplied creation
@@ -1536,6 +1595,59 @@ vcpu_exit(struct vm_run_params *vrp)
 }
 
 /*
+ * find_gpa_range
+ *
+ * Search for a contiguous guest physical mem range.
+ *
+ * Parameters:
+ *  vcp: VM create parameters that contain the memory map to search in
+ *  gpa: the starting guest physical address
+ *  len: the length of the memory range
+ *
+ * Return values:
+ *  NULL: on failure if there is no memory range as described by the parameters
+ *  Pointer to vm_mem_range that contains the start of the range otherwise.
+ */
+static struct vm_mem_range *
+find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
+{
+       size_t i, n;
+       struct vm_mem_range *vmr;
+
+       /* Find the first vm_mem_range that contains gpa */
+       for (i = 0; i < vcp->vcp_nmemranges; i++) {
+               vmr = &vcp->vcp_memranges[i];
+               if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
+                       break;
+       }
+       if (i == vcp->vcp_nmemranges)
+               return (NULL);
+
+       n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
+       if (len < n)
+               len = 0;
+       else
+               len -= n;
+       gpa = vmr->vmr_gpa + vmr->vmr_size;
+       for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
+               vmr = &vcp->vcp_memranges[i];
+               if (gpa != vmr->vmr_gpa)
+                       return (NULL);
+               if (len <= vmr->vmr_size)
+                       len = 0;
+               else
+                       len -= vmr->vmr_size;
+
+               gpa = vmr->vmr_gpa + vmr->vmr_size;
+       }
+
+       if (len != 0)
+               return (NULL);
+
+       return (vmr);
+}
+
+/*
  * write_mem
  *
  * Pushes data from 'buf' into the guest VM's memory at paddr 'dst'.
@@ -1549,38 +1661,38 @@ vcpu_exit(struct vm_run_params *vrp)
  *  len: size of 'buf'
  *
  * Return values:
- *  various return values from ioctl(VMM_IOC_WRITEPAGE), or 0 if no error
- *      occurred.
+ *  0: success
+ *  EINVAL: if the guest physical memory range [dst, dst + len) does not
+ *      exist in the guest.
  */
 int
 write_mem(paddr_t dst, void *buf, size_t len)
 {
-       char *p = buf;
-       size_t n, left;
-       paddr_t gpa;
-       struct vm_writepage_params vwp;
-
-       left = len;
-       for (gpa = dst; gpa < dst + len;
-           gpa = (gpa & ~PAGE_MASK) + PAGE_SIZE) {
-               n = left;
-               if (n > PAGE_SIZE)
-                       n = PAGE_SIZE;
-               if (n > (PAGE_SIZE - (gpa & PAGE_MASK)))
-                       n = PAGE_SIZE - (gpa & PAGE_MASK);
-
-               vwp.vwp_paddr = (paddr_t)gpa;
-               vwp.vwp_data = p;
-               vwp.vwp_vm_id = current_vm->vm_params.vcp_id;
-               vwp.vwp_len = n;
-               if (ioctl(env->vmd_fd, VMM_IOC_WRITEPAGE, &vwp) < 0) {
-                       log_warn("writepage ioctl failed @ 0x%lx: "
-                           "dst = 0x%lx, len = 0x%zx", gpa, dst, len);
-                       return (errno);
-               }
+       char *from = buf, *to;
+       size_t n, off;
+       struct vm_mem_range *vmr;
+
+       vmr = find_gpa_range(&current_vm->vm_params, dst, len);
+       if (vmr == NULL) {
+               errno = EINVAL;
+               log_warn("write_mem: "
+                   "invalid memory range dst = 0x%lx, len = 0x%zx", dst, len);
+               return (EINVAL);
+       }
 
-               left -= n;
-               p += n;
+       off = dst - vmr->vmr_gpa;
+       while (len != 0) {
+               n = vmr->vmr_size - off;
+               if (len < n)
+                       n = len;
+
+               to = (char *)vmr->vmr_va + off;
+               memcpy(to, from, n);
+
+               from += n;
+               len -= n;
+               off = 0;
+               vmr++;
        }
 
        return (0);
@@ -1593,42 +1705,45 @@ write_mem(paddr_t dst, void *buf, size_t
  *
  * Parameters:
  *  src: the source paddr_t in the guest VM to read from.
+ *      If there is no guest paddr mapping at 'src', a new page will be
+ *      faulted in by the VMM and filled with zeroes (provided 'src'
+ *      represents a valid paddr in the guest's address space)
  *  buf: destination (local) buffer
  *  len: size of 'buf'
  *
  * Return values:
- *  various return values from ioctl(VMM_IOC_READPAGE), or 0 if no error
- *      occurred.
+ *  0: success
+ *  EINVAL: if the guest physical memory range [src, src + len) does not
+ *      exist in the guest.
  */
 int
 read_mem(paddr_t src, void *buf, size_t len)
 {
-       char *p = buf;
-       size_t n, left;
-       paddr_t gpa;
-       struct vm_readpage_params vrp;
-
-       left = len;
-       for (gpa = src; gpa < src + len;
-           gpa = (gpa & ~PAGE_MASK) + PAGE_SIZE) {
-               n = left;
-               if (n > PAGE_SIZE)
-                       n = PAGE_SIZE;
-               if (n > (PAGE_SIZE - (gpa & PAGE_MASK)))
-                       n = PAGE_SIZE - (gpa & PAGE_MASK);
-
-               vrp.vrp_paddr = (paddr_t)gpa;
-               vrp.vrp_data = p;
-               vrp.vrp_vm_id = current_vm->vm_params.vcp_id;
-               vrp.vrp_len = n;
-               if (ioctl(env->vmd_fd, VMM_IOC_READPAGE, &vrp) < 0) {
-                       log_warn("readpage ioctl failed @ 0x%lx: "
-                           "src = 0x%lx, len = 0x%zx", gpa, src, len);
-                       return (errno);
-               }
+       char *from, *to = buf;
+       size_t n, off;
+       struct vm_mem_range *vmr;
+
+       vmr = find_gpa_range(&current_vm->vm_params, src, len);
+       if (vmr == NULL) {
+               errno = EINVAL;
+               log_warn("read_mem: "
+                   "invalid memory range src = 0x%lx, len = 0x%zx", src, len);
+               return (EINVAL);
+       }
 
-               left -= n;
-               p += n;
+       off = src - vmr->vmr_gpa;
+       while (len != 0) {
+               n = vmr->vmr_size - off;
+               if (len < n)
+                       n = len;
+
+               from = (char *)vmr->vmr_va + off;
+               memcpy(to, from, n);
+
+               to += n;
+               len -= n;
+               off = 0;
+               vmr++;
        }
 
        return (0);
