Hi, I have been working on this diff on and off for a while now; mlarkin was able to give me lots of tips, but now I am stuck, so I thought I would ask for nits, tips, or even a "you're doing it wrong."
The code below causes the vm to triple fault without the ((*pte & EPT_WB) == EPT_WB) check in vmx_mprotect_ept; some calls pass this check and I am able to modify the ept, others fail and are skipped. I am just not sure I am looking up the ept entry correctly — any help here would be greatly appreciated. The kernel code is called from solo5[1] as follows. if(mprotect(vaddr_start, size, prot) == -1) return -1; ret = 0; struct hvt_b *hvb = hvt->b; struct vm_mprotect_ept_params *vmep; vmep = calloc(1, sizeof (struct vm_mprotect_ept_params)); if (vmep == NULL) { warn("calloc"); return -1; } vmep->vmep_vm_id = hvb->vcp_id; vmep->vmep_vcpu_id = hvb->vcpu_id; // sgpa = vmr->vmr_gpa(0x0) + (addr - vmr->vmr_va) // vmep->vmep_sgpa = (vaddr_t)0x0 + (vaddr_t)(vaddr_start - hvt->mem); vmep->vmep_sgpa = addr_start; vmep->vmep_size = size; vmep->vmep_prot = prot; if (ioctl(hvb->vmd_fd, VMM_IOC_MPROTECT_EPT, vmep) < 0) { warn("mprotect ept vmm ioctl failed - exiting"); ret = -1; } Cheers Adam [1] https://github.com/adamsteen/solo5/blob/wnox/tenders/hvt/hvt_openbsd.c#L169-L225 ? 
div Index: sys/arch/amd64/amd64/vmm.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v retrieving revision 1.257 diff -u -p -u -p -r1.257 vmm.c --- sys/arch/amd64/amd64/vmm.c 13 Dec 2019 03:38:15 -0000 1.257 +++ sys/arch/amd64/amd64/vmm.c 6 Jan 2020 03:30:39 -0000 @@ -41,7 +41,7 @@ #include <dev/isa/isareg.h> #include <dev/pv/pvreg.h> -/* #define VMM_DEBUG */ +#define VMM_DEBUG void *l1tf_flush_region; @@ -124,6 +124,7 @@ int vm_get_info(struct vm_info_params *) int vm_resetcpu(struct vm_resetcpu_params *); int vm_intr_pending(struct vm_intr_params *); int vm_rwregs(struct vm_rwregs_params *, int); +int vm_mprotect_ept(struct vm_mprotect_ept_params *); int vm_rwvmparams(struct vm_rwvmparams_params *, int); int vm_find(uint32_t, struct vm **); int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *); @@ -201,6 +202,8 @@ void vmx_setmsrbw(struct vcpu *, uint32_ void vmx_setmsrbrw(struct vcpu *, uint32_t); void svm_set_clean(struct vcpu *, uint32_t); void svm_set_dirty(struct vcpu *, uint32_t); +int vmx_mprotect_ept(vm_map_t, paddr_t, paddr_t, int); +pt_entry_t * vmx_pmap_find_pte_ept(pmap_t, paddr_t); void vmm_init_pvclock(struct vcpu *, paddr_t); int vmm_update_pvclock(struct vcpu *); @@ -225,6 +228,9 @@ void vmm_decode_efer_value(uint64_t); void vmm_decode_rflags(uint64_t); void vmm_decode_misc_enable_value(uint64_t); const char *vmm_decode_cpu_mode(struct vcpu *); +void dump_requested_vmx_mprotect_ept(int prot); +void vmx_dump_pte_prot(pt_entry_t *pte); +void vmx_dump_pte_after(pt_entry_t *pte); extern int mtrr2mrt(int); @@ -494,6 +500,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t case VMM_IOC_WRITEREGS: ret = vm_rwregs((struct vm_rwregs_params *)data, 1); break; + case VMM_IOC_MPROTECT_EPT: + ret = vm_mprotect_ept((struct vm_mprotect_ept_params *)data); + break; case VMM_IOC_READVMPARAMS: ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 0); break; @@ -532,6 +541,7 @@ 
pledge_ioctl_vmm(struct proc *p, long co case VMM_IOC_INTR: case VMM_IOC_READREGS: case VMM_IOC_WRITEREGS: + case VMM_IOC_MPROTECT_EPT: case VMM_IOC_READVMPARAMS: case VMM_IOC_WRITEVMPARAMS: return (0); @@ -807,6 +817,261 @@ vm_rwregs(struct vm_rwregs_params *vrwp, } /* + * vm_mprotect_ept + * + * IOCTL handler to sets the access protections of the ept + * + * Parameters: + * vmep: decribes the memory for which the protect will be applied.. + * + * Return values: + * 0: if successful + * ENOENT: if the VM defined by 'vmep' cannot be found + * EINVAL: if the sgpa or size is not page aligned, the prot is WX or RWX, + * size is too large (512GB), there is wraparound + * (like start = 512GB-1 and end = 512GB-2), + * the address specified is not within the vm's mem range + * or the address lies inside reserved (MMIO) memory + */ +int +vm_mprotect_ept(struct vm_mprotect_ept_params *vmep) +{ + struct vm *vm; + struct vcpu *vcpu; + struct vm_mem_range *vmr; + vaddr_t sgpa; + size_t size; + vm_prot_t prot; + int i, ret, mem_type; + + /* Find the desired VM */ + rw_enter_read(&vmm_softc->vm_lock); + ret = vm_find(vmep->vmep_vm_id, &vm); + rw_exit_read(&vmm_softc->vm_lock); + + /* Not found? exit. 
*/ + if (ret != 0) { + DPRINTF("%s: vm id %u not found\n", __func__, + vmep->vmep_vm_id); + return (ret); + } + + rw_enter_read(&vm->vm_vcpu_lock); + SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) { + if (vcpu->vc_id == vmep->vmep_vcpu_id) + break; + } + rw_exit_read(&vm->vm_vcpu_lock); + + if (vcpu == NULL) { + DPRINTF("%s: vcpu id %u of vm %u not found\n", __func__, + vmep->vmep_vcpu_id, vmep->vmep_vm_id); + return (ENOENT); + } + + if (vcpu->vc_state != VCPU_STATE_STOPPED) { + DPRINTF("%s: mprotect_ept %u on vm %u attempted " + "while vcpu was in state %u (%s)\n", __func__, + vmep->vmep_vcpu_id, vmep->vmep_vm_id, vcpu->vc_state, + vcpu_state_decode(vcpu->vc_state)); + + return (EBUSY); + } + + sgpa = vmep->vmep_sgpa; + size = vmep->vmep_size; + prot = vmep->vmep_prot; + + /* disallow W^X */ + if ((prot & PROT_MASK) != prot && + (prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC)) + return (EINVAL); + + if ((sgpa & PAGE_MASK) || (size & PAGE_MASK) || size == 0) + return (EINVAL); /* Must be page aligned */ + + if(size >= NBPD_L4) + return (EINVAL); /* size must be less then 512GB */ + + if(sgpa + size < sgpa) + return (EINVAL); /* no wraparound */ + + /* XXX check if the address lies inside reserved (MMIO) memory */ + + mem_type = VMM_MEM_TYPE_UNKNOWN; + for (i = 0; i < vm->vm_nmemranges; i++) { + vmr = &vm->vm_memranges[i]; + + /* + * vm_memranges are ascending. 
sgpa can no longer be in one of + * the memranges + */ + if (sgpa < vmr->vmr_gpa) + continue; + + if (sgpa + size < vmr->vmr_gpa + vmr->vmr_size) { + mem_type = VMM_MEM_TYPE_REGULAR; + break; + } + } + + if (mem_type == VMM_MEM_TYPE_UNKNOWN) + return (EINVAL); + + if (vmm_softc->mode == VMM_MODE_EPT || + vmm_softc->mode == VMM_MODE_VMX) { + ret = vmx_mprotect_ept(vm->vm_map, sgpa, sgpa + size, prot); + } else { + pmap_write_protect(vm->vm_map->pmap, sgpa, sgpa + size, prot); + ret = 0; + } + + return (ret); +} + +/* + * vmx_mprotect_ept + * + * apply the ept protections to the requested pages, faulting the page if + * required. + */ +int +vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_t egpa, int prot) +{ + struct vmx_invept_descriptor vid; + pmap_t pmap = vm_map->pmap; + pt_entry_t *pte; + paddr_t addr; + int ret = 0; + +#ifdef VMM_DEBUG + dump_requested_vmx_mprotect_ept(prot); +#endif + for (addr = sgpa; addr < egpa; addr += PAGE_SIZE) { + pte = vmx_pmap_find_pte_ept(pmap, addr); + if(pte == NULL) { + ret = uvm_fault(vm_map, addr, VM_FAULT_INVALID, + PROT_READ | PROT_WRITE); + if (ret) + printf("%s: uvm_fault returns %d, GPA=0x%llx\n" + ,__func__, ret, (uint64_t)addr); + + pte = vmx_pmap_find_pte_ept(pmap, addr); + if(pte == NULL) + return EFAULT; + } +#ifdef VMM_DEBUG + vmx_dump_pte_prot(pte); +#endif + // XXX how to determine if this is a valid pte + if((*pte & EPT_WB) == EPT_WB) { + + if (prot & PROT_READ) + *pte |= EPT_R; + else + *pte &= ~EPT_R; + + if (prot & PROT_WRITE) + *pte |= EPT_W; + else + *pte &= ~EPT_W; + + if (prot & PROT_EXEC) + *pte |= EPT_X; + else + *pte &= ~EPT_X; + } +#ifdef VMM_DEBUG + vmx_dump_pte_after(pte); +#endif + + /* + * XXX bonus points - ensure permission combination is valid + * in EPT (eg, no "write only" pages, and ensure if "execute + * only" is requested, that the CPU actually supports it, etc) + */ + } + + /* + * SDM 3C: 28.3.3.4 Guidelines for Use of the INVEPT Instruction + * the first bullet point seems to say we 
should call invept. + * + * Software should use the INVEPT instruction with the “single-context” + * INVEPT type after making any of the following changes to an EPT + * paging-structure entry (the INVEPT descriptor should contain an + * EPTP value that references — directly or indirectly + * — the modified EPT paging structure): + * — Changing any of the privilege bits 2:0 from 1 to 0. + * */ + if (pmap->eptp != 0) { + memset(&vid, 0, sizeof(vid)); + vid.vid_eptp = pmap->eptp; + DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__, + vid.vid_eptp); + invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid); + } + + return ret; +} + +/* + * vmx_pmap_find_pte_ept + * + * find the page table entry specified by addr in the pmap supplied. + */ +pt_entry_t * +vmx_pmap_find_pte_ept(pmap_t pmap, paddr_t addr) +{ + int l4idx, l3idx, l2idx, l1idx; + pd_entry_t *pd; + paddr_t pdppa; + pt_entry_t *ptes; + + l4idx = (addr & L4_MASK) >> L4_SHIFT; /* PML4E idx */ + l3idx = (addr & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ + l2idx = (addr & L2_MASK) >> L2_SHIFT; /* PDE idx */ + l1idx = (addr & L1_MASK) >> L1_SHIFT; /* PTE idx */ + + pd = (pd_entry_t *)pmap->pm_pdir; + if(pd == NULL) + return NULL; + + /* + * l4idx should always be 0 since we don't support more than 512GB + * guest physical memory. + */ + if(l4idx > 0) + return NULL; + + /* + * l3idx should always be < MAXDSIZ/1GB because we don't support more + * than MAXDSIZ guest phys mem. + */ + if(l3idx >= MAXDSIZ / ((paddr_t)1024*1024*1024)) + return NULL; + + pdppa = pd[l4idx] & PG_FRAME; + if (pdppa == 0) + return NULL; + + ptes = (pt_entry_t *)PMAP_DIRECT_MAP(pdppa); + + pdppa = ptes[l3idx] & PG_FRAME; + if (pdppa == 0) + return NULL; + + ptes = (pt_entry_t *)PMAP_DIRECT_MAP(pdppa); + + pdppa = ptes[l2idx] & PG_FRAME; + if (pdppa == 0) + return NULL; + + ptes = (pt_entry_t *)PMAP_DIRECT_MAP(pdppa); + + return &ptes[l1idx]; +} + +/* * vm_find * * Function to find an existing VM by its identifier. 
@@ -8630,5 +8895,64 @@ vmm_decode_cpu_mode(struct vcpu *vcpu) case VMM_CPU_MODE_LONG: return "long"; default: return "unknown"; } +} + +void +dump_requested_vmx_mprotect_ept(int prot) +{ + DPRINTF("%s\nrequested: '", __func__); + if (prot & PROT_READ) + DPRINTF("R"); + + if (prot & PROT_WRITE) + DPRINTF("W"); + + if (prot & PROT_EXEC) + DPRINTF("X"); + DPRINTF("'\n"); +} + +void +vmx_dump_pte_prot(pt_entry_t *pte) +{ + DPRINTF("before: '"); + if(*pte & EPT_R) + DPRINTF("R"); + + if(*pte & EPT_W) + DPRINTF("W"); + + if(*pte & EPT_X) + DPRINTF("X"); + + if(*pte & EPT_WB) + DPRINTF("`WB`"); + + if(*pte & EPT_PS) + DPRINTF("`PS`"); + + DPRINTF("'"); +} + +void +vmx_dump_pte_after(pt_entry_t *pte) +{ + DPRINTF(" after: '"); + if(*pte & EPT_R) + DPRINTF("R"); + + if(*pte & EPT_W) + DPRINTF("W"); + + if(*pte & EPT_X) + DPRINTF("X"); + + if(*pte & EPT_WB) + DPRINTF("`WB`"); + + if(*pte & EPT_PS) + DPRINTF("`PS`"); + + DPRINTF("'\n"); } #endif /* VMM_DEBUG */ Index: sys/arch/amd64/include/vmmvar.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v retrieving revision 1.67 diff -u -p -u -p -r1.67 vmmvar.h --- sys/arch/amd64/include/vmmvar.h 17 Jul 2019 05:51:07 -0000 1.67 +++ sys/arch/amd64/include/vmmvar.h 6 Jan 2020 03:30:40 -0000 @@ -558,6 +558,15 @@ struct vm_rwregs_params { struct vcpu_reg_state vrwp_regs; }; +struct vm_mprotect_ept_params { + /* Input parameters to VMM_IOC_MPROTECT_EPT */ + uint32_t vmep_vm_id; + uint32_t vmep_vcpu_id; + vaddr_t vmep_sgpa; + size_t vmep_size; + int vmep_prot; +}; + /* IOCTL definitions */ #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */ #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */ @@ -571,7 +580,8 @@ struct vm_rwregs_params { #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params) /* Set VM params */ #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params) - +/* Control the 
protection of ept pages*/ +#define VMM_IOC_MPROTECT_EPT _IOW('V', 11, struct vm_mprotect_ept_params) /* CPUID masks */ /*