Marcelo Tosatti wrote:
> Updated patch, now feature complete. Changes from last version:
>
> - Use __gfn_to_page in cmpxchg_pte() to avoid potential deadlock
> - Add kvm_read_guest_inatomic() and use it in fetch()
> - Make prefetch_page() use copy_from_user_inatomic()
> - Pass grabbed page down to mmu_set_spte to avoid a potential schedule
> with mmu_lock held (this could happen even without the page being
> swapped out because get_user_pages() calls cond_resched).
> - Convert a few missing mutex lock users to mmap_sem.
> - Grab the mutex lock when calling kvm_iodevice_{read,write}
>
> Please review.
>
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 401eb7c..1b375ba 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
> unsigned pt_access, unsigned pte_access,
> int user_fault, int write_fault, int dirty,
> - int *ptwrite, gfn_t gfn)
> + int *ptwrite, gfn_t gfn, struct page *userpage)
> {
> u64 spte;
> int was_rmapped = is_rmap_pte(*shadow_pte);
> @@ -907,7 +908,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
> if (!(pte_access & ACC_EXEC_MASK))
> spte |= PT64_NX_MASK;
>
> - page = gfn_to_page(vcpu->kvm, gfn);
> + if (userpage) {
> + page = userpage;
> + get_page(page);
> + } else
> + page = __gfn_to_page(vcpu->kvm, gfn);
>
>
This bit is unlovely. It would be better to have a single calling
convention where the caller always supplies userpage (which could just be
called 'page', no?). Thinking about it a bit more, the fallback is also
unsafe, no? mmu_set_spte() is called with the spinlock held, so it can't
call __gfn_to_page(), which may sleep in get_user_pages().
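Roughly what I have in mind at the call site, taking the names from the
fetch() path (a sketch only, with hypothetical surrounding code; the point
is that the sleeping lookup happens before mmu_lock is taken and
mmu_set_spte() always receives a pinned page):

	struct page *page;

	down_read(&current->mm->mmap_sem);
	/* sleeping lookup outside mmu_lock; use whichever variant ends up unlocked */
	page = gfn_to_page(vcpu->kvm, gfn);
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_set_spte(vcpu, shadow_pte, pt_access, pte_access,
		     user_fault, write_fault, dirty, ptwrite, gfn, page);
	spin_unlock(&vcpu->kvm->mmu_lock);
	/* mmu_set_spte() took its own reference, so drop ours */
	kvm_release_page_clean(page);
	up_read(&current->mm->mmap_sem);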
> @@ -91,7 +92,7 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
> pt_element_t *table;
> struct page *page;
>
> - page = gfn_to_page(kvm, table_gfn);
> + page = __gfn_to_page(kvm, table_gfn);
>
>
There are many of these. Perhaps we should kill __gfn_to_page() and make
gfn_to_page() itself unlocked.
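For concreteness, something like this (modeled on the existing helper,
with the locking pushed to the callers; a sketch, not the final code):

	/* caller must hold current->mm->mmap_sem for read */
	struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
	{
		struct page *page[1];
		unsigned long addr;
		int npages;

		addr = gfn_to_hva(kvm, gfn);
		if (kvm_is_error_hva(addr)) {
			get_page(bad_page);
			return bad_page;
		}
		npages = get_user_pages(current, current->mm, addr, 1, 1, 1,
					page, NULL);
		if (npages != 1) {
			get_page(bad_page);
			return bad_page;
		}
		return page[0];
	}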
> +static struct page *FNAME(pte_to_page)(struct kvm_vcpu *vcpu, const void *pte,
> + int bytes)
> +{
> + pt_element_t gpte = *(const pt_element_t *)pte;
> +
> + if (bytes < sizeof(pt_element_t))
> + return NULL;
> + if (!is_present_pte(gpte))
> + return NULL;
> +
> + return __gfn_to_page(vcpu->kvm, gpte_to_gfn(gpte));
> +}
>
This is wrong: how a gpte is interpreted depends not on the current vcpu
mode but on the role under which the shadow page was cached. A single
guest page can be cached simultaneously as a page table and as a page
directory, and in 32-bit, pae, and 64-bit formats. More on this below.
> @@ -423,27 +457,36 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
> static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
> struct kvm_mmu_page *sp)
> {
> - int i, offset = 0;
> + int i, r, offset = 0;
> pt_element_t *gpt;
> - struct page *page;
> -
> + void __user *src = (void __user *)gfn_to_hva(vcpu->kvm, sp->gfn);
> + void *dest = (void *)vcpu->kvm->prefetch_tmp_area;
> +
> if (sp->role.metaphysical
> || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
> nonpaging_prefetch_page(vcpu, sp);
> return;
> }
>
> + pagefault_disable();
> + r = __copy_from_user_inatomic(dest, src, PAGE_SIZE);
> + pagefault_enable();
> +
>
Please drop the temporary copy and put individual fetches in the loop
below (see the sketch that follows). While __copy_from_user_inatomic() is
suboptimal for 4- and 8-byte fetches, we can easily make it compile into a
two-instruction sequence. Failed fetches should be treated the same as
present ptes, since we can't be sure the pte is absent.
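Something along these lines, built on the kvm_read_guest_page_inatomic()
helper this patch adds (names and the quadrant offset are taken from the
existing prefetch_page(); don't take it literally):

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		pt_element_t gpte;

		r = kvm_read_guest_page_inatomic(vcpu->kvm, sp->gfn, &gpte,
						 (i + offset) * sizeof(gpte),
						 sizeof(gpte));
		/* on a failed fetch, assume the gpte is present */
		if (r || is_present_pte(gpte))
			sp->spt[i] = shadow_trap_nonpresent_pte;
		else
			sp->spt[i] = shadow_notrap_nonpresent_pte;
	}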
>
> @@ -1594,11 +1602,21 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
> const void *val, int bytes)
> {
> int ret;
> + struct page *page;
>
> + down_read(&current->mm->mmap_sem);
> ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
> - if (ret < 0)
> + if (ret < 0) {
> + up_read(&current->mm->mmap_sem);
> return 0;
> - kvm_mmu_pte_write(vcpu, gpa, val, bytes);
> + }
> + page = vcpu->arch.mmu.pte_to_page(vcpu, val, bytes);
>
Theoretically a single 8-byte write can map three different pages (one as
a 64-bit pte and two as 32-bit ptes), so this needs to change.
The only way around it I can see right now is to do a follow_page() inside
mmu_set_spte() and fail if we don't find the page (sketch below). This is
safe because the spte will already have been zapped, so the guest simply
refaults. It also gets rid of the new 'userpage' parameter.
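Very roughly (a hypothetical helper, assuming the fault path already holds
mmap_sem for read; the name spte_to_page_atomic() is made up):

	static struct page *spte_to_page_atomic(struct kvm *kvm, gfn_t gfn)
	{
		unsigned long addr = gfn_to_hva(kvm, gfn);
		struct vm_area_struct *vma;

		if (kvm_is_error_hva(addr))
			return NULL;
		/* caller holds mmap_sem for read */
		vma = find_vma(current->mm, addr);
		if (!vma || addr < vma->vm_start)
			return NULL;
		/* NULL if the page is not resident; caller then fails the map */
		return follow_page(vma, addr, FOLL_GET);
	}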
>
> struct kvm {
> struct mutex lock; /* protects everything except vcpus */
>
There are few enough comments as it is; let's keep this one updated now
that mmu_lock takes over part of the protection.
> + spinlock_t mmu_lock;
> struct mm_struct *mm; /* userspace tied to this vm */
> int nmemslots;
> struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
> KVM_PRIVATE_MEM_SLOTS];
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 845beb2..afdb767 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -165,12 +165,14 @@ static struct kvm *kvm_create_vm(void)
>
> kvm->mm = current->mm;
> atomic_inc(&kvm->mm->mm_count);
> + spin_lock_init(&kvm->mmu_lock);
> kvm_io_bus_init(&kvm->pio_bus);
> mutex_init(&kvm->lock);
> kvm_io_bus_init(&kvm->mmio_bus);
> spin_lock(&kvm_lock);
> list_add(&kvm->vm_list, &vm_list);
> spin_unlock(&kvm_lock);
> + kvm->prefetch_tmp_area = get_zeroed_page(GFP_KERNEL);
>
Missing failure handling.
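Something like this, perhaps (a sketch only; moot if the temporary copy is
dropped as suggested above, and the exact unwind depends on where in
kvm_create_vm() the allocation ends up):

	kvm->prefetch_tmp_area = get_zeroed_page(GFP_KERNEL);
	if (!kvm->prefetch_tmp_area) {
		/* undo the earlier setup: vm_list entry and mm_count */
		spin_lock(&kvm_lock);
		list_del(&kvm->vm_list);
		spin_unlock(&kvm_lock);
		mmdrop(kvm->mm);
		kfree(kvm);
		return ERR_PTR(-ENOMEM);
	}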
> @@ -552,6 +557,46 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
> }
> EXPORT_SYMBOL_GPL(kvm_read_guest);
>
> +int kvm_read_guest_page_inatomic(struct kvm *kvm, gfn_t gfn, void *data,
> + int offset, int len)
> +{
> + int r;
> + unsigned long addr;
> +
> + addr = gfn_to_hva(kvm, gfn);
> + if (kvm_is_error_hva(addr))
> + return -EFAULT;
> + pagefault_disable();
>
Is pagefault_disable() not implied by the spinlock we're holding?
> + r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
> + pagefault_enable();
> + if (r)
> + return -EFAULT;
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_read_guest_page_inatomic);
> +
> +int kvm_read_guest_inatomic(struct kvm *kvm, gpa_t gpa, void *data,
> + unsigned long len)
> +{
>
Since all atomic reads are single pte fetches, I don't think we need to
support page-spanning atomic reads; callers can use
kvm_read_guest_page_inatomic() directly, as below.
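That is, something like this at the call site (hypothetical; 'pte_gpa'
and 'gpte' stand for the guest-physical address and value of the pte being
fetched):

	r = kvm_read_guest_page_inatomic(vcpu->kvm, pte_gpa >> PAGE_SHIFT,
					 &gpte, offset_in_page(pte_gpa),
					 sizeof(gpte));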
Apart from these comments, please split the patch into a few more
digestible chunks, perhaps along the lines of:
- patches to prepare mmu for atomicity
- add mmu spinlock
- replace kvm->mutex by mm->mmap_sem where appropriate
Again, great patch, great results.
--
Any sufficiently difficult bug is indistinguishable from a feature.