Right now kvm takes the mmap_sem for reading (down_read(mmap_sem)) to make
sure that no memory slots are
removed while they are being looked up, and to keep the dirty log safe.
However, in some cases, after down_read(mmap_sem) has been taken,
kvm_read_guest() is called, and its
copy_from_user() can cause a page fault that leads to recursively
taking the mmap_sem.
This patch replaces the mmap_sem with a new kvm private lock for protecting the memory slots.
--
woof.
>From 17ab59f76968731a1dc1067fc07d637b5c52e3da Mon Sep 17 00:00:00 2001
From: Izik Eidus <[EMAIL PROTECTED]>
Date: Sun, 10 Feb 2008 18:04:15 +0200
Subject: [PATCH] KVM: remove the usage of the mmap_sem for the protection of the memory slots.
This patch replaces the mmap_sem lock for the memory slots with a new
kvm private lock (slots_lock).
It is needed because until now there were cases where down_read() on the
mmap_sem could be called recursively
(for example, when doing copy_to_user() to non-present memory, which results in a
page fault).
Signed-off-by: Izik Eidus <[EMAIL PROTECTED]>
---
arch/x86/kvm/mmu.c | 24 +++++++++++++---
arch/x86/kvm/paging_tmpl.h | 13 ++++++---
arch/x86/kvm/vmx.c | 7 +++-
arch/x86/kvm/x86.c | 65 +++++++++++++++++++++++++-------------------
include/linux/kvm_host.h | 1 +
virt/kvm/kvm_main.c | 7 +++-
6 files changed, 76 insertions(+), 41 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 635e70c..461a8de 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -868,11 +868,18 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
+ struct page *page;
+
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
if (gpa == UNMAPPED_GVA)
return NULL;
- return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+ down_read(¤t->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ up_read(¤t->mm->mmap_sem);
+
+ return page;
}
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
@@ -1005,13 +1012,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
struct page *page;
+ down_read(&vcpu->kvm->slots_lock);
+
down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gfn);
+ up_read(¤t->mm->mmap_sem);
/* mmio */
if (is_error_page(page)) {
kvm_release_page_clean(page);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 1;
}
@@ -1020,7 +1030,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
r = __nonpaging_map(vcpu, v, write, gfn, page);
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return r;
}
@@ -1381,7 +1391,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!is_present_pte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gfn);
+ up_read(¤t->mm->mmap_sem);
+
if (is_error_page(page)) {
kvm_release_page_clean(page);
return;
@@ -1503,9 +1517,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b13e823..f58c143 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -91,7 +91,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
pt_element_t *table;
struct page *page;
+ down_read(¤t->mm->mmap_sem);
page = gfn_to_page(kvm, table_gfn);
+ up_read(¤t->mm->mmap_sem);
+
table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -375,7 +378,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (r)
return r;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
/*
* Look up the shadow pte for the faulting address.
*/
@@ -389,17 +392,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pgprintk("%s: guest page fault\n", __FUNCTION__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 0;
}
+ down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, walker.gfn);
+ up_read(¤t->mm->mmap_sem);
/* mmio */
if (is_error_page(page)) {
pgprintk("gfn %x is mmio\n", walker.gfn);
kvm_release_page_clean(page);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 1;
}
@@ -416,7 +421,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return write_pt;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8496dbe..e83c415 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1519,7 +1519,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1529,9 +1529,12 @@ static int alloc_apic_access_page(struct kvm *kvm)
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
if (r)
goto out;
+
+ down_read(¤t->mm->mmap_sem);
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+ up_read(¤t->mm->mmap_sem);
out:
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return r;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 08e1edc..93425bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -189,7 +189,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
int ret;
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
@@ -206,7 +206,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return ret;
}
@@ -220,13 +220,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return changed;
}
@@ -364,7 +364,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
*/
}
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
@@ -380,7 +380,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vcpu->arch.cr3 = cr3;
vcpu->arch.mmu.new_cr3(vcpu);
}
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
}
EXPORT_SYMBOL_GPL(set_cr3);
@@ -1214,12 +1214,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return 0;
}
@@ -1268,7 +1268,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
< alias->target_phys_addr)
goto out;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
p = &kvm->arch.aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1282,7 +1282,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
kvm_mmu_zap_all(kvm);
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return 0;
@@ -1358,7 +1358,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int is_dirty = 0;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
r = kvm_get_dirty_log(kvm, log, &is_dirty);
if (r)
@@ -1374,7 +1374,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
}
r = 0;
out:
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return r;
}
@@ -1570,7 +1570,7 @@ int emulator_read_std(unsigned long addr,
void *data = val;
int r = X86EMUL_CONTINUE;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
while (bytes) {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE-1);
@@ -1592,7 +1592,7 @@ int emulator_read_std(unsigned long addr,
addr += tocopy;
}
out:
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return r;
}
EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1611,9 +1611,9 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1651,14 +1651,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
{
int ret;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
if (ret < 0) {
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 0;
}
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 1;
}
@@ -1670,9 +1670,9 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_io_device *mmio_dev;
gpa_t gpa;
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
if (gpa == UNMAPPED_GVA) {
kvm_inject_page_fault(vcpu, addr, 2);
@@ -1749,7 +1749,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
char *kaddr;
u64 val;
- down_read(¤t->mm->mmap_sem);
+ down_read(&kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA ||
@@ -1760,13 +1760,17 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
goto emul_write;
val = *(u64 *)new;
+
+ down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ up_read(¤t->mm->mmap_sem);
+
kaddr = kmap_atomic(page, KM_USER0);
set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
kunmap_atomic(kaddr, KM_USER0);
kvm_release_page_dirty(page);
emul_write:
- up_read(¤t->mm->mmap_sem);
+ up_read(&kvm->slots_lock);
}
#endif
@@ -2159,10 +2163,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
kvm_x86_ops->skip_emulated_instruction(vcpu);
for (i = 0; i < nr_pages; ++i) {
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
page = gva_to_page(vcpu, address + i * PAGE_SIZE);
vcpu->arch.pio.guest_pages[i] = page;
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
if (!page) {
kvm_inject_gp(vcpu, 0);
free_pio_guest_pages(vcpu);
@@ -2485,8 +2489,9 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- vcpu->arch.apic->vapic_page = page;
up_read(¤t->mm->mmap_sem);
+
+ vcpu->arch.apic->vapic_page = page;
}
static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -2957,9 +2962,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
vcpu_load(vcpu);
- down_read(¤t->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
- up_read(¤t->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
@@ -3232,11 +3237,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
*/
if (!user_alloc) {
if (npages && !old.rmap) {
+ down_read(¤t->mm->mmap_sem);
memslot->userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS,
0);
+ up_read(¤t->mm->mmap_sem);
if (IS_ERR((void *)memslot->userspace_addr))
return PTR_ERR((void *)memslot->userspace_addr);
@@ -3244,8 +3251,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!old.user_alloc && old.rmap) {
int ret;
+ down_read(¤t->mm->mmap_sem);
ret = do_munmap(current->mm, old.userspace_addr,
old.npages * PAGE_SIZE);
+ up_read(¤t->mm->mmap_sem);
if (ret < 0)
printk(KERN_WARNING
"kvm_vm_ioctl_set_memory_region: "
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a84c912..b90ca36 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -109,6 +109,7 @@ struct kvm_memory_slot {
struct kvm {
struct mutex lock; /* protects the vcpus array and APIC accesses */
spinlock_t mmu_lock;
+ struct rw_semaphore slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */
int nmemslots;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4da308e..8ea19d5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -169,6 +169,7 @@ static struct kvm *kvm_create_vm(void)
kvm_io_bus_init(&kvm->pio_bus);
mutex_init(&kvm->lock);
kvm_io_bus_init(&kvm->mmio_bus);
+ init_rwsem(&kvm->slots_lock);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
@@ -339,9 +340,9 @@ int kvm_set_memory_region(struct kvm *kvm,
{
int r;
- down_write(¤t->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
r = __kvm_set_memory_region(kvm, mem, user_alloc);
- up_write(¤t->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -989,7 +990,9 @@ static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
return VM_FAULT_SIGBUS;
+
page = gfn_to_page(kvm, vmf->pgoff);
+
if (is_error_page(page)) {
kvm_release_page_clean(page);
return VM_FAULT_SIGBUS;
--
1.5.3.6
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel