[PATCH 2/4] KVM: PPC: Book3S HV: Add a mechanism for recording modified HPTEs

2012-11-13 Thread Paul Mackerras
This uses a bit in our record of the guest view of the HPTE to record
when the HPTE gets modified.  We use a reserved bit for this, and ensure
that this bit is always cleared in HPTE values returned to the guest.
The recording of modified HPTEs is only done if other code indicates
its interest by setting kvm->arch.hpte_mod_interest to a non-zero value.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_book3s_64.h |6 ++
 arch/powerpc/include/asm/kvm_host.h  |1 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |   25 ++---
 3 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 1472a5b..4ca4f25 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -50,6 +50,12 @@ extern int kvm_hpt_order;/* order of 
preallocated HPTs */
 #define HPTE_V_HVLOCK  0x40UL
 #define HPTE_V_ABSENT  0x20UL
 
+/*
+ * We use this bit in the guest_rpte field of the revmap entry
+ * to indicate a modified HPTE.
+ */
+#define HPTE_GR_MODIFIED   (1ul << 62)
+
 static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 {
unsigned long tmp, old;
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 3093896..58c7264 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -248,6 +248,7 @@ struct kvm_arch {
atomic_t vcpus_running;
unsigned long hpt_npte;
unsigned long hpt_mask;
+   atomic_t hpte_mod_interest;
spinlock_t slot_phys_lock;
unsigned short last_vcpu[NR_CPUS];
struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 362dffe..726231a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -66,6 +66,18 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct 
revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if it wasn't modified before and anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+ struct revmap_entry *rev)
+{
+   if (!(rev->guest_rpte & HPTE_GR_MODIFIED) &&
+   atomic_read(&kvm->arch.hpte_mod_interest))
+   rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
struct revmap_entry *rev,
@@ -287,8 +299,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
flags,
rev = &kvm->arch.revmap[pte_index];
if (realmode)
rev = real_vmalloc_addr(rev);
-   if (rev)
+   if (rev) {
rev->guest_rpte = g_ptel;
+   note_hpte_modification(kvm, rev);
+   }
 
/* Link HPTE into reverse-map chain */
if (pteh & HPTE_V_VALID) {
@@ -392,7 +406,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long 
flags,
/* Read PTE low word after tlbie to get final R/C values */
remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
}
-   r = rev->guest_rpte;
+   r = rev->guest_rpte & ~HPTE_GR_MODIFIED;
+   note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
 
vcpu->arch.gpr[4] = v;
@@ -466,6 +481,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 
args[j] = ((0x80 | flags) << 56) + pte_index;
rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+   note_hpte_modification(kvm, rev);
 
if (!(hp[0] & HPTE_V_VALID)) {
/* insert R and C bits from PTE */
@@ -555,6 +571,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long 
flags,
if (rev) {
r = (rev->guest_rpte & ~mask) | bits;
rev->guest_rpte = r;
+   note_hpte_modification(kvm, rev);
}
r = (hpte[1] & ~mask) | bits;
 
@@ -606,8 +623,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long 
flags,
v &= ~HPTE_V_ABSENT;
v |= HPTE_V_VALID;
}
-   if (v & HPTE_V_VALID)
+   if (v & HPTE_V_VALID) {
r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
+   r &= ~HPTE_GR_MODIFIED;
+   }
vcpu->arch.gpr[4 + i * 2] = v;
vcpu->arch.gpr[5 + i * 2] = r;
}
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] KVM: PPC: Book3S HV: Restructure HPT entry creation code

2012-11-13 Thread Paul Mackerras
This restructures the code that creates HPT (hashed page table)
entries so that it can be called in situations where we don't have a
struct vcpu pointer, only a struct kvm pointer.  It also fixes a bug
where kvmppc_map_vrma() would corrupt the guest R4 value.

Most of the work of kvmppc_virtmode_h_enter is now done by a new
function, kvmppc_virtmode_do_h_enter, which itself calls another new
function, kvmppc_do_h_enter, which contains most of the old
kvmppc_h_enter.  The new kvmppc_do_h_enter takes explicit arguments
for the place to return the HPTE index, the Linux page tables to use,
and whether it is being called in real mode, thus removing the need
for it to have the vcpu as an argument.

Currently kvmppc_map_vrma creates the VRMA (virtual real mode area)
HPTEs by calling kvmppc_virtmode_h_enter, which is designed primarily
to handle H_ENTER hcalls from the guest that need to pin a page of
memory.  Since H_ENTER returns the index of the created HPTE in R4,
kvmppc_virtmode_h_enter updates the guest R4, corrupting the guest R4
in the case when it gets called from kvmppc_map_vrma on the first
VCPU_RUN ioctl.  With this, kvmppc_map_vrma instead calls
kvmppc_virtmode_do_h_enter with the address of a dummy word as the
place to store the HPTE index, thus avoiding corrupting the guest R4.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_book3s.h |5 +++--
 arch/powerpc/kvm/book3s_64_mmu_hv.c   |   36 +++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   |   27 -
 3 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 36fcf41..fea768f 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -157,8 +157,9 @@ extern void *kvmppc_pin_guest_page(struct kvm *kvm, 
unsigned long addr,
 extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
 extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel);
-extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
-   long pte_index, unsigned long pteh, unsigned long ptel);
+extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
+   long pte_index, unsigned long pteh, unsigned long ptel,
+   pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
 extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map);
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 2a89a36..6ee6516 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -41,6 +41,10 @@
 /* Power architecture requires HPT is at least 256kB */
 #define PPC_MIN_HPT_ORDER  18
 
+static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+   long pte_index, unsigned long pteh,
+   unsigned long ptel, unsigned long *pte_idx_ret);
+
 long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
unsigned long hpt;
@@ -185,6 +189,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct 
kvm_memory_slot *memslot,
unsigned long addr, hash;
unsigned long psize;
unsigned long hp0, hp1;
+   unsigned long idx_ret;
long ret;
struct kvm *kvm = vcpu->kvm;
 
@@ -216,7 +221,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct 
kvm_memory_slot *memslot,
hash = (hash << 3) + 7;
hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
hp_r = hp1 | addr;
-   ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
+   ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
+&idx_ret);
if (ret != H_SUCCESS) {
pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
   addr, ret);
@@ -354,15 +360,10 @@ static long kvmppc_get_guest_page(struct kvm *kvm, 
unsigned long gfn,
return err;
 }
 
-/*
- * We come here on a H_ENTER call from the guest when we are not
- * using mmu notifiers and we don't have the requested page pinned
- * already.
- */
-long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
-   long pte_index, unsigned long pteh, unsigned long ptel)
+long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+   long pte_index, unsigned long pteh,
+   unsigned long ptel, unsigned long *pte_idx_ret)
 {
-   struct kvm *kvm = vcpu->kvm;
unsigned long psize, gpa, gfn;
struct kvm_memory_slot *memslot;
long ret;
@@ -390,8 +391,8 @@ long kvmppc_virtmode_h_enter(st

[PATCH 4/4] KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT

2012-11-13 Thread Paul Mackerras
A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor.  Reads on
this fd return the contents of the HPT (hashed page table), writes
create and/or remove entries in the HPT.  There is a new capability,
KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl.  The ioctl
takes an argument structure with the index of the first HPT entry to
read out and a set of flags.  The flags indicate whether the user is
intending to read or write the HPT, and whether to return all entries
or only the "bolted" entries (those with the bolted bit, 0x10, set in
the first doubleword).

This is intended for use in implementing qemu's savevm/loadvm and for
live migration.  Therefore, on reads, the first pass returns information
about all HPTEs (or all bolted HPTEs).  When the first pass reaches the
end of the HPT, it returns from the read.  Subsequent reads only return
information about HPTEs that have changed since they were last read.
A read that finds no changed HPTEs in the HPT following where the last
read finished will return 0 bytes.

The format of the data provides a simple run-length compression of the
invalid entries.  Each block of data starts with a header that indicates
the index (position in the HPT, which is just an array), the number of
valid entries starting at that index (may be zero), and the number of
invalid entries following those valid entries.  The valid entries, 16
bytes each, follow the header.  The invalid entries are not explicitly
represented.

Signed-off-by: Paul Mackerras 
---
 Documentation/virtual/kvm/api.txt|   53 +
 arch/powerpc/include/asm/kvm_book3s_64.h |   18 ++
 arch/powerpc/include/asm/kvm_ppc.h   |2 +
 arch/powerpc/include/uapi/asm/kvm.h  |   24 +++
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  344 ++
 arch/powerpc/kvm/book3s_hv.c |   12 --
 arch/powerpc/kvm/powerpc.c   |   17 ++
 include/uapi/linux/kvm.h |3 +
 8 files changed, 461 insertions(+), 12 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 6671fdc..33080ea 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2071,6 +2071,59 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; 
source cpu in parm
 
 Note that the vcpu ioctl is asynchronous to vcpu execution.
 
+4.78 KVM_PPC_GET_HTAB_FD
+
+Capability: KVM_CAP_PPC_HTAB_FD
+Architectures: powerpc
+Type: vm ioctl
+Parameters: Pointer to struct kvm_get_htab_fd (in)
+Returns: file descriptor number (>= 0) on success, -1 on error
+
+This returns a file descriptor that can be used either to read out the
+entries in the guest's hashed page table (HPT), or to write entries to
+initialize the HPT.  The returned fd can only be written to if the
+KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
+can only be read if that bit is clear.  The argument struct looks like
+this:
+
+/* For KVM_PPC_GET_HTAB_FD */
+struct kvm_get_htab_fd {
+   __u64   flags;
+   __u64   start_index;
+};
+
+/* Values for kvm_get_htab_fd.flags */
+#define KVM_GET_HTAB_BOLTED_ONLY   ((__u64)0x1)
+#define KVM_GET_HTAB_WRITE ((__u64)0x2)
+
+The `start_index' field gives the index in the HPT of the entry at
+which to start reading.  It is ignored when writing.
+
+Reads on the fd will initially supply information about all
+"interesting" HPT entries.  Interesting entries are those with the
+bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise
+all entries.  When the end of the HPT is reached, the read() will
+return.  If read() is called again on the fd, it will start again from
+the beginning of the HPT, but will only return HPT entries that have
+changed since they were last read.
+
+Data read or written is structured as a header (8 bytes) followed by a
+series of valid HPT entries (16 bytes) each.  The header indicates how
+many valid HPT entries there are and how many invalid entries follow
+the valid entries.  The invalid entries are not represented explicitly
+in the stream.  The header format is:
+
+struct kvm_get_htab_header {
+   __u32   index;
+   __u16   n_valid;
+   __u16   n_invalid;
+};
+
+Writes to the fd create HPT entries starting at the index given in the
+header; first `n_valid' valid entries with contents from the data
+written, then `n_invalid' invalid entries, invalidating any previously
+valid entries found.
+
 
 5. The kvm_run structure
 
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 4ca4f25..dc0a78d 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -243,4 +243,22 @@ static inline bool slot_is_aligned(struct kvm_memory_slot 
*memslot,
return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
 }
 
+static inline unsigned long slb_pgsize_encoding(unsigned long psize)
+{
+   unsigned

[PATCH 3/4] KVM: PPC: Book3S HV: Make a HPTE removal function available

2012-11-13 Thread Paul Mackerras
This makes a HPTE removal function, kvmppc_do_h_remove(), available
outside book3s_hv_rm_mmu.c.  This will be used by the HPT writing
code.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_book3s.h |3 +++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   |   19 +--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index fea768f..46763d10 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -160,6 +160,9 @@ extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, 
unsigned long flags,
 extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel,
pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
+extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+   unsigned long pte_index, unsigned long avpn,
+   unsigned long *hpret);
 extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map);
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 726231a..e407e97 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -364,11 +364,10 @@ static inline int try_lock_tlbie(unsigned int *lock)
return old == 0;
 }
 
-long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
-unsigned long pte_index, unsigned long avpn,
-unsigned long va)
+long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+   unsigned long pte_index, unsigned long avpn,
+   unsigned long *hpret)
 {
-   struct kvm *kvm = vcpu->kvm;
unsigned long *hpte;
unsigned long v, r, rb;
struct revmap_entry *rev;
@@ -410,10 +409,18 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long 
flags,
note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
 
-   vcpu->arch.gpr[4] = v;
-   vcpu->arch.gpr[5] = r;
+   hpret[0] = v;
+   hpret[1] = r;
return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+unsigned long pte_index, unsigned long avpn)
+{
+   return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
+ &vcpu->arch.gpr[4]);
+}
 
 long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/4] KVM: PPC: Book3S HV: HPT read/write functions for userspace

2012-11-13 Thread Paul Mackerras
This series of patches provides an interface by which userspace can
read and write the hashed page table (HPT) of a Book3S HV guest.
The interface is an ioctl which provides a file descriptor which can
be accessed with the read() and write() system calls.  The data read
and written is the guest view of the HPT, in which the second
doubleword of each HPTE (HPT entry) contains a guest physical address,
as distinct from the real HPT that the hardware accesses, where the
second doubleword of each HPTE contains a real address.

Because the HPT is divided into groups (HPTEGs) of 8 entries each,
where each HPTEG usually only contains a few valid entries, or none,
the data format that we use does run-length encoding of the invalid
entries, so in fact the invalid entries take up no space in the
stream.

The interface also provides for doing multiple passes over the HPT,
where the first pass provides information on all HPTEs, and subsequent
passes only return the HPTEs that have changed since the previous pass.

I have implemented a read/write interface rather than an mmap-based
interface because the data is not stored contiguously anywhere in
kernel memory.  Of each 16-byte HPTE, the first 8 bytes come from the
real HPT and the second 8 bytes come from the parallel vmalloc'd array
where we store the guest view of the guest physical address,
permissions, accessed/dirty bits etc.  Thus a mmap-based interface
would not be practicable (not without doubling the size of the
parallel array, typically requiring an extra 8MB of kernel memory per
guest).  This is also why I have not used the memslot interface for
this.

This implements the interface for HV-style KVM but not for PR-style
KVM.  Userspace does not need any additional interface with PR-style
KVM because userspace maintains the guest HPT already in that case,
and has an image of the guest view of the HPT in its address space.

This series is against the next branch of the kvm tree.  The patches
are basically identical to the previous posting of the series, just
rediffed for the move of kvm.h from include/linux to
include/uapi/linux, and for commit 8ca40a70a7 ("KVM: Take kvm instead
of vcpu to mmu_notifier_retry"), which supersedes patch 1 of the old
series.

The overall diffstat is:

 Documentation/virtual/kvm/api.txt|   53 +
 arch/powerpc/include/asm/kvm_book3s.h|8 +-
 arch/powerpc/include/asm/kvm_book3s_64.h |   24 ++
 arch/powerpc/include/asm/kvm_host.h  |1 +
 arch/powerpc/include/asm/kvm_ppc.h   |2 +
 arch/powerpc/include/uapi/asm/kvm.h  |   24 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  380 +-
 arch/powerpc/kvm/book3s_hv.c |   12 -
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |   71 --
 arch/powerpc/kvm/powerpc.c   |   17 ++
 include/uapi/linux/kvm.h |3 +
 11 files changed, 551 insertions(+), 44 deletions(-)

Please apply.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html