On Mon, Oct 08, 2018 at 04:31:08PM +1100, Paul Mackerras wrote:
> From: Suraj Jitindar Singh <sjitindarsi...@gmail.com>
> 
> When a host (L0) page which is mapped into a (L1) guest is in turn
> mapped through to a nested (L2) guest we keep a reverse mapping (rmap)
> so that these mappings can be retrieved later.
> 
> Whenever we create an entry in a shadow_pgtable for a nested guest we
> create a corresponding rmap entry and add it to the list for the
> L1 guest memslot at the index of the L1 guest page it maps. This means
> at the L1 guest memslot we end up with lists of rmaps.
> 
> When we are notified of a host page being invalidated which has been
> mapped through to a (L1) guest, we can then walk the rmap list for that
> guest page, and find and invalidate all of the corresponding
> shadow_pgtable entries.
> 
> In order to reduce memory consumption, we compress the information for
> each rmap entry down to 52 bits -- 12 bits for the LPID and 40 bits
> for the guest real page frame number -- which will fit in a single
> unsigned long.  To avoid a scenario where a guest can trigger
> unbounded memory allocations, we scan the list when adding an entry to
> see if there is already an entry with the contents we need.  This can
> occur because we don't ever remove entries from the middle of a list.
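
For anyone following along, the packing is easy to play with outside the
kernel; a throwaway userspace sketch (not patch code, values made up) using
the RMAP_NESTED_* constants the patch adds to kvm_book3s_64.h below:

    #include <stdint.h>
    #include <stdio.h>

    #define RMAP_NESTED_LPID_MASK   0xFFF0000000000000UL
    #define RMAP_NESTED_LPID_SHIFT  52
    #define RMAP_NESTED_GPA_MASK    0x000FFFFFFFFFF000UL

    int main(void)
    {
            uint64_t lpid = 5, gpa = 0x12345000UL;  /* example values only */
            /* pack: lpid in bits 63:52, guest 4k page frame in bits 51:12 */
            uint64_t rmap = ((lpid << RMAP_NESTED_LPID_SHIFT) & RMAP_NESTED_LPID_MASK) |
                            (gpa & RMAP_NESTED_GPA_MASK);

            /* unpack and print "lpid=5 gpa=0x12345000" */
            printf("lpid=%llu gpa=0x%llx\n",
                   (unsigned long long)((rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT),
                   (unsigned long long)(rmap & RMAP_NESTED_GPA_MASK));
            return 0;
    }

Bit 0 stays clear here; it is reserved for the single-entry flag described
below.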
> 
> A struct nested guest rmap is a list pointer and an rmap entry:
> ----------------
> | next pointer |
> ----------------
> | rmap entry   |
> ----------------
> 
> Thus the rmap pointer for each guest frame number in the memslot can be
> either NULL, a single entry, or a pointer to a list of nested rmap entries.
> 
> gfn    memslot rmap array
>       -------------------------
>  0    | NULL                  |       (no rmap entry)
>       -------------------------
>  1    | single rmap entry     |       (rmap entry with low bit set)
>       -------------------------
>  2    | list head pointer     |       (list of rmap entries)
>       -------------------------
> 
> The final entry always has the lowest bit set and is stored in the next
> pointer of the last list entry, or as a single rmap entry.
> With a list of rmap entries looking like:
> 
> -----------------       -----------------       -------------------------
> | list head ptr | ----> | next pointer  | ----> | single rmap entry     |
> -----------------       -----------------       -------------------------
>                         | rmap entry    |       | rmap entry            |
>                         -----------------       -------------------------
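
Since struct rmap_nested is pointer-aligned, bit 0 of the memslot word is
free to carry that flag.  Just restating the table above in code, with a
hypothetical helper name (not from the patch; assumes the
RMAP_NESTED_IS_SINGLE_ENTRY define added below):

    enum nest_rmap_kind { NEST_RMAP_NONE, NEST_RMAP_SINGLE, NEST_RMAP_LIST };

    static enum nest_rmap_kind classify_nest_rmap(unsigned long rmap_word)
    {
            if (!rmap_word)
                    return NEST_RMAP_NONE;    /* no nested mappings for this gfn */
            if (rmap_word & RMAP_NESTED_IS_SINGLE_ENTRY)
                    return NEST_RMAP_SINGLE;  /* the word itself is the rmap entry */
            return NEST_RMAP_LIST;            /* the word is an llist head pointer */
    }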
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsi...@gmail.com>
> Signed-off-by: Paul Mackerras <pau...@ozlabs.org>

Reviewed-by: David Gibson <da...@gibson.dropbear.id.au>

> ---
>  arch/powerpc/include/asm/kvm_book3s.h    |   3 +
>  arch/powerpc/include/asm/kvm_book3s_64.h |  69 +++++++++++++++-
>  arch/powerpc/kvm/book3s_64_mmu_radix.c   |  44 +++++++---
>  arch/powerpc/kvm/book3s_hv.c             |   1 +
>  arch/powerpc/kvm/book3s_hv_nested.c      | 138 ++++++++++++++++++++++++++++++-
>  5 files changed, 240 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 63f7ccf..d7aeb6f 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -196,6 +196,9 @@ extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
>                       int table_index, u64 *pte_ret_p);
>  extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>                       struct kvmppc_pte *gpte, bool data, bool iswrite);
> +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
> +                     unsigned int shift, struct kvm_memory_slot *memslot,
> +                     unsigned int lpid);
>  extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
>                                   bool writing, unsigned long gpa,
>                                   unsigned int lpid);
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 5496152..c2a9146 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -53,6 +53,66 @@ struct kvm_nested_guest {
>       struct kvm_nested_guest *next;
>  };
>  
> +/*
> + * We define a nested rmap entry as a single 64-bit quantity
> + * 0xFFF0000000000000        12-bit lpid field
> + * 0x000FFFFFFFFFF000        40-bit guest 4k page frame number
> + * 0x0000000000000001        1-bit  single entry flag
> + */
> +#define RMAP_NESTED_LPID_MASK                0xFFF0000000000000UL
> +#define RMAP_NESTED_LPID_SHIFT               (52)
> +#define RMAP_NESTED_GPA_MASK         0x000FFFFFFFFFF000UL
> +#define RMAP_NESTED_IS_SINGLE_ENTRY  0x0000000000000001UL
> +
> +/* Structure for a nested guest rmap entry */
> +struct rmap_nested {
> +     struct llist_node list;
> +     u64 rmap;
> +};
> +
> +/*
> + * for_each_nest_rmap_safe - iterate over the list of nested rmap entries
> + *                        safe against removal of the list entry or NULL list
> + * @pos:     a (struct rmap_nested *) to use as a loop cursor
> + * @node:    pointer to the first entry
> + *           NOTE: this can be NULL
> + * @rmapp:   an (unsigned long *) in which to return the rmap entries on each
> + *           iteration
> + *           NOTE: this must point to already allocated memory
> + *
> + * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
> + * rmap entry in the memslot. The list is always terminated by a "single entry"
> + * stored in the list element of the final entry of the llist. If there is ONLY
> + * a single entry then this is itself in the rmap entry of the memslot, not a
> + * llist head pointer.
> + *
> + * Note that the iterator below assumes that a nested rmap entry is always
> + * non-zero.  This is true for our usage because the LPID field is always
> + * non-zero (zero is reserved for the host).
> + *
> + * This should be used to iterate over the list of rmap_nested entries with
> + * processing done on the u64 rmap value given by each iteration. This is safe
> + * against removal of list entries and it is always safe to call free on (pos).
> + *
> + * e.g.
> + * struct rmap_nested *cursor;
> + * struct llist_node *first;
> + * unsigned long rmap;
> + * for_each_nest_rmap_safe(cursor, first, &rmap) {
> + *   do_something(rmap);
> + *   free(cursor);
> + * }
> + */
> +#define for_each_nest_rmap_safe(pos, node, rmapp)                           \
> +     for ((pos) = llist_entry((node), typeof(*(pos)), list);                \
> +          (node) &&                                                         \
> +          (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?     \
> +                       ((u64) (node)) : ((pos)->rmap))) &&                  \
> +          (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?      \
> +                      ((struct llist_node *) ((pos) = NULL)) :              \
> +                      (pos)->list.next)), true);                            \
> +          (pos) = llist_entry((node), typeof(*(pos)), list))
> +
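
The macro is fairly dense, so a standalone userspace mock (not patch code,
hypothetical names, values made up) of the walk it performs may help: two
real list elements whose final ->next holds the bit-0-tagged terminator,
which is itself the last rmap entry.  In the real macro, (pos) is set to
NULL for that final iteration, which is why the comment says it is always
safe to free (pos) in the loop body.

    #include <stdint.h>
    #include <stdio.h>

    #define RMAP_NESTED_IS_SINGLE_ENTRY 0x1UL

    /* mirrors struct rmap_nested: one next pointer, then the packed entry */
    struct mock_rmap_nested { struct mock_rmap_nested *next; uint64_t rmap; };

    int main(void)
    {
            uint64_t last = 0x0030000000042000UL | RMAP_NESTED_IS_SINGLE_ENTRY;
            struct mock_rmap_nested b = {
                    (struct mock_rmap_nested *)(uintptr_t)last, 0x0020000000041000UL };
            struct mock_rmap_nested a = { &b, 0x0010000000040000UL };
            struct mock_rmap_nested *node = &a;   /* memslot word acting as list head */

            while (node) {
                    uint64_t rmap;
                    if ((uintptr_t)node & RMAP_NESTED_IS_SINGLE_ENTRY) {
                            rmap = (uintptr_t)node;  /* terminator doubles as last entry */
                            node = NULL;             /* walk ends after this iteration */
                    } else {
                            rmap = node->rmap;
                            node = node->next;       /* real pointer or the terminator */
                    }
                    printf("rmap entry 0x%016llx\n",
                           (unsigned long long)(rmap & ~RMAP_NESTED_IS_SINGLE_ENTRY));
            }
            return 0;
    }

(The single-entry-only case, where the memslot word carries the entry
directly, is just the degenerate walk of the terminator alone.)
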
>  struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
>                                         bool create);
>  void kvmhv_put_nested(struct kvm_nested_guest *gp);
> @@ -551,7 +611,14 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
>  
>  extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                            unsigned long gpa, unsigned int level,
> -                          unsigned long mmu_seq, unsigned int lpid);
> +                          unsigned long mmu_seq, unsigned int lpid,
> +                          unsigned long *rmapp, struct rmap_nested **n_rmap);
> +extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
> +                                struct rmap_nested **n_rmap);
> +extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
> +                             struct kvm_memory_slot *memslot,
> +                             unsigned long gpa, unsigned long hpa,
> +                             unsigned long nbytes);
>  
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
>  
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index c4b1a9e..4c1eccb 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -256,27 +256,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
>       kmem_cache_free(kvm_pmd_cache, pmdp);
>  }
>  
> -void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
> -                   unsigned long gpa, unsigned int shift,
> -                   struct kvm_memory_slot *memslot,
> +/* Called with kvm->mmu_lock held */
> +void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
> +                   unsigned int shift, struct kvm_memory_slot *memslot,
>                     unsigned int lpid)
>  
>  {
>       unsigned long old;
> +     unsigned long gfn = gpa >> PAGE_SHIFT;
> +     unsigned long page_size = PAGE_SIZE;
> +     unsigned long hpa;
>  
>       old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
>       kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
> -     if ((old & _PAGE_DIRTY) && (lpid == kvm->arch.lpid)) {
> -             unsigned long gfn = gpa >> PAGE_SHIFT;
> -             unsigned long page_size = PAGE_SIZE;
>  
> -             if (shift)
> -                     page_size = 1ul << shift;
> +     /* The following only applies to L1 entries */
> +     if (lpid != kvm->arch.lpid)
> +             return;
> +
> +     if (!memslot) {
> +             memslot = gfn_to_memslot(kvm, gfn);
>               if (!memslot)
> -                     memslot = gfn_to_memslot(kvm, gfn);
> -             if (memslot && memslot->dirty_bitmap)
> -                     kvmppc_update_dirty_map(memslot, gfn, page_size);
> +                     return;
>       }
> +     if (shift)
> +             page_size = 1ul << shift;
> +
> +     gpa &= ~(page_size - 1);
> +     hpa = old & PTE_RPN_MASK;
> +     kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
> +
> +     if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
> +             kvmppc_update_dirty_map(memslot, gfn, page_size);
>  }
>  
>  /*
> @@ -430,7 +441,8 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
>  
>  int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                     unsigned long gpa, unsigned int level,
> -                   unsigned long mmu_seq, unsigned int lpid)
> +                   unsigned long mmu_seq, unsigned int lpid,
> +                   unsigned long *rmapp, struct rmap_nested **n_rmap)
>  {
>       pgd_t *pgd;
>       pud_t *pud, *new_pud = NULL;
> @@ -509,6 +521,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                       kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
>               }
>               kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
> +             if (rmapp && n_rmap)
> +                     kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>               ret = 0;
>               goto out_unlock;
>       }
> @@ -559,6 +573,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>                       kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
>               }
>               kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
> +             if (rmapp && n_rmap)
> +                     kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>               ret = 0;
>               goto out_unlock;
>       }
> @@ -583,6 +599,8 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
>               goto out_unlock;
>       }
>       kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
> +     if (rmapp && n_rmap)
> +             kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
>       ret = 0;
>  
>   out_unlock:
> @@ -710,7 +728,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
>  
>       /* Allocate space in the tree and write the PTE */
>       ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
> -                             mmu_seq, kvm->arch.lpid);
> +                             mmu_seq, kvm->arch.lpid, NULL, NULL);
>       if (inserted_pte)
>               *inserted_pte = pte;
>       if (levelp)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index dc25461..cb9e738 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4482,6 +4482,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
>       kvmppc_free_hpt(&kvm->arch.hpt);
>       kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
>                          LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
> +     kvmppc_rmap_reset(kvm);
>       kvm->arch.radix = 1;
>       return 0;
>  }
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 21a210c..3fa676b 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -10,6 +10,7 @@
>  
>  #include <linux/kernel.h>
>  #include <linux/kvm_host.h>
> +#include <linux/llist.h>
>  
>  #include <asm/kvm_ppc.h>
>  #include <asm/kvm_book3s.h>
> @@ -22,6 +23,7 @@
>  static struct patb_entry *pseries_partition_tb;
>  
>  static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
> +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
>  
>  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
>  {
> @@ -456,6 +458,8 @@ void kvmhv_release_all_nested(struct kvm *kvm)
>       int i;
>       struct kvm_nested_guest *gp;
>       struct kvm_nested_guest *freelist = NULL;
> +     struct kvm_memory_slot *memslot;
> +     int srcu_idx;
>  
>       spin_lock(&kvm->mmu_lock);
>       for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
> @@ -474,6 +478,11 @@ void kvmhv_release_all_nested(struct kvm *kvm)
>               freelist = gp->next;
>               kvmhv_release_nested(gp);
>       }
> +
> +     srcu_idx = srcu_read_lock(&kvm->srcu);
> +     kvm_for_each_memslot(memslot, kvm_memslots(kvm))
> +             kvmhv_free_memslot_nest_rmap(memslot);
> +     srcu_read_unlock(&kvm->srcu, srcu_idx);
>  }
>  
>  /* caller must hold gp->tlb_lock */
> @@ -544,6 +553,123 @@ void kvmhv_put_nested(struct kvm_nested_guest *gp)
>               kvmhv_release_nested(gp);
>  }
>  
> +static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
> +{
> +     if (lpid > kvm->arch.max_nested_lpid)
> +             return NULL;
> +     return kvm->arch.nested_guests[lpid];
> +}
> +
> +static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
> +{
> +     return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
> +                                    RMAP_NESTED_GPA_MASK));
> +}
> +
> +void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
> +                         struct rmap_nested **n_rmap)
> +{
> +     struct llist_node *entry = ((struct llist_head *) rmapp)->first;
> +     struct rmap_nested *cursor;
> +     u64 rmap, new_rmap = (*n_rmap)->rmap;
> +
> +     /* Are there any existing entries? */
> +     if (!(*rmapp)) {
> +             /* No -> use the rmap as a single entry */
> +             *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
> +             return;
> +     }
> +
> +     /* Do any entries match what we're trying to insert? */
> +     for_each_nest_rmap_safe(cursor, entry, &rmap) {
> +             if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
> +                     return;
> +     }
> +
> +     /* Do we need to create a list or just add the new entry? */
> +     rmap = *rmapp;
> +     if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
> +             *rmapp = 0UL;
> +     llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
> +     if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
> +             (*n_rmap)->list.next = (struct llist_node *) rmap;
> +
> +     /* Set NULL so not freed by caller */
> +     *n_rmap = NULL;
> +}
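
The single-to-list promotion took me a second to follow; the state of *rmapp
across successive inserts looks like this (E1..E3 are packed rmap values,
n2/n3 are the newly allocated rmap_nested structs; illustrative only):

    *rmapp == 0         insert E1  ->  *rmapp = E1 | 1                       (single entry)
    *rmapp == E1 | 1    insert E2  ->  *rmapp = &n2,  n2.list.next = E1 | 1  (now a list)
    *rmapp == &n2       insert E3  ->  *rmapp = &n3,  n3.list.next = &n2

so the old single-entry value migrates into the new node's next pointer and
keeps serving as the bit-0 terminator at the tail of the list.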
> +
> +static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
> +                                unsigned long hpa, unsigned long mask)
> +{
> +     struct kvm_nested_guest *gp;
> +     unsigned long gpa;
> +     unsigned int shift, lpid;
> +     pte_t *ptep;
> +
> +     gpa = n_rmap & RMAP_NESTED_GPA_MASK;
> +     lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
> +     gp = kvmhv_find_nested(kvm, lpid);
> +     if (!gp)
> +             return;
> +
> +     /* Find and invalidate the pte */
> +     ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
> +     /* Don't spuriously invalidate ptes if the pfn has changed */
> +     if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
> +             kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
> +}
> +
> +static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
> +                                     unsigned long hpa, unsigned long mask)
> +{
> +     struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
> +     struct rmap_nested *cursor;
> +     unsigned long rmap;
> +
> +     for_each_nest_rmap_safe(cursor, entry, &rmap) {
> +             kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
> +             kfree(cursor);
> +     }
> +}
> +
> +/* called with kvm->mmu_lock held */
> +void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
> +                               struct kvm_memory_slot *memslot,
> +                               unsigned long gpa, unsigned long hpa,
> +                               unsigned long nbytes)
> +{
> +     unsigned long gfn, end_gfn;
> +     unsigned long addr_mask;
> +
> +     if (!memslot)
> +             return;
> +     gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
> +     end_gfn = gfn + (nbytes >> PAGE_SHIFT);
> +
> +     addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
> +     hpa &= addr_mask;
> +
> +     for (; gfn < end_gfn; gfn++) {
> +             unsigned long *rmap = &memslot->arch.rmap[gfn];
> +             kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
> +     }
> +}
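
To spell out the masking for one case: if the L1 page being torn down is a
2MB page, nbytes is 0x200000, so addr_mask keeps only the RPN bits at or
above the 2MB boundary and the loop walks the 512 4k gfns backing it; each
nested shadow PTE found via the rmap lists is then invalidated only if its
host address still falls within that same 2MB frame.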
> +
> +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
> +{
> +     unsigned long page;
> +
> +     for (page = 0; page < free->npages; page++) {
> +             unsigned long rmap, *rmapp = &free->arch.rmap[page];
> +             struct rmap_nested *cursor;
> +             struct llist_node *entry;
> +
> +             entry = llist_del_all((struct llist_head *) rmapp);
> +             for_each_nest_rmap_safe(cursor, entry, &rmap)
> +                     kfree(cursor);
> +     }
> +}
> +
>  static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
>                                       struct kvm_nested_guest *gp,
>                                       long gpa, int *shift_ret)
> @@ -695,11 +821,13 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
>  {
>       struct kvm *kvm = vcpu->kvm;
>       struct kvm_memory_slot *memslot;
> +     struct rmap_nested *n_rmap;
>       struct kvmppc_pte gpte;
>       pte_t pte, *pte_p;
>       unsigned long mmu_seq;
>       unsigned long dsisr = vcpu->arch.fault_dsisr;
>       unsigned long ea = vcpu->arch.fault_dar;
> +     unsigned long *rmapp;
>       unsigned long n_gpa, gpa, gfn, perm = 0UL;
>       unsigned int shift, l1_shift, level;
>       bool writing = !!(dsisr & DSISR_ISSTORE);
> @@ -833,8 +961,16 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
>  
>       /* 4. Insert the pte into our shadow_pgtable */
>  
> +     n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
> +     if (!n_rmap)
> +             return RESUME_GUEST; /* Let the guest try again */
> +     n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
> +             (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
> +     rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
>       ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
> -                             mmu_seq, gp->shadow_lpid);
> +                             mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
> +     if (n_rmap)
> +             kfree(n_rmap);
>       if (ret == -EAGAIN)
>               ret = RESUME_GUEST;     /* Let the guest try again */
>  

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson
