Re: [PATCH kernel v9 28/32] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-05-05 Thread David Gibson
On Fri, May 01, 2015 at 09:26:48PM +1000, Alexey Kardashevskiy wrote:
> On 04/29/2015 05:01 PM, David Gibson wrote:
> >On Sat, Apr 25, 2015 at 10:14:52PM +1000, Alexey Kardashevskiy wrote:
> >>We are adding support for DMA memory pre-registration to be used in
> >>conjunction with VFIO. The idea is that the userspace which is going to
> >>run a guest may want to pre-register a user space memory region so
> >>it all gets pinned once and never goes away. Having this done,
> >>a hypervisor will not have to pin/unpin pages on every DMA map/unmap
> >>request. This is going to help with multiple pinning of the same memory
> >>and in-kernel acceleration of DMA requests.
> >>
> >>This adds a list of memory regions to mm_context_t. Each region consists
> >>of a header and a list of physical addresses. This adds API to:
> >>1. register/unregister memory regions;
> >>2. do final cleanup (which puts all pre-registered pages);
> >>3. do userspace to physical address translation;
> >>4. manage a mapped pages counter; when it is zero, it is safe to
> >>unregister the region.
> >>
> >>Multiple registration of the same region is allowed, kref is used to
> >>track the number of registrations.
> >>
> >>Signed-off-by: Alexey Kardashevskiy 
> >>---
> >>Changes:
> >>v8:
> >>* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
> >>* fixed error fallback loop (s/[i]/[j]/)
> >>---
> >>  arch/powerpc/include/asm/mmu-hash64.h  |   3 +
> >>  arch/powerpc/include/asm/mmu_context.h |  17 +++
> >>  arch/powerpc/mm/Makefile   |   1 +
> >>  arch/powerpc/mm/mmu_context_hash64.c   |   6 +
> >>  arch/powerpc/mm/mmu_context_hash64_iommu.c | 215 +
> >>  5 files changed, 242 insertions(+)
> >>  create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c
> >>
> >>diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> >>index 1da6a81..a82f534 100644
> >>--- a/arch/powerpc/include/asm/mmu-hash64.h
> >>+++ b/arch/powerpc/include/asm/mmu-hash64.h
> >>@@ -536,6 +536,9 @@ typedef struct {
> >>/* for 4K PTE fragment support */
> >>void *pte_frag;
> >>  #endif
> >>+#ifdef CONFIG_SPAPR_TCE_IOMMU
> >>+   struct list_head iommu_group_mem_list;
> >>+#endif
> >
> >Urgh.  I know I'm not one to talk, having done the hugepage crap in
> >there, but man mm_context_t has grown to a bloated mess from originally
> >being just intended as a context ID integer :/.
> 
> 
> Where else to put it then?... The other way to go would be some global map
> of pid<->iommu_group_mem_list which needs to be available from both VFIO and
> KVM.

I'd suggest putting it as a new field in mm_struct, guarded by a
CONFIG_VFIO_PREREGISTER (or something) which you can make sure is
selected by CONFIG_SPAPR_TCE_IOMMU.
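
Roughly what I have in mind (a sketch only; CONFIG_VFIO_PREREGISTER is
a made-up name and the exact placement is up for discussion):

/* include/linux/mm_types.h */
struct mm_struct {
	/* ... existing fields ... */
#ifdef CONFIG_VFIO_PREREGISTER
	/* preregistered regions, shared by VFIO and KVM */
	struct list_head iommu_group_mem_list;
#endif
};

with something like this on the Kconfig side:

config VFIO_PREREGISTER
	bool

config SPAPR_TCE_IOMMU
	# ... existing options ...
	select VFIO_PREREGISTER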

> 
> 
> >>  } mm_context_t;
> >>
> >>
> >>diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> >>index 73382eb..d6116ca 100644
> >>--- a/arch/powerpc/include/asm/mmu_context.h
> >>+++ b/arch/powerpc/include/asm/mmu_context.h
> >>@@ -16,6 +16,23 @@
> >>   */
> >>  extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
> >>  extern void destroy_context(struct mm_struct *mm);
> >>+#ifdef CONFIG_SPAPR_TCE_IOMMU
> >>+struct mm_iommu_table_group_mem_t;
> >>+
> >>+extern bool mm_iommu_preregistered(void);
> >>+extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
> >>+   struct mm_iommu_table_group_mem_t **pmem);
> >>+extern struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
> >>+   unsigned long entries);
> >>+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
> >>+extern void mm_iommu_cleanup(mm_context_t *ctx);
> >>+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
> >>+   unsigned long size);
> >>+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> >>+   unsigned long ua, unsigned long *hpa);
> >>+extern long mm_iommu_mapped_update(struct mm_iommu_table_group_mem_t *mem,
> >>+   bool inc);
> >>+#endif
> >>
> >>  extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
> >>  extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
> >>diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> >>index 9c8770b..e216704 100644
> >>--- a/arch/powerpc/mm/Makefile
> >>+++ b/arch/powerpc/mm/Makefile
> >>@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)+= subpage-prot.o
> >>  obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
> >>  obj-$(CONFIG_HIGHMEM) += highmem.o
> >>  obj-$(CONFIG_PPC_COPRO_BASE)  += copro_fault.o
> >>+obj-$(CONFIG_SPAPR_TCE_IOMMU)  += mmu_context_hash64_iommu.o
> >>diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
> >>index 178876ae..eb3080c 100644
> >>--- a/arch/powerpc/mm/mmu_context_hash64.c
> >>+++ b/arch/powerpc/mm/mmu_context_hash64.c

Re: [PATCH kernel v9 28/32] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-05-01 Thread Alexey Kardashevskiy

On 04/29/2015 05:01 PM, David Gibson wrote:

On Sat, Apr 25, 2015 at 10:14:52PM +1000, Alexey Kardashevskiy wrote:

We are adding support for DMA memory pre-registration to be used in
conjunction with VFIO. The idea is that the userspace which is going to
run a guest may want to pre-register a user space memory region so
it all gets pinned once and never goes away. Having this done,
a hypervisor will not have to pin/unpin pages on every DMA map/unmap
request. This is going to help with multiple pinning of the same memory
and in-kernel acceleration of DMA requests.

This adds a list of memory regions to mm_context_t. Each region consists
of a header and a list of physical addresses. This adds API to:
1. register/unregister memory regions;
2. do final cleanup (which puts all pre-registered pages);
3. do userspace to physical address translation;
4. manage a mapped pages counter; when it is zero, it is safe to
unregister the region.

Multiple registration of the same region is allowed, kref is used to
track the number of registrations.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v8:
* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
* fixed error fallback loop (s/[i]/[j]/)
---
  arch/powerpc/include/asm/mmu-hash64.h  |   3 +
  arch/powerpc/include/asm/mmu_context.h |  17 +++
  arch/powerpc/mm/Makefile   |   1 +
  arch/powerpc/mm/mmu_context_hash64.c   |   6 +
  arch/powerpc/mm/mmu_context_hash64_iommu.c | 215 +
  5 files changed, 242 insertions(+)
  create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 1da6a81..a82f534 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -536,6 +536,9 @@ typedef struct {
/* for 4K PTE fragment support */
void *pte_frag;
  #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+   struct list_head iommu_group_mem_list;
+#endif


Urgh.  I know I'm not one to talk, having done the hugepage crap in
there, but man mm_context_t has grown to a bloated mess from originally
being just intended as a context ID integer :/.



Where else to put it then?... The other way to go would be some global map 
of pid<->iommu_group_mem_list which needs to be available from both VFIO 
and KVM.
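
Something like this, say (just a sketch of the idea, untested):

/* global registry instead of a field in mm_context_t */
static DEFINE_MUTEX(mem_list_mutex);
static LIST_HEAD(mem_list_owners);

struct mem_list_owner {
	struct mm_struct *mm;		/* the key */
	struct list_head mem_list;	/* mm_iommu_table_group_mem_t list */
	struct list_head next;
};

/* caller holds mem_list_mutex */
static struct mem_list_owner *mem_list_find(struct mm_struct *mm)
{
	struct mem_list_owner *owner;

	list_for_each_entry(owner, &mem_list_owners, next)
		if (owner->mm == mm)
			return owner;

	return NULL;
}

Both VFIO and KVM would then look the list up via current->mm (or an
mm saved at container/VM creation time) instead of mm->context.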




  } mm_context_t;


diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 73382eb..d6116ca 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -16,6 +16,23 @@
   */
  extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
  extern void destroy_context(struct mm_struct *mm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+struct mm_iommu_table_group_mem_t;
+
+extern bool mm_iommu_preregistered(void);
+extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
+   struct mm_iommu_table_group_mem_t **pmem);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
+   unsigned long entries);
+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+   unsigned long size);
+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+   unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_mapped_update(struct mm_iommu_table_group_mem_t *mem,
+   bool inc);
+#endif

  extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
  extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 9c8770b..e216704 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)+= subpage-prot.o
  obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
  obj-$(CONFIG_HIGHMEM) += highmem.o
  obj-$(CONFIG_PPC_COPRO_BASE)  += copro_fault.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU)  += mmu_context_hash64_iommu.o
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 178876ae..eb3080c 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
  #ifdef CONFIG_PPC_64K_PAGES
mm->context.pte_frag = NULL;
  #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+   INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+#endif
return 0;
  }

@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)

  void destroy_context(struct mm_struct *mm)
  {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+   mm_iommu_cleanup(&mm->context);
+#endif

  #ifdef CONFIG_PPC_ICSWX
drop_cop(mm->context.acop, mm);
diff --git a/arch/powerpc/mm/mmu_context_hash64_iommu.c b/arch/powerpc/mm/mmu_context_hash64_iommu.c

Re: [PATCH kernel v9 28/32] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-04-30 Thread David Gibson
On Thu, Apr 30, 2015 at 06:25:25PM +1000, Paul Mackerras wrote:
> On Thu, Apr 30, 2015 at 04:34:55PM +1000, David Gibson wrote:
> > On Sat, Apr 25, 2015 at 10:14:52PM +1000, Alexey Kardashevskiy wrote:
> > > We are adding support for DMA memory pre-registration to be used in
> > > conjunction with VFIO. The idea is that the userspace which is going to
> > > run a guest may want to pre-register a user space memory region so
> > > it all gets pinned once and never goes away. Having this done,
> > > a hypervisor will not have to pin/unpin pages on every DMA map/unmap
> > > request. This is going to help with multiple pinning of the same memory
> > > and in-kernel acceleration of DMA requests.
> > > 
> > > This adds a list of memory regions to mm_context_t. Each region consists
> > > of a header and a list of physical addresses. This adds API to:
> > > 1. register/unregister memory regions;
> > > 2. do final cleanup (which puts all pre-registered pages);
> > > 3. do userspace to physical address translation;
> > > 4. manage a mapped pages counter; when it is zero, it is safe to
> > > unregister the region.
> > > 
> > > Multiple registration of the same region is allowed, kref is used to
> > > track the number of registrations.
> > 
> > [snip]
> > > +long mm_iommu_alloc(unsigned long ua, unsigned long entries,
> > > + struct mm_iommu_table_group_mem_t **pmem)
> > > +{
> > > + struct mm_iommu_table_group_mem_t *mem;
> > > + long i, j;
> > > + struct page *page = NULL;
> > > +
> > > + list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
> > > + next) {
> > > + if ((mem->ua == ua) && (mem->entries == entries))
> > > + return -EBUSY;
> > > +
> > > + /* Overlap? */
> > > + if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
> > > + (ua < (mem->ua + (mem->entries << PAGE_SHIFT))))
> > > + return -EINVAL;
> > > + }
> > > +
> > > + mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> > > + if (!mem)
> > > + return -ENOMEM;
> > > +
> > > + mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
> > > + if (!mem->hpas) {
> > > + kfree(mem);
> > > + return -ENOMEM;
> > > + }
> > 
> > So, I've thought more about this and I'm really confused as to what
> > this is supposed to be accomplishing.
> > 
> > I see that you need to keep track of what regions are registered, so
> > you don't double lock or unlock, but I don't see what the point of
> > actually storing the translations in hpas is.
> > 
> > I had assumed it was so that you could later on get to the
> > translations in real mode when you do in-kernel acceleration.  But
> > that doesn't make sense, because the array is vmalloc()ed, so can't be
> > accessed in real mode anyway.
> 
> We can access vmalloc'd arrays in real mode using real_vmalloc_addr().

Ah, ok.

> > I can't think of a circumstance in which you can use hpas where you
> > couldn't just walk the page tables anyway.
> 
> The problem with walking the page tables is that there is no guarantee
> that the page you find that way is the page that was returned by the
> gup_fast() we did earlier.  Storing the hpas means that we know for
> sure that the page we're doing DMA to is one that we have an elevated
> page count on.
> 
> Also, there are various points where a Linux PTE is made temporarily
> invalid for a short time.  If we happened to do a H_PUT_TCE on one cpu
> while another cpu was doing that, we'd get a spurious failure returned
> by the H_PUT_TCE.

I think we want this explanation in the commit message.  And/or in a
comment somewhere, I'm not sure.
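
Something like this above mm_iommu_alloc(), say (wording is just a
suggestion):

/*
 * We cache the host physical addresses returned by gup_fast() instead
 * of re-walking the page tables at H_PUT_TCE time: a table walk could
 * find a different page from the one whose refcount we elevated, and
 * Linux PTEs are made temporarily invalid at various points, which
 * would make H_PUT_TCE fail spuriously.
 */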

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson



Re: [PATCH kernel v9 28/32] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-04-30 Thread Paul Mackerras
On Thu, Apr 30, 2015 at 04:34:55PM +1000, David Gibson wrote:
> On Sat, Apr 25, 2015 at 10:14:52PM +1000, Alexey Kardashevskiy wrote:
> > We are adding support for DMA memory pre-registration to be used in
> > conjunction with VFIO. The idea is that the userspace which is going to
> > run a guest may want to pre-register a user space memory region so
> > it all gets pinned once and never goes away. Having this done,
> > a hypervisor will not have to pin/unpin pages on every DMA map/unmap
> > request. This is going to help with multiple pinning of the same memory
> > and in-kernel acceleration of DMA requests.
> > 
> > This adds a list of memory regions to mm_context_t. Each region consists
> > of a header and a list of physical addresses. This adds API to:
> > 1. register/unregister memory regions;
> > 2. do final cleanup (which puts all pre-registered pages);
> > 3. do userspace to physical address translation;
> > 4. manage a mapped pages counter; when it is zero, it is safe to
> > unregister the region.
> > 
> > Multiple registration of the same region is allowed, kref is used to
> > track the number of registrations.
> 
> [snip]
> > +long mm_iommu_alloc(unsigned long ua, unsigned long entries,
> > +   struct mm_iommu_table_group_mem_t **pmem)
> > +{
> > +   struct mm_iommu_table_group_mem_t *mem;
> > +   long i, j;
> > +   struct page *page = NULL;
> > +
> > +   list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
> > +   next) {
> > +   if ((mem->ua == ua) && (mem->entries == entries))
> > +   return -EBUSY;
> > +
> > +   /* Overlap? */
> > +   if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
> > +   (ua < (mem->ua + (mem->entries << PAGE_SHIFT))))
> > +   return -EINVAL;
> > +   }
> > +
> > +   mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> > +   if (!mem)
> > +   return -ENOMEM;
> > +
> > +   mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
> > +   if (!mem->hpas) {
> > +   kfree(mem);
> > +   return -ENOMEM;
> > +   }
> 
> So, I've thought more about this and I'm really confused as to what
> this is supposed to be accomplishing.
> 
> I see that you need to keep track of what regions are registered, so
> you don't double lock or unlock, but I don't see what the point of
> actually storing the translations in hpas is.
> 
> I had assumed it was so that you could later on get to the
> translations in real mode when you do in-kernel acceleration.  But
> that doesn't make sense, because the array is vmalloc()ed, so can't be
> accessed in real mode anyway.

We can access vmalloc'd arrays in real mode using real_vmalloc_addr().
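
(For reference, that helper lives in arch/powerpc/kvm/book3s_hv_rm_mmu.c
and is roughly the following -- quoted from memory, so check the tree:)

static void *real_vmalloc_addr(void *x)
{
	unsigned long addr = (unsigned long) x;
	pte_t *p;

	/* walk the kernel page table to find the backing real page */
	p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
	if (!p || !pte_present(*p))
		return NULL;
	/* assumes no huge pages in the vmalloc space */
	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
	return __va(addr);
}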

> I can't think of a circumstance in which you can use hpas where you
> couldn't just walk the page tables anyway.

The problem with walking the page tables is that there is no guarantee
that the page you find that way is the page that was returned by the
gup_fast() we did earlier.  Storing the hpas means that we know for
sure that the page we're doing DMA to is one that we have an elevated
page count on.

Also, there are various points where a Linux PTE is made temporarily
invalid for a short time.  If we happened to do a H_PUT_TCE on one cpu
while another cpu was doing that, we'd get a spurious failure returned
by the H_PUT_TCE.
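
In other words the intended pattern at registration time is roughly
this (a sketch, not the exact patch code):

	for (i = 0; i < entries; ++i) {
		/* take a reference on each page up front ... */
		if (get_user_pages_fast(ua + (i << PAGE_SHIFT),
				1 /* nr_pages */, 1 /* write */, &page) != 1) {
			/* back out pages [0..i) -- the v8 s/[i]/[j]/ fix */
			for (j = 0; j < i; ++j)
				put_page(pfn_to_page(
						mem->hpas[j] >> PAGE_SHIFT));
			vfree(mem->hpas);
			kfree(mem);
			return -EFAULT;
		}
		/* ... and remember its host physical address */
		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
	}

so that H_PUT_TCE can use mem->hpas[] without touching the page tables
at all.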

Paul.

Re: [PATCH kernel v9 28/32] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-04-30 Thread David Gibson
On Sat, Apr 25, 2015 at 10:14:52PM +1000, Alexey Kardashevskiy wrote:
> We are adding support for DMA memory pre-registration to be used in
> conjunction with VFIO. The idea is that the userspace which is going to
> run a guest may want to pre-register a user space memory region so
> it all gets pinned once and never goes away. Having this done,
> a hypervisor will not have to pin/unpin pages on every DMA map/unmap
> request. This is going to help with multiple pinning of the same memory
> and in-kernel acceleration of DMA requests.
> 
> This adds a list of memory regions to mm_context_t. Each region consists
> of a header and a list of physical addresses. This adds API to:
> 1. register/unregister memory regions;
> 2. do final cleanup (which puts all pre-registered pages);
> 3. do userspace to physical address translation;
> 4. manage a mapped pages counter; when it is zero, it is safe to
> unregister the region.
> 
> Multiple registration of the same region is allowed, kref is used to
> track the number of registrations.

[snip]
> +long mm_iommu_alloc(unsigned long ua, unsigned long entries,
> + struct mm_iommu_table_group_mem_t **pmem)
> +{
> + struct mm_iommu_table_group_mem_t *mem;
> + long i, j;
> + struct page *page = NULL;
> +
> + list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
> + next) {
> + if ((mem->ua == ua) && (mem->entries == entries))
> + return -EBUSY;
> +
> + /* Overlap? */
> + if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
> + (ua < (mem->ua + (mem->entries << PAGE_SHIFT))))
> + return -EINVAL;
> + }
> +
> + mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> + if (!mem)
> + return -ENOMEM;
> +
> + mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
> + if (!mem->hpas) {
> + kfree(mem);
> + return -ENOMEM;
> + }

So, I've thought more about this and I'm really confused as to what
this is supposed to be accomplishing.

I see that you need to keep track of what regions are registered, so
you don't double lock or unlock, but I don't see what the point of
actually storing the translations in hpas is.

I had assumed it was so that you could later on get to the
translations in real mode when you do in-kernel acceleration.  But
that doesn't make sense, because the array is vmalloc()ed, so can't be
accessed in real mode anyway.

I can't think of a circumstance in which you can use hpas where you
couldn't just walk the page tables anyway.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson



Re: [PATCH kernel v9 28/32] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-04-29 Thread David Gibson
On Sat, Apr 25, 2015 at 10:14:52PM +1000, Alexey Kardashevskiy wrote:
> We are adding support for DMA memory pre-registration to be used in
> conjunction with VFIO. The idea is that the userspace which is going to
> run a guest may want to pre-register a user space memory region so
> it all gets pinned once and never goes away. Having this done,
> a hypervisor will not have to pin/unpin pages on every DMA map/unmap
> request. This is going to help with multiple pinning of the same memory
> and in-kernel acceleration of DMA requests.
> 
> This adds a list of memory regions to mm_context_t. Each region consists
> of a header and a list of physical addresses. This adds API to:
> 1. register/unregister memory regions;
> 2. do final cleanup (which puts all pre-registered pages);
> 3. do userspace to physical address translation;
> 4. manage a mapped pages counter; when it is zero, it is safe to
> unregister the region.
> 
> Multiple registration of the same region is allowed, kref is used to
> track the number of registrations.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v8:
> * s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
> * fixed error fallback loop (s/[i]/[j]/)
> ---
>  arch/powerpc/include/asm/mmu-hash64.h  |   3 +
>  arch/powerpc/include/asm/mmu_context.h |  17 +++
>  arch/powerpc/mm/Makefile   |   1 +
>  arch/powerpc/mm/mmu_context_hash64.c   |   6 +
>  arch/powerpc/mm/mmu_context_hash64_iommu.c | 215 +
>  5 files changed, 242 insertions(+)
>  create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c
> 
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> index 1da6a81..a82f534 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -536,6 +536,9 @@ typedef struct {
>   /* for 4K PTE fragment support */
>   void *pte_frag;
>  #endif
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> + struct list_head iommu_group_mem_list;
> +#endif

Urgh.  I know I'm not one to talk, having done the hugepage crap in
> there, but man mm_context_t has grown to a bloated mess from originally
being just intended as a context ID integer :/.

>  } mm_context_t;
>  
>  
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index 73382eb..d6116ca 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -16,6 +16,23 @@
>   */
>  extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
>  extern void destroy_context(struct mm_struct *mm);
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> +struct mm_iommu_table_group_mem_t;
> +
> +extern bool mm_iommu_preregistered(void);
> +extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
> + struct mm_iommu_table_group_mem_t **pmem);
> +extern struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
> + unsigned long entries);
> +extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
> +extern void mm_iommu_cleanup(mm_context_t *ctx);
> +extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
> + unsigned long size);
> +extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> + unsigned long ua, unsigned long *hpa);
> +extern long mm_iommu_mapped_update(struct mm_iommu_table_group_mem_t *mem,
> + bool inc);
> +#endif
>  
>  extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
>  extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index 9c8770b..e216704 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)  += subpage-prot.o
>  obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
>  obj-$(CONFIG_HIGHMEM)+= highmem.o
>  obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
> +obj-$(CONFIG_SPAPR_TCE_IOMMU)+= mmu_context_hash64_iommu.o
> diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
> index 178876ae..eb3080c 100644
> --- a/arch/powerpc/mm/mmu_context_hash64.c
> +++ b/arch/powerpc/mm/mmu_context_hash64.c
> @@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>  #ifdef CONFIG_PPC_64K_PAGES
>   mm->context.pte_frag = NULL;
>  #endif
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> + INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
> +#endif
>   return 0;
>  }
>  
> @@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
>  
>  void destroy_context(struct mm_struct *mm)
>  {
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> + mm_iommu_cleanup(&mm->context);
> +#endif
>  
>  #ifdef CONFIG_PPC_ICSWX
>   drop_cop(mm->context.acop, mm);
> diff --git a/arch/powerpc/mm/mmu_context_hash64_iommu.c b/arch/powerpc/mm/mmu_context_hash64_iommu.c

[PATCH kernel v9 28/32] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-04-25 Thread Alexey Kardashevskiy
We are adding support for DMA memory pre-registration to be used in
conjunction with VFIO. The idea is that the userspace which is going to
run a guest may want to pre-register a user space memory region so
it all gets pinned once and never goes away. Having this done,
a hypervisor will not have to pin/unpin pages on every DMA map/unmap
request. This is going to help with multiple pinning of the same memory
and in-kernel acceleration of DMA requests.

This adds a list of memory regions to mm_context_t. Each region consists
of a header and a list of physical addresses. This adds API to:
1. register/unregister memory regions;
2. do final cleanup (which puts all pre-registered pages);
3. do userspace to physical address translation;
4. manage a mapped pages counter; when it is zero, it is safe to
unregister the region.

Multiple registration of the same region is allowed, kref is used to
track the number of registrations.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v8:
* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
* fixed error fallback loop (s/[i]/[j]/)
---
 arch/powerpc/include/asm/mmu-hash64.h  |   3 +
 arch/powerpc/include/asm/mmu_context.h |  17 +++
 arch/powerpc/mm/Makefile   |   1 +
 arch/powerpc/mm/mmu_context_hash64.c   |   6 +
 arch/powerpc/mm/mmu_context_hash64_iommu.c | 215 +
 5 files changed, 242 insertions(+)
 create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 1da6a81..a82f534 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -536,6 +536,9 @@ typedef struct {
/* for 4K PTE fragment support */
void *pte_frag;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+   struct list_head iommu_group_mem_list;
+#endif
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 73382eb..d6116ca 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -16,6 +16,23 @@
  */
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 extern void destroy_context(struct mm_struct *mm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+struct mm_iommu_table_group_mem_t;
+
+extern bool mm_iommu_preregistered(void);
+extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
+   struct mm_iommu_table_group_mem_t **pmem);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
+   unsigned long entries);
+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+   unsigned long size);
+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+   unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_mapped_update(struct mm_iommu_table_group_mem_t *mem,
+   bool inc);
+#endif
 
 extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 9c8770b..e216704 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)  += highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU)  += mmu_context_hash64_iommu.o
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 178876ae..eb3080c 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 #ifdef CONFIG_PPC_64K_PAGES
mm->context.pte_frag = NULL;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+   INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+#endif
return 0;
 }
 
@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
 
 void destroy_context(struct mm_struct *mm)
 {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+   mm_iommu_cleanup(&mm->context);
+#endif
 
 #ifdef CONFIG_PPC_ICSWX
drop_cop(mm->context.acop, mm);
diff --git a/arch/powerpc/mm/mmu_context_hash64_iommu.c b/arch/powerpc/mm/mmu_context_hash64_iommu.c
new file mode 100644
index 0000000..af7668c
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash64_iommu.c
@@ -0,0 +1,215 @@
+/*
+ *  IOMMU helpers in MMU context.
+ *
+ *  Copyright (C) 2015 IBM Corp. 
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.