[PATCH v4 08/28] vfio: powerpc/spapr: Register memory

2015-02-16 Thread Alexey Kardashevskiy
The existing implementation accounts the whole DMA window in
the locked_vm counter, which is only going to get worse with multiple
containers and huge DMA windows.

This introduces two ioctls to register/unregister DMA memory. Each
receives the userspace address and size of a memory region which
needs to be pinned/unpinned and counted in locked_vm.
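
For illustration only, registration from userspace would look roughly
like the sketch below, assuming the structure added to
include/uapi/linux/vfio.h follows the usual VFIO argsz/flags pattern
with vaddr/size fields (the uapi hunk is not shown in this excerpt;
register_guest_ram() and its parameters are hypothetical):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper; container_fd is an open, initialized VFIO
 * container, buf/size describe a page-aligned guest RAM region. */
static void register_guest_ram(int container_fd, void *buf, size_t size)
{
	/* Assumed layout of the structure this patch adds to vfio.h */
	struct vfio_iommu_spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.flags = 0,
		.vaddr = (__u64)(unsigned long)buf,
		.size = size,
	};

	if (ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg))
		perror("VFIO_IOMMU_SPAPR_REGISTER_MEMORY");

	/* ... DMA maps within [buf, buf + size) hit pinned memory ... */

	/* Unregistering must use the exact same vaddr and size */
	if (ioctl(container_fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg))
		perror("VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY");
}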

If any memory region was registered, all subsequent DMA map requests
must address already pinned memory. If no memory was registered,
then the amount of memory required for a single default DMA window is
accounted when the container is enabled, and every map/unmap pins/unpins
a page (with degraded performance).

Dynamic DMA window and in-kernel acceleration will require memory to
be preregistered in order to work.

The accounting is done per VFIO container. When support for
multiple groups per container is added, we will have accurate locked_vm
accounting.
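
For reference, the locked_vm helpers used below come from an earlier
patch in this series; the increment side is roughly the following
sketch (an RLIMIT_MEMLOCK check under mmap_sem, not the exact code
from that patch):

static long try_increment_locked_vm(long npages)
{
	long ret = 0, locked, lock_limit;

	if (!current || !current->mm)
		return -ESRCH; /* process exited */

	down_write(&current->mm->mmap_sem);
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM; /* would exceed the rlimit */
	else
		current->mm->locked_vm += npages;
	up_write(&current->mm->mmap_sem);

	return ret;
}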

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
Changes:
v4:
* updated docs
* s/kzalloc/vzalloc/
* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
replaced offset with index
* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
and removed duplicating vfio_iommu_spapr_register_memory
---
 Documentation/vfio.txt  |  19 +++
 drivers/vfio/vfio_iommu_spapr_tce.c | 274 +++-
 include/uapi/linux/vfio.h   |  25 
 3 files changed, 312 insertions(+), 6 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..791e85c 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -427,6 +427,25 @@ The code flow from the example above should be slightly changed:
 

 
+5) PPC64 paravirtualized guests may generate a lot of map/unmap requests,
+and the handling of those includes pinning/unpinning pages and updating
+the mm::locked_vm counter to make sure we do not exceed the rlimit. Handling
+these in real mode is quite expensive and may fail. In order to simplify
+in-kernel acceleration of map/unmap requests, two ioctls have been added to
+pre-register and unregister guest RAM pages where DMA can possibly happen.
+Having these calls, the userspace and in-kernel handlers do not have to take
+care of pinning or accounting.
+
+The ioctls are VFIO_IOMMU_SPAPR_REGISTER_MEMORY and
+VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
+These receive a user space address and size of the block to be pinned.
+Bisecting is not supported and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY is expected
+be called with the exact address and size used for registering
+the memory block.
+
+The user space is not expected to call these often and the block descriptors
+are stored in a linked list in the kernel.
+
 ---
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 7fd60f9..9b884e0 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/err.h>
 #include <linux/vfio.h>
+#include <linux/vmalloc.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
 
@@ -93,8 +94,196 @@ struct tce_container {
struct iommu_table *tbl;
bool enabled;
unsigned long locked_pages;
+   struct list_head mem_list;
 };
 
+struct tce_memory {
+   struct list_head next;
+   struct rcu_head rcu;
+   __u64 vaddr;
+   __u64 size;
+   __u64 hpas[];
+};
+
+static inline bool tce_preregistered(struct tce_container *container)
+{
+   return !list_empty(&container->mem_list);
+}
+
+static struct tce_memory *tce_mem_alloc(struct tce_container *container,
+   __u64 vaddr, __u64 size)
+{
+   struct tce_memory *mem;
+   long ret;
+
+   ret = try_increment_locked_vm(size >> PAGE_SHIFT);
+   if (ret)
+   return NULL;
+
+   mem = vzalloc(sizeof(*mem) + (size >> (PAGE_SHIFT - 3)));
+   if (!mem) {
+   decrement_locked_vm(size >> PAGE_SHIFT);
+   return NULL;
+   }
+
+   mem->vaddr = vaddr;
+   mem->size = size;
+
+   list_add_rcu(&mem->next, &container->mem_list);
+
+   return mem;
+}
+
+static void release_tce_memory(struct rcu_head *head)
+{
+   struct tce_memory *mem = container_of(head, struct tce_memory, rcu);
+
+   vfree(mem);
+}
+
+static void tce_mem_free(struct tce_memory *mem)
+{
+   decrement_locked_vm(mem->size >> PAGE_SHIFT);
+   list_del_rcu(&mem->next);
+   call_rcu(&mem->rcu, release_tce_memory);
+}
+
+static struct tce_memory *tce_pinned_desc(struct tce_container *container,
+   __u64 vaddr, __u64 size)
+{
+   struct tce_memory *mem, *ret = NULL;
+
+   rcu_read_lock();
+   vaddr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
+   list_for_each_entry_rcu(mem, &container->mem_list, next) {
+   if ((mem->vaddr <= vaddr) &&
+   (vaddr + size <= mem->vaddr + mem->size)) {
+   ret = mem;
+   break;
+   }
+   }
+   rcu_read_unlock();
+
+   return ret;
+}
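
With the descriptors on an RCU-protected list, the map path can
translate a preregistered user address into an already pinned host page
without sleeping. Below is a hypothetical caller of tce_pinned_desc()
(tce_get_hpa() is illustration only; the real users follow in the part
of the patch truncated above), assuming hpas[] holds one host address
per pinned page:

/* Hypothetical: resolve a preregistered user address to a host address */
static long tce_get_hpa(struct tce_container *container,
		__u64 vaddr, __u64 *hpa)
{
	struct tce_memory *mem;

	mem = tce_pinned_desc(container, vaddr, PAGE_SIZE);
	if (!mem)
		return -EAGAIN; /* not preregistered, use the slow path */

	*hpa = mem->hpas[(vaddr - mem->vaddr) >> PAGE_SHIFT] |
			(vaddr & ~PAGE_MASK);

	return 0;
}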