[RFC PATCH v3 7/8] vfio/type1: Add selective DMA faulting support

2021-04-08 Thread Shenming Lu
Some devices only allow selective DMA faulting. Similar to the selective
dirty page tracking, the vendor driver can call vfio_pin_pages() to
indicate the non-faultable scope. We add a new struct vfio_range to
record it; then, when the IOPF handler receives a page request outside
of that scope, it can directly return an invalid response.
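
For illustration, the vendor-driver side could look roughly like the sketch
below. This is hypothetical code, not part of the patch (the foo_* names are
made up); the only interface used is vfio_pin_pages() as it exists in this
tree, and the pinned ranges are what end up recorded in the new
pinned_range_list as the non-faultable scope:

/* Hypothetical vendor-driver snippet, not part of this patch. */
static int foo_lock_ring_buffer(struct device *dev, unsigned long *iova_pfns,
				int npage)
{
	unsigned long *phys_pfns;
	int ret;

	phys_pfns = kcalloc(npage, sizeof(*phys_pfns), GFP_KERNEL);
	if (!phys_pfns)
		return -ENOMEM;

	/*
	 * Pin (and map) the ring buffer up front so that the device never
	 * faults on it; the rest of the IOVA space stays faultable.
	 */
	ret = vfio_pin_pages(dev, iova_pfns, npage,
			     IOMMU_READ | IOMMU_WRITE, phys_pfns);

	kfree(phys_pfns);
	return ret < 0 ? ret : 0;
}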

Suggested-by: Kevin Tian 
Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio.c |   4 +-
 drivers/vfio/vfio_iommu_type1.c | 357 +++-
 include/linux/vfio.h|   1 +
 3 files changed, 358 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 38779e6fd80c..44c8dfabf7de 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2013,7 +2013,8 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->unpin_pages))
-   ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
+   ret = driver->ops->unpin_pages(container->iommu_data,
+  group->iommu_group, user_pfn,
   npage);
else
ret = -ENOTTY;
@@ -2112,6 +2113,7 @@ int vfio_group_unpin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->unpin_pages))
ret = driver->ops->unpin_pages(container->iommu_data,
+  group->iommu_group,
   user_iova_pfn, npage);
else
ret = -ENOTTY;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index dcc93c3b258c..ba2b5a1cf6e9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -150,10 +150,19 @@ struct vfio_regions {
 static struct rb_root iopf_group_list = RB_ROOT;
 static DEFINE_MUTEX(iopf_group_list_lock);
 
+struct vfio_range {
+   struct rb_node  node;
+   dma_addr_t  base_iova;
+   size_t  span;
+   unsigned intref_count;
+};
+
 struct vfio_iopf_group {
struct rb_node  node;
struct iommu_group  *iommu_group;
struct vfio_iommu   *iommu;
+   struct rb_root  pinned_range_list;
+   boolselective_faulting;
 };
 
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
@@ -496,6 +505,255 @@ static void vfio_unlink_iopf_group(struct vfio_iopf_group *old)
 	mutex_unlock(&iopf_group_list_lock);
 }
 
+/*
+ * Helper functions for range list, handle one page at a time.
+ */
+static struct vfio_range *vfio_find_range(struct rb_root *range_list,
+ dma_addr_t iova)
+{
+   struct rb_node *node = range_list->rb_node;
+   struct vfio_range *range;
+
+   while (node) {
+   range = rb_entry(node, struct vfio_range, node);
+
+   if (iova + PAGE_SIZE <= range->base_iova)
+   node = node->rb_left;
+   else if (iova >= range->base_iova + range->span)
+   node = node->rb_right;
+   else
+   return range;
+   }
+
+   return NULL;
+}
+
+/* Do the possible merge adjacent to the input range. */
+static void vfio_merge_range_list(struct rb_root *range_list,
+ struct vfio_range *range)
+{
+	struct rb_node *node_prev = rb_prev(&range->node);
+	struct rb_node *node_next = rb_next(&range->node);
+
+   if (node_next) {
+   struct vfio_range *range_next = rb_entry(node_next,
+struct vfio_range,
+node);
+
+   if (range_next->base_iova == (range->base_iova + range->span) &&
+   range_next->ref_count == range->ref_count) {
+   rb_erase(node_next, range_list);
+   range->span += range_next->span;
+   kfree(range_next);
+   }
+   }
+
+   if (node_prev) {
+   struct vfio_range *range_prev = rb_entry(node_prev,
+struct vfio_range,
+node);
+
+		if (range->base_iova == (range_prev->base_iova + range_prev->span)
+		    && range->ref_count == range_prev->ref_count) {
+			rb_erase(&range->node, range_list);
+   range_prev->span += range->span;
+   kfree(range);
+   }
+   }

[RFC PATCH v3 6/8] vfio/type1: No need to statically pin and map if IOPF enabled

2021-04-08 Thread Shenming Lu
If IOPF is enabled for the VFIO container, there is no need to statically
pin and map the entire DMA range; we can do it on demand. And when
removing a vfio_dma, we unmap according to the IOPF mapped bitmap.

Note that we still mark all pages dirty even if IOPF is enabled; we may
add IOPF-based fine-grained dirty tracking support in the future.
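
For a rough sense of the memory cost of the per-vfio_dma IOPF mapped bitmap
(one bit per page, sized exactly like the dirty bitmap), an illustration:

	/*
	 * Illustration only: with 4K pages, a 1GiB vfio_dma covers 262144
	 * pages, so IOPF_MAPPED_BITMAP_BYTES(262144) ==
	 * DIRTY_BITMAP_BYTES(262144) == 32KiB of iopf_mapped_bitmap.
	 */
	size_t bitmap_bytes = IOPF_MAPPED_BITMAP_BYTES(SZ_1G >> PAGE_SHIFT);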

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 38 +++--
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 7df5711e743a..dcc93c3b258c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -175,6 +175,7 @@ struct vfio_iopf_group {
 #define IOPF_MAPPED_BITMAP_GET(dma, i) \
 	((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]	\
 	  >> ((i) % BITS_PER_LONG)) & 0x1)
+#define IOPF_MAPPED_BITMAP_BYTES(n)	DIRTY_BITMAP_BYTES(n)
 
 #define WAITED 1
 
@@ -959,7 +960,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 * already pinned and accounted. Accouting should be done if there is no
 * iommu capable domain in the container.
 */
-   do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+   do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) ||
+   iommu->iopf_enabled;
 
for (i = 0; i < npage; i++) {
struct vfio_pfn *vpfn;
@@ -1048,7 +1050,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-   do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+   do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) ||
+   iommu->iopf_enabled;
for (i = 0; i < npage; i++) {
struct vfio_dma *dma;
dma_addr_t iova;
@@ -1169,7 +1172,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
if (!dma->size)
return 0;
 
-   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) || iommu->iopf_enabled)
return 0;
 
/*
@@ -1306,11 +1309,20 @@ static void vfio_unmap_partial_iopf(struct vfio_iommu *iommu,
}
 }
 
+static void vfio_dma_clean_iopf(struct vfio_iommu *iommu, struct vfio_dma *dma)
+{
+   vfio_unmap_partial_iopf(iommu, dma, dma->iova, dma->iova + dma->size);
+
+	kvfree(dma->iopf_mapped_bitmap);
+}
+
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
 	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
+   if (iommu->iopf_enabled)
+   vfio_dma_clean_iopf(iommu, dma);
put_task_struct(dma->task);
vfio_dma_bitmap_free(dma);
if (dma->vaddr_invalid) {
@@ -1359,7 +1371,8 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
 * mark all pages dirty if any IOMMU capable device is not able
 * to report dirty pages and all pages are pinned and mapped.
 */
-   if (iommu->num_non_pinned_groups && dma->iommu_mapped)
+   if (iommu->num_non_pinned_groups &&
+   (dma->iommu_mapped || iommu->iopf_enabled))
bitmap_set(dma->bitmap, 0, nbits);
 
if (shift) {
@@ -1772,6 +1785,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
goto out_unlock;
}
 
+   if (iommu->iopf_enabled) {
+		dma->iopf_mapped_bitmap = kvzalloc(IOPF_MAPPED_BITMAP_BYTES(
+						size >> PAGE_SHIFT), GFP_KERNEL);
+   if (!dma->iopf_mapped_bitmap) {
+   ret = -ENOMEM;
+   kfree(dma);
+   goto out_unlock;
+   }
+   }
+
iommu->dma_avail--;
dma->iova = iova;
dma->vaddr = vaddr;
@@ -1811,8 +1834,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
/* Insert zero-sized and grow as we map chunks of it */
vfio_link_dma(iommu, dma);
 
-   /* Don't pin and map if container doesn't contain IOMMU capable domain*/
-   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+   /*
+* Don't pin and map if container doesn't contain IOMMU capable domain,
+* or IOPF enabled for the container.
+*/
+   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) || iommu->iopf_enabled)
dma->size = size;
else
ret = vfio_pin_map_dma(iommu, dma, size);
-- 
2.19.1



[RFC PATCH v3 8/8] vfio: Add nested IOPF support

2021-04-08 Thread Shenming Lu
To set up nested mode, drivers such as vfio_pci need to register a
handler to receive stage/level 1 faults from the IOMMU. But each device
can currently have only one iommu dev fault handler, and stage 2 IOPF
may already be enabled (VFIO_IOMMU_ENABLE_IOPF). So we choose to update
the registered handler (a consolidated one) via flags (set
FAULT_REPORT_NESTED_L1), and have the handler further deliver the
received stage 1 faults to the guest through a newly added
vfio_device_ops callback.
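
The device-driver side of the new callback could look roughly like the sketch
below. This is hypothetical code (the foo_* names and the fault queue are made
up); the only thing taken from this series is the .transfer member added to
struct vfio_device_ops:

static int foo_vfio_pci_transfer(void *device_data, struct iommu_fault *fault)
{
	struct foo_vfio_pci_device *vdev = device_data;

	/*
	 * Hand the stage 1 fault record to whatever channel delivers it to
	 * the guest (e.g. a fault region or eventfd polled by the userspace
	 * vIOMMU).
	 */
	return foo_fault_queue_add(&vdev->fault_queue, fault);
}

static const struct vfio_device_ops foo_vfio_pci_ops = {
	.name		= "foo-vfio-pci",
	/* ...existing callbacks... */
	.transfer	= foo_vfio_pci_transfer,
};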

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio.c | 81 +
 drivers/vfio/vfio_iommu_type1.c | 49 +++-
 include/linux/vfio.h| 12 +
 3 files changed, 141 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 44c8dfabf7de..4245f15914bf 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2356,6 +2356,87 @@ struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
 }
 EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
 
+/*
+ * Register/Update the VFIO IOPF handler to receive
+ * nested stage/level 1 faults.
+ */
+int vfio_iommu_dev_fault_handler_register_nested(struct device *dev)
+{
+   struct vfio_container *container;
+   struct vfio_group *group;
+   struct vfio_iommu_driver *driver;
+   int ret;
+
+   if (!dev)
+   return -EINVAL;
+
+   group = vfio_group_get_from_dev(dev);
+   if (!group)
+   return -ENODEV;
+
+   ret = vfio_group_add_container_user(group);
+   if (ret)
+   goto out;
+
+   container = group->container;
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->register_handler))
+   ret = driver->ops->register_handler(container->iommu_data, dev);
+   else
+   ret = -ENOTTY;
+
+   vfio_group_try_dissolve_container(group);
+
+out:
+   vfio_group_put(group);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler_register_nested);
+
+int vfio_iommu_dev_fault_handler_unregister_nested(struct device *dev)
+{
+   struct vfio_container *container;
+   struct vfio_group *group;
+   struct vfio_iommu_driver *driver;
+   int ret;
+
+   if (!dev)
+   return -EINVAL;
+
+   group = vfio_group_get_from_dev(dev);
+   if (!group)
+   return -ENODEV;
+
+   ret = vfio_group_add_container_user(group);
+   if (ret)
+   goto out;
+
+   container = group->container;
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->unregister_handler))
+		ret = driver->ops->unregister_handler(container->iommu_data, dev);
+   else
+   ret = -ENOTTY;
+
+   vfio_group_try_dissolve_container(group);
+
+out:
+   vfio_group_put(group);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler_unregister_nested);
+
+int vfio_transfer_iommu_fault(struct device *dev, struct iommu_fault *fault)
+{
+   struct vfio_device *device = dev_get_drvdata(dev);
+
+   if (unlikely(!device->ops->transfer))
+   return -EOPNOTSUPP;
+
+   return device->ops->transfer(device->device_data, fault);
+}
+EXPORT_SYMBOL_GPL(vfio_transfer_iommu_fault);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index ba2b5a1cf6e9..9d1adeddb303 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -3821,13 +3821,32 @@ static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)
struct vfio_batch batch;
struct vfio_range *range;
dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
-   int access_flags = 0;
+   int access_flags = 0, nested;
size_t premap_len, map_len, mapped_len = 0;
unsigned long bit_offset, vaddr, pfn, i, npages;
int ret;
enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
struct iommu_page_response resp = {0};
 
+	if (vfio_dev_domian_nested(dev, &nested))
+   return -ENODEV;
+
+   /*
+* When configured in nested mode, further deliver the
+* stage/level 1 faults to the guest.
+*/
+   if (nested) {
+   bool l2;
+
+   if (fault->type == IOMMU_FAULT_PAGE_REQ)
+   l2 = fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_L2;
+   if (fault->type == IOMMU_FAULT_DMA_UNRECOV)
+   l2 = fault->event.flags & IOMMU_FAULT_UNRECOV_L2;
+
+   if (!l2)
+   return vfio_transfer_iommu_fault(dev, fault);
+   }
+
if (fault->type != IOMMU_FAULT_PAGE_REQ)
return -EOPNOTSUPP;
 
@@ -4201,6 +4220,32 @@ static void vfio_iommu_type1_notify(void *iommu_data,
  

[RFC PATCH v3 3/8] vfio/type1: Add an MMU notifier to avoid pinning

2021-04-08 Thread Shenming Lu
To avoid pinning pages when they are mapped in the IOMMU page tables, we
add an MMU notifier which tells us the addresses that are no longer
valid, and we try to unmap them.
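
For context, the notifier is wired up roughly as in the sketch below (the ops
struct and the registration helper shown here are assumed names; the actual
registration sits in the IOPF enabling path rather than in this patch's hunks):

static const struct mmu_notifier_ops vfio_iommu_mn_ops = {
	.invalidate_range	= mn_invalidate_range,
};

/* Called once when IOPF is enabled for the container. */
static int vfio_iommu_register_mn(struct vfio_iommu *iommu, struct mm_struct *mm)
{
	iommu->mn.ops = &vfio_iommu_mn_ops;
	return mmu_notifier_register(&iommu->mn, mm);
}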

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 112 +++-
 1 file changed, 109 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index ab0ff60ee207..1cb9d1f2717b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson "
@@ -69,6 +70,7 @@ struct vfio_iommu {
struct mutexlock;
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
+   struct mmu_notifier mn;
unsigned intdma_avail;
unsigned intvaddr_invalid_count;
uint64_tpgsize_bitmap;
@@ -1204,6 +1206,72 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
return unlocked;
 }
 
+/* Unmap the IOPF mapped pages in the specified range. */
+static void vfio_unmap_partial_iopf(struct vfio_iommu *iommu,
+   struct vfio_dma *dma,
+   dma_addr_t start, dma_addr_t end)
+{
+   struct iommu_iotlb_gather *gathers;
+   struct vfio_domain *d;
+   int i, num_domains = 0;
+
+	list_for_each_entry(d, &iommu->domain_list, next)
+   num_domains++;
+
+   gathers = kzalloc(sizeof(*gathers) * num_domains, GFP_KERNEL);
+   if (gathers) {
+   for (i = 0; i < num_domains; i++)
+			iommu_iotlb_gather_init(&gathers[i]);
+   }
+
+   while (start < end) {
+   unsigned long bit_offset;
+   size_t len;
+
+   bit_offset = (start - dma->iova) >> PAGE_SHIFT;
+
+   for (len = 0; start + len < end; len += PAGE_SIZE) {
+   if (!IOPF_MAPPED_BITMAP_GET(dma,
+   bit_offset + (len >> PAGE_SHIFT)))
+   break;
+   }
+
+   if (len) {
+   i = 0;
+			list_for_each_entry(d, &iommu->domain_list, next) {
+   size_t unmapped;
+
+   if (gathers)
+					unmapped = iommu_unmap_fast(d->domain,
+								    start, len,
+								    &gathers[i++]);
+   else
+   unmapped = iommu_unmap(d->domain,
+  start, len);
+
+   if (WARN_ON(unmapped != len))
+   goto out;
+   }
+
+   bitmap_clear(dma->iopf_mapped_bitmap,
+bit_offset, len >> PAGE_SHIFT);
+
+   cond_resched();
+   }
+
+   start += (len + PAGE_SIZE);
+   }
+
+out:
+   if (gathers) {
+   i = 0;
+		list_for_each_entry(d, &iommu->domain_list, next)
+			iommu_iotlb_sync(d->domain, &gathers[i++]);
+
+   kfree(gathers);
+   }
+}
+
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
 	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
@@ -3197,17 +3265,18 @@ static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)
 
vaddr = iova - dma->iova + dma->vaddr;
 
-	if (vfio_pin_page_external(dma, vaddr, &pfn, true))
+	if (vfio_pin_page_external(dma, vaddr, &pfn, false))
goto out_invalid;
 
if (vfio_iommu_map(iommu, iova, pfn, 1, dma->prot)) {
-   if (put_pfn(pfn, dma->prot))
-   vfio_lock_acct(dma, -1, true);
+   put_pfn(pfn, dma->prot);
goto out_invalid;
}
 
bitmap_set(dma->iopf_mapped_bitmap, bit_offset, 1);
 
+   put_pfn(pfn, dma->prot);
+
 out_success:
status = IOMMU_PAGE_RESP_SUCCESS;
 
@@ -3220,6 +3289,43 @@ static int vfio_iommu_type1_dma_map_iopf(struct 
iommu_fault *fault, void *data)
return 0;
 }
 
+static void mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm,
+   unsigned long start, unsigned long end)
+{
+   struct vfio_iommu *iommu = container_of(mn, struct vfio_iommu, mn);
+   struct rb_node *n;
+   int ret;
+
+	mutex_lock(&iommu->lock);
+
+   ret = vfio_wait_all_valid(iommu);
+   if (WARN_ON(ret < 0))
+   return;
+
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {

[RFC PATCH v3 5/8] vfio/type1: VFIO_IOMMU_ENABLE_IOPF

2021-04-08 Thread Shenming Lu
Since enabling IOPF for devices may lead to a slow ramp-up of performance,
we add an ioctl VFIO_IOMMU_ENABLE_IOPF to make it configurable. The IOPF
enabling of a VFIO device includes setting IOMMU_DEV_FEAT_IOPF and
registering the VFIO IOPF handler.

Note that VFIO_IOMMU_DISABLE_IOPF is not supported since there may be
in-flight page faults when disabling.
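
From userspace, opting in would then look roughly like the fragment below
(illustrative only: the container is assumed to be set up as usual first, and
the ioctl is assumed to take no argument; see the uapi hunk for the actual
definition):

	/* After VFIO_GROUP_SET_CONTAINER and VFIO_SET_IOMMU as usual: */
	if (ioctl(container_fd, VFIO_IOMMU_ENABLE_IOPF))
		err(1, "VFIO_IOMMU_ENABLE_IOPF");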

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 223 +++-
 include/uapi/linux/vfio.h   |   6 +
 2 files changed, 226 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 01e296c6dc9e..7df5711e743a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,6 +71,7 @@ struct vfio_iommu {
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
struct mmu_notifier mn;
+   struct mm_struct*mm;
unsigned intdma_avail;
unsigned intvaddr_invalid_count;
uint64_tpgsize_bitmap;
@@ -81,6 +82,7 @@ struct vfio_iommu {
booldirty_page_tracking;
boolpinned_page_dirty_scope;
boolcontainer_open;
+   booliopf_enabled;
 };
 
 struct vfio_domain {
@@ -461,6 +463,38 @@ vfio_find_iopf_group(struct iommu_group *iommu_group)
return node ? iopf_group : NULL;
 }
 
+static void vfio_link_iopf_group(struct vfio_iopf_group *new)
+{
+   struct rb_node **link, *parent = NULL;
+   struct vfio_iopf_group *iopf_group;
+
+	mutex_lock(&iopf_group_list_lock);
+
+	link = &iopf_group_list.rb_node;
+
+   while (*link) {
+   parent = *link;
+   iopf_group = rb_entry(parent, struct vfio_iopf_group, node);
+
+   if (new->iommu_group < iopf_group->iommu_group)
+   link = &(*link)->rb_left;
+   else
+   link = &(*link)->rb_right;
+   }
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &iopf_group_list);
+
+	mutex_unlock(&iopf_group_list_lock);
+}
+
+static void vfio_unlink_iopf_group(struct vfio_iopf_group *old)
+{
+	mutex_lock(&iopf_group_list_lock);
+	rb_erase(&old->node, &iopf_group_list);
+	mutex_unlock(&iopf_group_list_lock);
+}
+
 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 {
struct mm_struct *mm;
@@ -2363,6 +2397,68 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
list_splice_tail(iova_copy, iova);
 }
 
+static int vfio_dev_domian_nested(struct device *dev, int *nested)
+{
+   struct iommu_domain *domain;
+
+   domain = iommu_get_domain_for_dev(dev);
+   if (!domain)
+   return -ENODEV;
+
+   return iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, nested);
+}
+
+static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data);
+
+static int dev_enable_iopf(struct device *dev, void *data)
+{
+   int *enabled_dev_cnt = data;
+   int nested;
+   u32 flags;
+   int ret;
+
+   ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
+   if (ret)
+   return ret;
+
+	ret = vfio_dev_domian_nested(dev, &nested);
+   if (ret)
+   goto out_disable;
+
+   if (nested)
+   flags = FAULT_REPORT_NESTED_L2;
+   else
+   flags = FAULT_REPORT_FLAT;
+
+   ret = iommu_register_device_fault_handler(dev,
+   vfio_iommu_type1_dma_map_iopf, flags, dev);
+   if (ret)
+   goto out_disable;
+
+   (*enabled_dev_cnt)++;
+   return 0;
+
+out_disable:
+   iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
+   return ret;
+}
+
+static int dev_disable_iopf(struct device *dev, void *data)
+{
+   int *enabled_dev_cnt = data;
+
+   if (enabled_dev_cnt && *enabled_dev_cnt <= 0)
+   return -1;
+
+   WARN_ON(iommu_unregister_device_fault_handler(dev));
+   WARN_ON(iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF));
+
+   if (enabled_dev_cnt)
+   (*enabled_dev_cnt)--;
+
+   return 0;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 struct iommu_group *iommu_group)
 {
@@ -2376,6 +2472,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_domain_geometry geo;
LIST_HEAD(iova_copy);
LIST_HEAD(group_resv_regions);
+   int iopf_enabled_dev_cnt = 0;
+   struct vfio_iopf_group *iopf_group = NULL;
 
 	mutex_lock(&iommu->lock);
 
@@ -2453,6 +2551,24 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
if (ret)
goto out_domain;
 
+   if (iommu->iopf_enabled) {
+   ret = iommu_

[RFC PATCH v3 4/8] vfio/type1: Pre-map more pages than requested in the IOPF handling

2021-04-08 Thread Shenming Lu
To reduce the number of page faults to handle, we can pre-map more pages
than requested at once.

Note that IOPF_PREMAP_LEN is just an arbitrary value for now, which we
could tune further.
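
The pre-map window computed in the handler can be read as the small helper
below (a restatement of the hunk further down, not extra code in this patch):
start at the faulting page and extend forward until hitting IOPF_PREMAP_LEN
pages, an already-mapped page, or the end of the vfio_dma.

static size_t iopf_premap_window(struct vfio_dma *dma, unsigned long bit_offset)
{
	unsigned long npages = dma->size >> PAGE_SHIFT;
	size_t premap_len = IOPF_PREMAP_LEN << PAGE_SHIFT;
	size_t map_len = PAGE_SIZE;
	unsigned long i;

	for (i = bit_offset + 1; i < npages; i++) {
		if (map_len >= premap_len || IOPF_MAPPED_BITMAP_GET(dma, i))
			break;
		map_len += PAGE_SIZE;
	}

	return map_len;
}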

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 131 ++--
 1 file changed, 123 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 1cb9d1f2717b..01e296c6dc9e 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -3217,6 +3217,91 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
return -EINVAL;
 }
 
+/*
+ * To reduce the number of page faults to handle, try to
+ * pre-map more pages than requested.
+ */
+#define IOPF_PREMAP_LEN512
+
+/*
+ * Return 0 on success or a negative error code, the
+ * number of pages contiguously pinned is in @pinned.
+ */
+static int pin_pages_iopf(struct vfio_dma *dma, unsigned long vaddr,
+ unsigned long npages, unsigned long *pfn_base,
+ unsigned long *pinned, struct vfio_batch *batch)
+{
+   struct mm_struct *mm;
+   unsigned long pfn;
+   int ret = 0;
+   *pinned = 0;
+
+   mm = get_task_mm(dma->task);
+   if (!mm)
+   return -ENODEV;
+
+   if (batch->size) {
+   *pfn_base = page_to_pfn(batch->pages[batch->offset]);
+   pfn = *pfn_base;
+   } else {
+   *pfn_base = 0;
+   }
+
+   while (npages) {
+   if (!batch->size) {
+   unsigned long req_pages = min_t(unsigned long, npages,
+   batch->capacity);
+
+   ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
+					     &pfn, batch->pages);
+   if (ret < 0)
+   goto out;
+
+   batch->size = ret;
+   batch->offset = 0;
+   ret = 0;
+
+   if (!*pfn_base)
+   *pfn_base = pfn;
+   }
+
+   while (true) {
+   if (pfn != *pfn_base + *pinned)
+   goto out;
+
+   (*pinned)++;
+   npages--;
+   vaddr += PAGE_SIZE;
+   batch->offset++;
+   batch->size--;
+
+   if (!batch->size)
+   break;
+
+   pfn = page_to_pfn(batch->pages[batch->offset]);
+   }
+
+   if (unlikely(disable_hugepages))
+   break;
+   }
+
+out:
+   if (batch->size == 1 && !batch->offset) {
+   put_pfn(pfn, dma->prot);
+   batch->size = 0;
+   }
+
+   mmput(mm);
+   return ret;
+}
+
+static void unpin_pages_iopf(struct vfio_dma *dma,
+unsigned long pfn, unsigned long npages)
+{
+   while (npages--)
+   put_pfn(pfn++, dma->prot);
+}
+
 /* VFIO I/O Page Fault handler */
 static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)
 {
@@ -3225,9 +3310,11 @@ static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)
struct vfio_iopf_group *iopf_group;
struct vfio_iommu *iommu;
struct vfio_dma *dma;
+   struct vfio_batch batch;
dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
int access_flags = 0;
-   unsigned long bit_offset, vaddr, pfn;
+   size_t premap_len, map_len, mapped_len = 0;
+   unsigned long bit_offset, vaddr, pfn, i, npages;
int ret;
enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
struct iommu_page_response resp = {0};
@@ -3263,19 +3350,47 @@ static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)
if (IOPF_MAPPED_BITMAP_GET(dma, bit_offset))
goto out_success;
 
+   premap_len = IOPF_PREMAP_LEN << PAGE_SHIFT;
+   npages = dma->size >> PAGE_SHIFT;
+   map_len = PAGE_SIZE;
+   for (i = bit_offset + 1; i < npages; i++) {
+   if (map_len >= premap_len || IOPF_MAPPED_BITMAP_GET(dma, i))
+   break;
+   map_len += PAGE_SIZE;
+   }
vaddr = iova - dma->iova + dma->vaddr;
+	vfio_batch_init(&batch);
 
-	if (vfio_pin_page_external(dma, vaddr, &pfn, false))
-   goto out_invalid;
+   while (map_len) {
+		ret = pin_pages_iopf(dma, vaddr + mapped_len,
+				     map_len >> PAGE_SHIFT, &pfn,
+				     &npages, &batch);
+   if (!npages)
+   break;
 
-   if (vfio

[RFC PATCH v3 2/8] vfio/type1: Add a page fault handler

2021-04-08 Thread Shenming Lu
VFIO manages the DMA mapping itself. To support IOPF (on-demand paging)
for VFIO (IOMMU capable) devices, we add a VFIO page fault handler to
serve the reported page faults from the IOMMU driver.
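
For context (not part of this patch's hunks): the handler below is the
callback that gets registered per device later in this series, and the IOMMU
driver invokes it through the fault reporting path, roughly:

	/* Registration, as done in the IOPF enabling patch of this series: */
	ret = iommu_register_device_fault_handler(dev,
			vfio_iommu_type1_dma_map_iopf, FAULT_REPORT_FLAT, dev);

	/* The IOMMU driver then reports unresolved faults to it, e.g.: */
	ret = iommu_report_device_fault(dev, &evt);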

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 114 
 1 file changed, 114 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 45cbfd4879a5..ab0ff60ee207 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -101,6 +101,7 @@ struct vfio_dma {
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
unsigned long   *bitmap;
+   unsigned long   *iopf_mapped_bitmap;
 };
 
 struct vfio_batch {
@@ -141,6 +142,16 @@ struct vfio_regions {
size_t len;
 };
 
+/* A global IOPF enabled group list */
+static struct rb_root iopf_group_list = RB_ROOT;
+static DEFINE_MUTEX(iopf_group_list_lock);
+
+struct vfio_iopf_group {
+   struct rb_node  node;
+   struct iommu_group  *iommu_group;
+   struct vfio_iommu   *iommu;
+};
+
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
 	(!list_empty(&iommu->domain_list))
 
@@ -157,6 +168,10 @@ struct vfio_regions {
 #define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
 #define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
+#define IOPF_MAPPED_BITMAP_GET(dma, i) \
+	((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]	\
+	  >> ((i) % BITS_PER_LONG)) & 0x1)
+
 #define WAITED 1
 
 static int put_pfn(unsigned long pfn, int prot);
@@ -416,6 +431,34 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
return ret;
 }
 
+/*
+ * Helper functions for iopf_group_list
+ */
+static struct vfio_iopf_group *
+vfio_find_iopf_group(struct iommu_group *iommu_group)
+{
+   struct vfio_iopf_group *iopf_group;
+   struct rb_node *node;
+
+	mutex_lock(&iopf_group_list_lock);
+
+   node = iopf_group_list.rb_node;
+
+   while (node) {
+   iopf_group = rb_entry(node, struct vfio_iopf_group, node);
+
+   if (iommu_group < iopf_group->iommu_group)
+   node = node->rb_left;
+   else if (iommu_group > iopf_group->iommu_group)
+   node = node->rb_right;
+   else
+   break;
+   }
+
+	mutex_unlock(&iopf_group_list_lock);
+   return node ? iopf_group : NULL;
+}
+
 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 {
struct mm_struct *mm;
@@ -3106,6 +3149,77 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
return -EINVAL;
 }
 
+/* VFIO I/O Page Fault handler */
+static int vfio_iommu_type1_dma_map_iopf(struct iommu_fault *fault, void *data)
+{
+   struct device *dev = (struct device *)data;
+   struct iommu_group *iommu_group;
+   struct vfio_iopf_group *iopf_group;
+   struct vfio_iommu *iommu;
+   struct vfio_dma *dma;
+   dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
+   int access_flags = 0;
+   unsigned long bit_offset, vaddr, pfn;
+   int ret;
+   enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
+   struct iommu_page_response resp = {0};
+
+   if (fault->type != IOMMU_FAULT_PAGE_REQ)
+   return -EOPNOTSUPP;
+
+   iommu_group = iommu_group_get(dev);
+   if (!iommu_group)
+   return -ENODEV;
+
+   iopf_group = vfio_find_iopf_group(iommu_group);
+   iommu_group_put(iommu_group);
+   if (!iopf_group)
+   return -ENODEV;
+
+   iommu = iopf_group->iommu;
+
+	mutex_lock(&iommu->lock);
+
+	ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
+   if (ret < 0)
+   goto out_invalid;
+
+   if (fault->prm.perm & IOMMU_FAULT_PERM_READ)
+   access_flags |= IOMMU_READ;
+   if (fault->prm.perm & IOMMU_FAULT_PERM_WRITE)
+   access_flags |= IOMMU_WRITE;
+   if ((dma->prot & access_flags) != access_flags)
+   goto out_invalid;
+
+   bit_offset = (iova - dma->iova) >> PAGE_SHIFT;
+   if (IOPF_MAPPED_BITMAP_GET(dma, bit_offset))
+   goto out_success;
+
+   vaddr = iova - dma->iova + dma->vaddr;
+
+	if (vfio_pin_page_external(dma, vaddr, &pfn, true))
+   goto out_invalid;
+
+   if (vfio_iommu_map(iommu, iova, pfn, 1, dma->prot)) {
+   if (put_pfn(pfn, dma->prot))
+   vfio_lock_acct(dma, -1, true);
+   goto out_invalid;
+   }
+
+   bitmap_set(dma->iopf_mapped_bitmap, bit_offset, 1);
+
+out_success:
+   status = IOMMU_PAGE_RESP_SUCCESS;
+
+out_invalid:
+   mutex_

[RFC PATCH v3 0/8] Add IOPF support for VFIO passthrough

2021-04-08 Thread Shenming Lu
Hi,

Requesting for your comments and suggestions. :-)

The static pinning and mapping problem in VFIO and possible solutions
have been discussed a lot [1, 2]. One of the solutions is to add I/O
Page Fault support for VFIO devices. Different from those relatively
complicated software approaches such as presenting a vIOMMU that provides
the DMA buffer information (might include para-virtualized optimizations),
IOPF mainly depends on the hardware faulting capability, such as the PCIe
PRI extension or Arm SMMU stall model. What's more, the IOPF support in
the IOMMU driver has already been implemented in SVA [3]. So we add IOPF
support for VFIO passthrough based on the IOPF part of SVA in this series.

We have measured its performance with UADK [4] (passing through an accelerator
to a VM (1U16G)) on a Hisilicon Kunpeng920 board (and compared with host SVA):

Run hisi_sec_test...
 - with varying sending times and message lengths
 - with/without IOPF enabled (speed slowdown)

when msg_len = 1MB (and PREMAP_LEN (in Patch 4) = 1):
slowdown (num of faults)
 times   VFIO IOPF       host SVA
 1       63.4% (518)     82.8% (512)
 100     22.9% (1058)    47.9% (1024)
 1000    2.6% (1071)     8.5% (1024)

when msg_len = 10MB (and PREMAP_LEN = 512):
slowdown (num of faults)
 times   VFIO IOPF
 1       32.6% (13)
 100     3.5% (26)
 1000    1.6% (26)

History:

v2 -> v3
 - Nit fixes.
 - No reason to disable reporting the unrecoverable faults. (baolu)
 - Maintain a global IOPF enabled group list.
 - Split the pre-mapping optimization to be a separate patch.
 - Add selective faulting support (use vfio_pin_pages to indicate the
   non-faultable scope and add a new struct vfio_range to record it,
   untested). (Kevin)

v1 -> v2
 - Numerous improvements following the suggestions. Thanks a lot to all
   of you.

Note that PRI is not supported at the moment since there is no hardware.

Links:
[1] Lesokhin I, et al. Page Fault Support for Network Controllers. In ASPLOS,
2016.
[2] Tian K, et al. coIOMMU: A Virtual IOMMU with Cooperative DMA Buffer Tracking
for Efficient Memory Management in Direct I/O. In USENIX ATC, 2020.
[3] 
https://patchwork.kernel.org/project/linux-arm-kernel/cover/20210401154718.307519-1-jean-phili...@linaro.org/
[4] https://github.com/Linaro/uadk

Thanks,
Shenming


Shenming Lu (8):
  iommu: Evolve the device fault reporting framework
  vfio/type1: Add a page fault handler
  vfio/type1: Add an MMU notifier to avoid pinning
  vfio/type1: Pre-map more pages than requested in the IOPF handling
  vfio/type1: VFIO_IOMMU_ENABLE_IOPF
  vfio/type1: No need to statically pin and map if IOPF enabled
  vfio/type1: Add selective DMA faulting support
  vfio: Add nested IOPF support

 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |3 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |   18 +-
 drivers/iommu/iommu.c |   56 +-
 drivers/vfio/vfio.c   |   85 +-
 drivers/vfio/vfio_iommu_type1.c   | 1000 -
 include/linux/iommu.h |   19 +-
 include/linux/vfio.h  |   13 +
 include/uapi/linux/iommu.h|4 +
 include/uapi/linux/vfio.h |6 +
 9 files changed, 1181 insertions(+), 23 deletions(-)

-- 
2.19.1



[RFC PATCH v3 1/8] iommu: Evolve the device fault reporting framework

2021-04-08 Thread Shenming Lu
This patch follows the discussion here:

https://lore.kernel.org/linux-acpi/YAaxjmJW+ZMvrhac@myrica/

Besides SVA/vSVA, users such as VFIO may also enable (2nd level) IOPF to
remove the pinning restriction. In order to better support more scenarios
of using device faults, we extend iommu_register_device_fault_handler()
with flags and introduce FAULT_REPORT_ to describe the device fault
reporting capability under a specific configuration.

Note that we don't further distinguish recoverable and unrecoverable faults
by flags in the fault reporting cap, since having PAGE_FAULT_REPORT_ +
UNRECOV_FAULT_REPORT_ does not seem like a clean way.

In addition, still taking VFIO as an example, in nested mode the 1st level
and 2nd level fault reporting may be configured separately. Since each
device can currently register only one iommu dev fault handler, we add a
handler update interface for this.
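
Usage in this series then looks roughly like the fragment below (the mask
value passed to the update helper is only illustrative; see the kerneldoc
added in this patch for the exact mask/set semantics):

	/* Report stage 2 faults of a nested domain (VFIO container side): */
	ret = iommu_register_device_fault_handler(dev, handler,
						  FAULT_REPORT_NESTED_L2, dev);

	/* Later, also have stage/level 1 faults reported to the same handler: */
	ret = iommu_update_device_fault_handler(dev, ~0, FAULT_REPORT_NESTED_L1);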

Signed-off-by: Shenming Lu 
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |  3 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   | 18 --
 drivers/iommu/iommu.c | 56 ++-
 include/linux/iommu.h | 19 ++-
 include/uapi/linux/iommu.h|  4 ++
 5 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index ee66d1f4cb81..e6d766fb8f1a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -482,7 +482,8 @@ static int arm_smmu_master_sva_enable_iopf(struct arm_smmu_master *master)
if (ret)
return ret;
 
-   ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
+   ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf,
+ FAULT_REPORT_FLAT, dev);
if (ret) {
iopf_queue_remove_device(master->smmu->evtq.iopf, dev);
return ret;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 53abad8fdd91..51843f54a87f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1448,10 +1448,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
return -EOPNOTSUPP;
}
 
-   /* Stage-2 is always pinned at the moment */
-   if (evt[1] & EVTQ_1_S2)
-   return -EFAULT;
-
if (evt[1] & EVTQ_1_RnW)
perm |= IOMMU_FAULT_PERM_READ;
else
@@ -1469,26 +1465,36 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
.grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
.perm = perm,
-   .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
};
 
if (ssid_valid) {
flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]);
}
+
+   if (evt[1] & EVTQ_1_S2) {
+   flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_L2;
+   flt->prm.addr = FIELD_GET(EVTQ_3_IPA, evt[3]);
+   } else
+   flt->prm.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]);
} else {
flt->type = IOMMU_FAULT_DMA_UNRECOV;
flt->event = (struct iommu_fault_unrecoverable) {
.reason = reason,
.flags = IOMMU_FAULT_UNRECOV_ADDR_VALID,
.perm = perm,
-   .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
};
 
if (ssid_valid) {
flt->event.flags |= IOMMU_FAULT_UNRECOV_PASID_VALID;
flt->event.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]);
}
+
+   if (evt[1] & EVTQ_1_S2) {
+   flt->event.flags |= IOMMU_FAULT_UNRECOV_L2;
+   flt->event.addr = FIELD_GET(EVTQ_3_IPA, evt[3]);
+   } else
+   flt->event.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]);
}
 
 	mutex_lock(&smmu->streams_mutex);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d0b0a15dba84..b50b526b45ac 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1056,6 +1056,40 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
 }
 EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
 
+/*
+ * iommu_update_device_fault_handler - Update the device fault handler via flags
+ * @dev: the device
+ * @mask: bits(not set) to clear
+ * @set: bits to set
+ *
+ * Update the device fault handler installed by
+ * iommu_register_device_fault_handler().
+ *
+ * Return 0 on success, or

[PATCH v1] vfio/type1: Remove the almost unused check in vfio_iommu_type1_unpin_pages

2021-04-06 Thread Shenming Lu
The check i > npage at the end of vfio_iommu_type1_unpin_pages() only
matters when npage < 0, but in that case the function would return npage,
while it should return -EINVAL instead. So let's just check the npage
parameter at the start of the function. While at it, replace the
unpin_exit label with a break.

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 45cbfd4879a5..fd4213c41743 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -960,7 +960,7 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
bool do_accounting;
int i;
 
-   if (!iommu || !user_pfn)
+   if (!iommu || !user_pfn || npage <= 0)
return -EINVAL;
 
/* Supported for v2 version only */
@@ -977,13 +977,13 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
iova = user_pfn[i] << PAGE_SHIFT;
dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
if (!dma)
-   goto unpin_exit;
+   break;
+
vfio_unpin_page_external(dma, iova, do_accounting);
}
 
-unpin_exit:
 	mutex_unlock(&iommu->lock);
-   return i > npage ? npage : (i > 0 ? i : -EINVAL);
+   return i > 0 ? i : -EINVAL;
 }
 
 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
-- 
2.19.1



Re: [PATCH v5 0/6] KVM: arm64: Add VLPI migration support on GICv4.1

2021-03-24 Thread Shenming Lu
On 2021/3/25 2:19, Marc Zyngier wrote:
> On Mon, 22 Mar 2021 14:01:52 +0800, Shenming Lu wrote:
>> In GICv4.1, migration has been supported except for (directly-injected)
>> VLPI. And GICv4.1 Spec explicitly gives a way to get the VLPI's pending
>> state (which was crucially missing in GICv4.0). So we make VLPI migration
>> capable on GICv4.1 in this series.
>>
>> In order to support VLPI migration, we need to save and restore all
>> required configuration information and pending states of VLPIs. But
>> in fact, the configuration information of VLPIs has already been saved
>> (or will be reallocated on the dst host...) in vgic(kvm) migration.
>> So we only have to migrate the pending states of VLPIs specially.
>>
>> [...]
> 
> Applied to next, thanks!

Thanks a lot again for all the comments and suggestions. :-)

Shenming

> 
> [1/6] irqchip/gic-v3-its: Add a cache invalidation right after vPE unmapping
>   commit: 301beaf19739cb6e640ed44e630e7da993f0ecc8
> [2/6] irqchip/gic-v3-its: Drop the setting of PTZ altogether
>   commit: c21bc068cdbe5613d3319ae171c3f2eb9f321352
> [3/6] KVM: arm64: GICv4.1: Add function to get VLPI state
>   commit: 80317fe4a65375fae668672a1398a0fb73eb9023
> [4/6] KVM: arm64: GICv4.1: Try to save VLPI state in save_pending_tables
>   commit: f66b7b151e00427168409f8c1857970e926b1e27
> [5/6] KVM: arm64: GICv4.1: Restore VLPI pending state to physical side
>   commit: 12df7429213abbfa9632ab7db94f629ec309a58b
> [6/6] KVM: arm64: GICv4.1: Give a chance to save VLPI state
>   commit: 8082d50f4817ff6a7e08f4b7e9b18e5f8bfa290d
> 
> Cheers,
> 
>   M.
> 


[PATCH v5 6/6] KVM: arm64: GICv4.1: Give a chance to save VLPI state

2021-03-22 Thread Shenming Lu
Before GICv4.1, we don't have direct access to the VLPI state. So
we simply let it fail early when encountering any VLPI in saving.

But now we don't have to return -EACCES directly if on GICv4.1. Let's
change the hard-coded behavior and give a chance to save the VLPI state
(and preserve the UAPI).
Signed-off-by: Shenming Lu 
---
 Documentation/virt/kvm/devices/arm-vgic-its.rst | 2 +-
 arch/arm64/kvm/vgic/vgic-its.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst b/Documentation/virt/kvm/devices/arm-vgic-its.rst
index 6c304fd2b1b4..d257eddbae29 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-its.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
 -EFAULT  Invalid guest ram access
 -EBUSY   One or more VCPUS are running
 -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-state is not available
+state is not available without GICv4.1
 ===  ==
 
 KVM_DEV_ARM_VGIC_GRP_ITS_REGS
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 40cbaca81333..ec7543a9617c 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
/*
 * If an LPI carries the HW bit, this means that this
 * interrupt is controlled by GICv4, and we do not
-* have direct access to that state. Let's simply fail
-* the save operation...
+* have direct access to that state without GICv4.1.
+* Let's simply fail the save operation...
 */
-   if (ite->irq->hw)
+   if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
return -EACCES;
 
ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
-- 
2.19.1



[PATCH v5 2/6] irqchip/gic-v3-its: Drop the setting of PTZ altogether

2021-03-22 Thread Shenming Lu
GICv4.1 gives a way to get the VLPI state, which needs the vPE to be
unmapped first, and after the state read, we may remap the vPE back
while the VPT is not empty. So we can't assume that the VPT is empty
at the first map. Besides, the optimization of PTZ is probably limited
since the HW should be fairly efficient to parse the empty VPT. Let's
drop the setting of PTZ altogether.

Signed-off-by: Shenming Lu 
---
 drivers/irqchip/irq-gic-v3-its.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 4eb907f65bd0..c8b5a88ac31c 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -794,8 +794,16 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its,
 
its_encode_alloc(cmd, alloc);
 
-   /* We can only signal PTZ when alloc==1. Why do we have two bits? */
-   its_encode_ptz(cmd, alloc);
+   /*
+* We can only signal PTZ when alloc==1. Why do we have two bits?
+* GICv4.1 gives a way to get the VLPI state, which needs the vPE
+* to be unmapped first, and in this case, we may remap the vPE
+* back while the VPT is not empty. So we can't assume that the
+* VPT is empty at the first map. Besides, the optimization of PTZ
+* is probably limited since the HW should be fairly efficient to
+* parse the empty VPT. Let's drop the setting of PTZ altogether.
+*/
+   its_encode_ptz(cmd, false);
its_encode_vconf_addr(cmd, vconf_addr);
its_encode_vmapp_default_db(cmd, desc->its_vmapp_cmd.vpe->vpe_db_lpi);
 
-- 
2.19.1



[PATCH v5 4/6] KVM: arm64: GICv4.1: Try to save VLPI state in save_pending_tables

2021-03-22 Thread Shenming Lu
After pausing all vCPUs and devices capable of interrupting, in order
to save the states of all interrupts, besides flushing the states in
kvm's vgic, we also try to flush the states of VLPIs in the virtual
pending tables into guest RAM, but we need to have GICv4.1 and safely
unmap the vPEs first.

As for the saving of VSGIs, it needs the vPEs to be mapped and might
conflict with the saving of VLPIs. But since we map the vPEs back at
the end of save_pending_tables, and both savings require the kvm->lock
to be held (thus they only happen serially), it will work fine.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v3.c | 66 +++
 1 file changed, 60 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 6f530925a231..41ecf219c333 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -356,6 +358,32 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
return 0;
 }
 
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+   }
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+   }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,13 +393,28 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
 	struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq;
gpa_t last_ptr = ~(gpa_t)0;
-   int ret;
+   bool vlpi_avail = false;
+   int ret = 0;
u8 val;
 
+   if (unlikely(!vgic_initialized(kvm)))
+   return -ENXIO;
+
+   /*
+* A preparation for getting any VLPI states.
+* The above vgic initialized check also ensures that the allocation
+* and enabling of the doorbells have already been done.
+*/
+   if (kvm_vgic_global_state.has_gicv4_1) {
+   unmap_all_vpes(dist);
+   vlpi_avail = true;
+   }
+
 	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
int byte_offset, bit_nr;
struct kvm_vcpu *vcpu;
gpa_t pendbase, ptr;
+   bool is_pending;
bool stored;
 
vcpu = irq->target_vcpu;
@@ -387,24 +430,35 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
if (ptr != last_ptr) {
 			ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
if (ret)
-   return ret;
+   goto out;
last_ptr = ptr;
}
 
stored = val & (1U << bit_nr);
-   if (stored == irq->pending_latch)
+
+   is_pending = irq->pending_latch;
+
+   if (irq->hw && vlpi_avail)
+			vgic_v4_get_vlpi_state(irq, &is_pending);
+
+   if (stored == is_pending)
continue;
 
-   if (irq->pending_latch)
+   if (is_pending)
val |= 1 << bit_nr;
else
val &= ~(1 << bit_nr);
 
 		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
-   return ret;
+   goto out;
}
-   return 0;
+
+out:
+   if (vlpi_avail)
+   map_all_vpes(dist);
+
+   return ret;
 }
 
 /**
-- 
2.19.1



[PATCH v5 3/6] KVM: arm64: GICv4.1: Add function to get VLPI state

2021-03-22 Thread Shenming Lu
With GICv4.1 and the vPE unmapped, which indicates the invalidation
of any VPT caches associated with the vPE, we can get the VLPI state
by peeking at the VPT. So we add a function for this.
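
As a worked example of the lookup below, assuming vINTID 8195:

	unsigned int intid = 8195;
	u8 *va   = page_address(vpe->vpt_page);
	u8 *ptr  = va + intid / BITS_PER_BYTE;	/* byte 1024 of the VPT */
	u8 mask  = BIT(intid % BITS_PER_BYTE);	/* BIT(3) */
	bool val = !!(*ptr & mask);		/* the VLPI's pending state */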

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 19 +++
 arch/arm64/kvm/vgic/vgic.h|  1 +
 2 files changed, 20 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 66508b03094f..ac029ba3d337 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
kvm_arm_resume_guest(kvm);
 }
 
+/*
+ * Must be called with GICv4.1 and the vPE unmapped, which
+ * indicates the invalidation of any VPT caches associated
+ * with the vPE, thus we can get the VLPI state by peeking
+ * at the VPT.
+ */
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
+{
+	struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+   int mask = BIT(irq->intid % BITS_PER_BYTE);
+   void *va;
+   u8 *ptr;
+
+   va = page_address(vpe->vpt_page);
+   ptr = va + irq->intid / BITS_PER_BYTE;
+
+   *val = !!(*ptr & mask);
+}
+
 /**
  * vgic_v4_init - Initialize the GICv4 data structures
  * @kvm:   Pointer to the VM being initialized
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 64fcd750..d8cfd360838c 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -317,5 +317,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm);
 int vgic_v4_init(struct kvm *kvm);
 void vgic_v4_teardown(struct kvm *kvm);
 void vgic_v4_configure_vsgis(struct kvm *kvm);
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
 
 #endif
-- 
2.19.1



[PATCH v5 1/6] irqchip/gic-v3-its: Add a cache invalidation right after vPE unmapping

2021-03-22 Thread Shenming Lu
From: Marc Zyngier 

Since there may be a direct read from the CPU side to the VPT after
unmapping the vPE, we add a cache coherency maintenance at the end
of its_vpe_irq_domain_deactivate() to ensure the validity of the VPT
read later.

Signed-off-by: Marc Zyngier 
Signed-off-by: Shenming Lu 
---
 drivers/irqchip/irq-gic-v3-its.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ed46e6057e33..4eb907f65bd0 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -4554,6 +4554,15 @@ static void its_vpe_irq_domain_deactivate(struct irq_domain *domain,
 
its_send_vmapp(its, vpe, false);
}
+
+   /*
+* There may be a direct read to the VPT after unmapping the
+* vPE, to guarantee the validity of this, we make the VPT
+* memory coherent with the CPU caches here.
+*/
+	if (find_4_1_its() && !atomic_read(&vpe->vmapp_count))
+   gic_flush_dcache_to_poc(page_address(vpe->vpt_page),
+   LPI_PENDBASE_SZ);
 }
 
 static const struct irq_domain_ops its_vpe_domain_ops = {
-- 
2.19.1



[PATCH v5 5/6] KVM: arm64: GICv4.1: Restore VLPI pending state to physical side

2021-03-22 Thread Shenming Lu
From: Zenghui Yu 

When setting the forwarding path of a VLPI (switch to the HW mode),
we can also transfer the pending state from irq->pending_latch to
the VPT (especially in migration, where the pending states of VLPIs are
restored into kvm's vgic first). And we currently send "INT+VSYNC" to
trigger a VLPI to become pending.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index ac029ba3d337..c1845d8f5f7e 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -404,6 +404,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
struct vgic_its *its;
struct vgic_irq *irq;
struct its_vlpi_map map;
+   unsigned long flags;
int ret;
 
if (!vgic_supports_direct_msis(kvm))
@@ -449,6 +450,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
irq->host_irq   = virq;
 	atomic_inc(&map.vpe->vlpi_count);
 
+   /* Transfer pending state */
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+   if (irq->pending_latch) {
+   ret = irq_set_irqchip_state(irq->host_irq,
+   IRQCHIP_STATE_PENDING,
+   irq->pending_latch);
+   WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+   /*
+* Clear pending_latch and communicate this state
+* change via vgic_queue_irq_unlock.
+*/
+   irq->pending_latch = false;
+   vgic_queue_irq_unlock(kvm, irq, flags);
+   } else {
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+   }
+
 out:
 	mutex_unlock(&its->its_lock);
return ret;
-- 
2.19.1



[PATCH v5 0/6] KVM: arm64: Add VLPI migration support on GICv4.1

2021-03-22 Thread Shenming Lu
Hi,

In GICv4.1, migration has been supported except for (directly-injected)
VLPI. And GICv4.1 Spec explicitly gives a way to get the VLPI's pending
state (which was crucially missing in GICv4.0). So we make VLPI migration
capable on GICv4.1 in this series.

In order to support VLPI migration, we need to save and restore all
required configuration information and pending states of VLPIs. But
in fact, the configuration information of VLPIs has already been saved
(or will be reallocated on the dst host...) in vgic(kvm) migration.
So we only have to migrate the pending states of VLPIs specially.

Below is the related workflow in migration.

On the save path:
In migration completion:
pause all vCPUs
|
call each VM state change handler:
pause other devices (just keep them from sending interrupts;
e.g. the VFIO migration protocol has already realized it [1])
|
flush ITS tables into guest RAM
|
flush RDIST pending tables (also flush VLPI pending states here)
|
...
On the resume path:
load each device's state:
restore ITS tables (include pending tables) from guest RAM
|
for other (PCI) devices (paused), if configured to have VLPIs,
establish the forwarding paths of their VLPIs (and transfer
the pending states from kvm's vgic to VPT here)

We have tested this series in VFIO migration, and found some related
issues in QEMU [2].

Links:
[1] vfio: UAPI for migration interface for device state:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a8a24f3f6e38103b77cf399c38eb54e1219d00d6
[2] vfio: Some fixes and optimizations for VFIO migration:

https://patchwork.ozlabs.org/project/qemu-devel/cover/20210310030233.1133-1-lushenm...@huawei.com/

History:

v4 -> v5
 - Lock the whole pending state read/write sequence. (in Patch 5, from Marc)

v3 -> v4
 - Nit fixes.
 - Add a CPU cache invalidation right after unmapping the vPE. (Patch 1)
 - Drop the setting of PTZ altogether. (Patch 2)
 - Bail out if spot !vgic_initialized(). (in Patch 4)
 - Communicate the state change (clear pending_latch) via
   vgic_queue_irq_unlock. (in Patch 5)

Thanks a lot for the suggestions from Marc!

v2 -> v3
 - Add the vgic initialized check to ensure that the allocation and enabling
   of the doorbells have already been done before unmapping the vPEs.
 - Check all get_vlpi_state related conditions in save_pending_tables in one 
place.
 - Nit fixes.

v1 -> v2:
 - Get the VLPI state from the KVM side.
 - Nit fixes.

Thanks,
Shenming


Marc Zyngier (1):
  irqchip/gic-v3-its: Add a cache invalidation right after vPE unmapping

Shenming Lu (4):
  irqchip/gic-v3-its: Drop the setting of PTZ altogether
  KVM: arm64: GICv4.1: Add function to get VLPI state
  KVM: arm64: GICv4.1: Try to save VLPI state in save_pending_tables
  KVM: arm64: GICv4.1: Give a chance to save VLPI state

Zenghui Yu (1):
  KVM: arm64: GICv4.1: Restore VLPI pending state to physical side

 .../virt/kvm/devices/arm-vgic-its.rst |  2 +-
 arch/arm64/kvm/vgic/vgic-its.c|  6 +-
 arch/arm64/kvm/vgic/vgic-v3.c | 66 +--
 arch/arm64/kvm/vgic/vgic-v4.c | 38 +++
 arch/arm64/kvm/vgic/vgic.h|  1 +
 drivers/irqchip/irq-gic-v3-its.c  | 21 +-
 6 files changed, 122 insertions(+), 12 deletions(-)

-- 
2.19.1



Re: [RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough

2021-03-18 Thread Shenming Lu
On 2021/3/18 20:32, Tian, Kevin wrote:
>> From: Shenming Lu 
>> Sent: Thursday, March 18, 2021 7:54 PM
>>
>> On 2021/3/18 17:07, Tian, Kevin wrote:
>>>> From: Shenming Lu 
>>>> Sent: Thursday, March 18, 2021 3:53 PM
>>>>
>>>> On 2021/2/4 14:52, Tian, Kevin wrote:
>>>>>>> In reality, many
>>>>>>> devices allow I/O faulting only in selective contexts. However, there
>>>>>>> is no standard way (e.g. PCISIG) for the device to report whether
>>>>>>> arbitrary I/O fault is allowed. Then we may have to maintain device
>>>>>>> specific knowledge in software, e.g. in an opt-in table to list devices
>>>>>>> which allows arbitrary faults. For devices which only support selective
>>>>>>> faulting, a mediator (either through vendor extensions on vfio-pci-core
>>>>>>> or a mdev wrapper) might be necessary to help lock down non-
>> faultable
>>>>>>> mappings and then enable faulting on the rest mappings.
>>>>>>
>>>>>> For devices which only support selective faulting, they could tell it to 
>>>>>> the
>>>>>> IOMMU driver and let it filter out non-faultable faults? Do I get it 
>>>>>> wrong?
>>>>>
>>>>> Not exactly to IOMMU driver. There is already a vfio_pin_pages() for
>>>>> selectively page-pinning. The matter is that 'they' imply some device
>>>>> specific logic to decide which pages must be pinned and such knowledge
>>>>> is outside of VFIO.
>>>>>
>>>>> From enabling p.o.v we could possibly do it in phased approach. First
>>>>> handles devices which tolerate arbitrary DMA faults, and then extends
>>>>> to devices with selective-faulting. The former is simpler, but with one
>>>>> main open whether we want to maintain such device IDs in a static
>>>>> table in VFIO or rely on some hints from other components (e.g. PF
>>>>> driver in VF assignment case). Let's see how Alex thinks about it.
>>>>
>>>> Hi Kevin,
>>>>
>>>> You mentioned selective-faulting some time ago. I still have some doubt
>>>> about it:
>>>> There is already a vfio_pin_pages() which is used for limiting the IOMMU
>>>> group dirty scope to pinned pages, could it also be used for indicating
>>>> the faultable scope is limited to the pinned pages and the rest mappings
>>>> is non-faultable that should be pinned and mapped immediately? But it
>>>> seems to be a little weird and not exactly to what you meant... I will
>>>> be grateful if you can help to explain further. :-)
>>>>
>>>
>>> The opposite, i.e. the vendor driver uses vfio_pin_pages to lock down
>>> pages that are not faultable (based on its specific knowledge) and then
>>> the rest memory becomes faultable.
>>
>> Ahh...
>> Thus, from the perspective of VFIO IOMMU, if IOPF enabled for such device,
>> only the page faults within the pinned range are valid in the registered
>> iommu fault handler...
>> I have another question here, for the IOMMU backed devices, they are
>> already
>> all pinned and mapped when attaching, is there a need to call
>> vfio_pin_pages()
>> to lock down pages for them? Did I miss something?...
>>
> 
> If a device is marked as supporting I/O page fault (fully or selectively), 
> there should be no pinning at attach or DMA_MAP time (suppose as 
> this series does). Then for devices with selective-faulting its vendor 
> driver will lock down the pages which are not faultable at run-time, 
> e.g. when intercepting guest registration of a ring buffer...

Get it. Thanks a lot for this! :-)
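
To make that concrete, below is a minimal sketch of what such a vendor driver
might do when it intercepts the guest registering a ring buffer. All of the
names here (my_ring, my_lock_down_ring) are hypothetical; only vfio_pin_pages()
and the IOMMU_READ/IOMMU_WRITE prot flags come from the existing API, and
batching within the per-call pin limit is left out:

#include <linux/vfio.h>
#include <linux/iommu.h>
#include <linux/slab.h>

/* Hypothetical description of a guest-registered ring buffer. */
struct my_ring {
	dma_addr_t	base_iova;	/* IOVA the guest programmed */
	int		npages;		/* ring size in pages */
};

/*
 * Lock down the ring buffer so that DMA to it never faults; every other
 * mapping of the container remains faultable (i.e. the pinned pages
 * become the non-faultable scope for selective faulting).
 */
static int my_lock_down_ring(struct device *dev, struct my_ring *ring)
{
	unsigned long *user_pfn, *phys_pfn;
	int i, ret;

	user_pfn = kcalloc(ring->npages, sizeof(*user_pfn), GFP_KERNEL);
	phys_pfn = kcalloc(ring->npages, sizeof(*phys_pfn), GFP_KERNEL);
	if (!user_pfn || !phys_pfn) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < ring->npages; i++)
		user_pfn[i] = (ring->base_iova >> PAGE_SHIFT) + i;

	ret = vfio_pin_pages(dev, user_pfn, ring->npages,
			     IOMMU_READ | IOMMU_WRITE, phys_pfn);
out:
	kfree(user_pfn);
	kfree(phys_pfn);
	return ret < 0 ? ret : 0;
}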

Shenming

> 
> Thanks
> Kevin
> 


Re: [RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough

2021-03-18 Thread Shenming Lu
On 2021/3/18 17:07, Tian, Kevin wrote:
>> From: Shenming Lu 
>> Sent: Thursday, March 18, 2021 3:53 PM
>>
>> On 2021/2/4 14:52, Tian, Kevin wrote:
>>>>> In reality, many
>>>>> devices allow I/O faulting only in selective contexts. However, there
>>>>> is no standard way (e.g. PCISIG) for the device to report whether
>>>>> arbitrary I/O fault is allowed. Then we may have to maintain device
>>>>> specific knowledge in software, e.g. in an opt-in table to list devices
>>>>> which allows arbitrary faults. For devices which only support selective
>>>>> faulting, a mediator (either through vendor extensions on vfio-pci-core
>>>>> or a mdev wrapper) might be necessary to help lock down non-faultable
>>>>> mappings and then enable faulting on the rest mappings.
>>>>
>>>> For devices which only support selective faulting, they could tell it to 
>>>> the
>>>> IOMMU driver and let it filter out non-faultable faults? Do I get it wrong?
>>>
>>> Not exactly to IOMMU driver. There is already a vfio_pin_pages() for
>>> selectively page-pinning. The matter is that 'they' imply some device
>>> specific logic to decide which pages must be pinned and such knowledge
>>> is outside of VFIO.
>>>
>>> From enabling p.o.v we could possibly do it in phased approach. First
>>> handles devices which tolerate arbitrary DMA faults, and then extends
>>> to devices with selective-faulting. The former is simpler, but with one
>>> main open whether we want to maintain such device IDs in a static
>>> table in VFIO or rely on some hints from other components (e.g. PF
>>> driver in VF assignment case). Let's see how Alex thinks about it.
>>
>> Hi Kevin,
>>
>> You mentioned selective-faulting some time ago. I still have some doubt
>> about it:
>> There is already a vfio_pin_pages() which is used for limiting the IOMMU
>> group dirty scope to pinned pages, could it also be used for indicating
>> the faultable scope is limited to the pinned pages and the rest of the
>> mappings are non-faultable and should be pinned and mapped immediately? But
>> it seems a little weird and not exactly what you meant... I will
>> be grateful if you can help to explain further. :-)
>>
> 
> The opposite, i.e. the vendor driver uses vfio_pin_pages to lock down
> pages that are not faultable (based on its specific knowledge) and then
> the rest memory becomes faultable.

Ahh...
Thus, from the perspective of VFIO IOMMU, if IOPF is enabled for such a device,
only the page faults within the pinned range are valid in the registered
iommu fault handler...
I have another question here: for the IOMMU-backed devices, which are already
all pinned and mapped when attaching, is there a need to call vfio_pin_pages()
to lock down pages for them? Did I miss something?...

Thanks,
Shenming

> 
> Thanks
> Kevin
> 


Re: [RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough

2021-03-18 Thread Shenming Lu
On 2021/2/4 14:52, Tian, Kevin wrote:
>>> In reality, many
>>> devices allow I/O faulting only in selective contexts. However, there
>>> is no standard way (e.g. PCISIG) for the device to report whether
>>> arbitrary I/O fault is allowed. Then we may have to maintain device
>>> specific knowledge in software, e.g. in an opt-in table to list devices
>>> which allows arbitrary faults. For devices which only support selective
>>> faulting, a mediator (either through vendor extensions on vfio-pci-core
>>> or a mdev wrapper) might be necessary to help lock down non-faultable
>>> mappings and then enable faulting on the rest mappings.
>>
>> For devices which only support selective faulting, they could tell it to the
>> IOMMU driver and let it filter out non-faultable faults? Do I get it wrong?
> 
> Not exactly to IOMMU driver. There is already a vfio_pin_pages() for
> selectively page-pinning. The matter is that 'they' imply some device
> specific logic to decide which pages must be pinned and such knowledge
> is outside of VFIO.
> 
> From enabling p.o.v we could possibly do it in phased approach. First 
> handles devices which tolerate arbitrary DMA faults, and then extends
> to devices with selective-faulting. The former is simpler, but with one
> main open whether we want to maintain such device IDs in a static
> table in VFIO or rely on some hints from other components (e.g. PF
> driver in VF assignment case). Let's see how Alex thinks about it.

Hi Kevin,

You mentioned selective-faulting some time ago. I still have some doubt
about it:
There is already a vfio_pin_pages() which is used for limiting the IOMMU
group dirty scope to pinned pages, could it also be used for indicating
the faultable scope is limited to the pinned pages and the rest of the
mappings are non-faultable and should be pinned and mapped immediately? But
it seems a little weird and not exactly what you meant... I will
be grateful if you can help to explain further. :-)

Thanks,
Shenming


Re: [PATCH v4 5/6] KVM: arm64: GICv4.1: Restore VLPI pending state to physical side

2021-03-15 Thread Shenming Lu
On 2021/3/15 17:20, Marc Zyngier wrote:
> On 2021-03-15 09:11, Shenming Lu wrote:
>> On 2021/3/15 16:30, Marc Zyngier wrote:
>>> On 2021-03-13 08:38, Shenming Lu wrote:
>>>> From: Zenghui Yu 
>>>>
>>>> When setting the forwarding path of a VLPI (switch to the HW mode),
>>>> we can also transfer the pending state from irq->pending_latch to
>>>> VPT (especially in migration, the pending states of VLPIs are restored
>>>> into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger
>>>> a VLPI to pending.
>>>>
>>>> Signed-off-by: Zenghui Yu 
>>>> Signed-off-by: Shenming Lu 
>>>> ---
>>>>  arch/arm64/kvm/vgic/vgic-v4.c | 18 ++
>>>>  1 file changed, 18 insertions(+)
>>>>
>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>>>> index ac029ba3d337..3b82ab80c2f3 100644
>>>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>>>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>>>> @@ -449,6 +449,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>>>> virq,
>>>>  irq->host_irq    = virq;
>>>>  atomic_inc(&map.vpe->vlpi_count);
>>>>
>>>> +    /* Transfer pending state */
>>>> +    if (irq->pending_latch) {
>>>> +    unsigned long flags;
>>>> +
>>>> +    ret = irq_set_irqchip_state(irq->host_irq,
>>>> +    IRQCHIP_STATE_PENDING,
>>>> +    irq->pending_latch);
>>>> +    WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>>>> +
>>>> +    /*
>>>> + * Clear pending_latch and communicate this state
>>>> + * change via vgic_queue_irq_unlock.
>>>> + */
>>>> +    raw_spin_lock_irqsave(&irq->irq_lock, flags);
>>>> +    irq->pending_latch = false;
>>>> +    vgic_queue_irq_unlock(kvm, irq, flags);
>>>> +    }
>>>> +
>>>>  out:
>>>>  mutex_unlock(&its->its_lock);
>>>>  return ret;
>>>
>>> The read side of the pending state isn't locked, but the write side is.
>>> I'd rather you lock the whole sequence for peace of mind.
>>
>> Did you mean to lock before emitting the mapping request, Or just before 
>> reading
>> the pending state?
> 
> Just before reading the pending state, so that we can't get a concurrent
> modification of that state while we make the interrupt pending in the VPT
> and clearing it in the emulation.

Get it. I will correct it right now.
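
For reference, one possible shape of the reworked hunk (just a sketch using
the variables already in scope in kvm_vgic_v4_set_forwarding(), not the final
patch): take the irq_lock before reading pending_latch, and let
vgic_queue_irq_unlock() both propagate the state change and drop the lock.

	/* Transfer pending state */
	raw_spin_lock_irqsave(&irq->irq_lock, flags);
	if (irq->pending_latch) {
		ret = irq_set_irqchip_state(irq->host_irq,
					    IRQCHIP_STATE_PENDING,
					    irq->pending_latch);
		WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);

		/*
		 * Clear pending_latch and communicate this state
		 * change via vgic_queue_irq_unlock.
		 */
		irq->pending_latch = false;
	}
	vgic_queue_irq_unlock(kvm, irq, flags);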

Thanks,
Shenming

> 
> Thanks,
> 
>     M.


Re: [PATCH v4 5/6] KVM: arm64: GICv4.1: Restore VLPI pending state to physical side

2021-03-15 Thread Shenming Lu
On 2021/3/15 16:30, Marc Zyngier wrote:
> On 2021-03-13 08:38, Shenming Lu wrote:
>> From: Zenghui Yu 
>>
>> When setting the forwarding path of a VLPI (switch to the HW mode),
>> we can also transfer the pending state from irq->pending_latch to
>> VPT (especially in migration, the pending states of VLPIs are restored
>> into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger
>> a VLPI to pending.
>>
>> Signed-off-by: Zenghui Yu 
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v4.c | 18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>> index ac029ba3d337..3b82ab80c2f3 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>> @@ -449,6 +449,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>> virq,
>>  irq->host_irq    = virq;
>>  atomic_inc(&map.vpe->vlpi_count);
>>
>> +    /* Transfer pending state */
>> +    if (irq->pending_latch) {
>> +    unsigned long flags;
>> +
>> +    ret = irq_set_irqchip_state(irq->host_irq,
>> +    IRQCHIP_STATE_PENDING,
>> +    irq->pending_latch);
>> +    WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>> +
>> +    /*
>> + * Clear pending_latch and communicate this state
>> + * change via vgic_queue_irq_unlock.
>> + */
>> +    raw_spin_lock_irqsave(&irq->irq_lock, flags);
>> +    irq->pending_latch = false;
>> +    vgic_queue_irq_unlock(kvm, irq, flags);
>> +    }
>> +
>>  out:
>>  mutex_unlock(&its->its_lock);
>>  return ret;
> 
> The read side of the pending state isn't locked, but the write side is.
> I'd rather you lock the whole sequence for peace of mind.

Did you mean to lock before emitting the mapping request, Or just before reading
the pending state?

Thanks,
Shenming

> 
> Thanks,
> 
>     M.


[PATCH v4 4/6] KVM: arm64: GICv4.1: Try to save VLPI state in save_pending_tables

2021-03-13 Thread Shenming Lu
After pausing all vCPUs and devices capable of interrupting, in order
to save the states of all interrupts, besides flushing the states in
kvm’s vgic, we also try to flush the states of VLPIs in the virtual
pending tables into guest RAM. But we need to have GICv4.1 and to
safely unmap the vPEs first.

The saving of VSGIs needs the vPEs to be mapped and might conflict with
the saving of VLPIs. But since we map the vPEs back at the end of
save_pending_tables and both savings require the kvm->lock to be held
(thus they only happen serially), it will work fine.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v3.c | 66 +++
 1 file changed, 60 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 52915b342351..359d4dc35264 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -356,6 +358,32 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, 
struct vgic_irq *irq)
return 0;
 }
 
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+   }
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+   }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,13 +393,28 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
	struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq;
gpa_t last_ptr = ~(gpa_t)0;
-   int ret;
+   bool vlpi_avail = false;
+   int ret = 0;
u8 val;
 
+   if (unlikely(!vgic_initialized(kvm)))
+   return -ENXIO;
+
+   /*
+* A preparation for getting any VLPI states.
+* The above vgic initialized check also ensures that the allocation
+* and enabling of the doorbells have already been done.
+*/
+   if (kvm_vgic_global_state.has_gicv4_1) {
+   unmap_all_vpes(dist);
+   vlpi_avail = true;
+   }
+
	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
int byte_offset, bit_nr;
struct kvm_vcpu *vcpu;
gpa_t pendbase, ptr;
+   bool is_pending;
bool stored;
 
vcpu = irq->target_vcpu;
@@ -387,24 +430,35 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
if (ptr != last_ptr) {
			ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
if (ret)
-   return ret;
+   goto out;
last_ptr = ptr;
}
 
stored = val & (1U << bit_nr);
-   if (stored == irq->pending_latch)
+
+   is_pending = irq->pending_latch;
+
+   if (irq->hw && vlpi_avail)
+			vgic_v4_get_vlpi_state(irq, &is_pending);
+
+   if (stored == is_pending)
continue;
 
-   if (irq->pending_latch)
+   if (is_pending)
val |= 1 << bit_nr;
else
val &= ~(1 << bit_nr);
 
		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
-   return ret;
+   goto out;
}
-   return 0;
+
+out:
+   if (vlpi_avail)
+   map_all_vpes(dist);
+
+   return ret;
 }
 
 /**
-- 
2.19.1



[PATCH v4 6/6] KVM: arm64: GICv4.1: Give a chance to save VLPI state

2021-03-13 Thread Shenming Lu
Before GICv4.1, we don't have direct access to the VLPI state. So
we simply let the save fail early when encountering any VLPI.

But now we don't have to return -EACCES directly if on GICv4.1. Let's
change the hard-coded behaviour and give the VLPI state a chance to be
saved (and preserve the UAPI).

Signed-off-by: Shenming Lu 
---
 Documentation/virt/kvm/devices/arm-vgic-its.rst | 2 +-
 arch/arm64/kvm/vgic/vgic-its.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst 
b/Documentation/virt/kvm/devices/arm-vgic-its.rst
index 6c304fd2b1b4..d257eddbae29 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-its.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
 -EFAULT  Invalid guest ram access
 -EBUSY   One or more VCPUS are running
 -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-state is not available
+state is not available without GICv4.1
 ===  ==
 
 KVM_DEV_ARM_VGIC_GRP_ITS_REGS
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 40cbaca81333..ec7543a9617c 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, 
struct its_device *device)
/*
 * If an LPI carries the HW bit, this means that this
 * interrupt is controlled by GICv4, and we do not
-* have direct access to that state. Let's simply fail
-* the save operation...
+* have direct access to that state without GICv4.1.
+* Let's simply fail the save operation...
 */
-   if (ite->irq->hw)
+   if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
return -EACCES;
 
ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
-- 
2.19.1



[PATCH v4 3/6] KVM: arm64: GICv4.1: Add function to get VLPI state

2021-03-13 Thread Shenming Lu
With GICv4.1 and the vPE unmapped, which indicates the invalidation
of any VPT caches associated with the vPE, we can get the VLPI state
by peeking at the VPT. So we add a function for this.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 19 +++
 arch/arm64/kvm/vgic/vgic.h|  1 +
 2 files changed, 20 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 66508b03094f..ac029ba3d337 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
kvm_arm_resume_guest(kvm);
 }
 
+/*
+ * Must be called with GICv4.1 and the vPE unmapped, which
+ * indicates the invalidation of any VPT caches associated
+ * with the vPE, thus we can get the VLPI state by peeking
+ * at the VPT.
+ */
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
+{
+	struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+   int mask = BIT(irq->intid % BITS_PER_BYTE);
+   void *va;
+   u8 *ptr;
+
+   va = page_address(vpe->vpt_page);
+   ptr = va + irq->intid / BITS_PER_BYTE;
+
+   *val = !!(*ptr & mask);
+}
+
 /**
  * vgic_v4_init - Initialize the GICv4 data structures
  * @kvm:   Pointer to the VM being initialized
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 64fcd750..d8cfd360838c 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -317,5 +317,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm);
 int vgic_v4_init(struct kvm *kvm);
 void vgic_v4_teardown(struct kvm *kvm);
 void vgic_v4_configure_vsgis(struct kvm *kvm);
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
 
 #endif
-- 
2.19.1



[PATCH v4 2/6] irqchip/gic-v3-its: Drop the setting of PTZ altogether

2021-03-13 Thread Shenming Lu
GICv4.1 gives a way to get the VLPI state, which needs the vPE to be
unmapped first, and after the state read, we may remap the vPE back
while the VPT is not empty. So we can't assume that the VPT is empty
at the first map. Besides, the optimization of PTZ is probably limited
since the HW should be fairly efficient at parsing an empty VPT. Let's
drop the setting of PTZ altogether.

Signed-off-by: Shenming Lu 
---
 drivers/irqchip/irq-gic-v3-its.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 4eb907f65bd0..c8b5a88ac31c 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -794,8 +794,16 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node 
*its,
 
its_encode_alloc(cmd, alloc);
 
-   /* We can only signal PTZ when alloc==1. Why do we have two bits? */
-   its_encode_ptz(cmd, alloc);
+   /*
+* We can only signal PTZ when alloc==1. Why do we have two bits?
+* GICv4.1 gives a way to get the VLPI state, which needs the vPE
+* to be unmapped first, and in this case, we may remap the vPE
+* back while the VPT is not empty. So we can't assume that the
+* VPT is empty at the first map. Besides, the optimization of PTZ
+* is probably limited since the HW should be fairly efficient to
+* parse the empty VPT. Let's drop the setting of PTZ altogether.
+*/
+   its_encode_ptz(cmd, false);
its_encode_vconf_addr(cmd, vconf_addr);
its_encode_vmapp_default_db(cmd, desc->its_vmapp_cmd.vpe->vpe_db_lpi);
 
-- 
2.19.1



[PATCH v4 5/6] KVM: arm64: GICv4.1: Restore VLPI pending state to physical side

2021-03-13 Thread Shenming Lu
From: Zenghui Yu 

When setting the forwarding path of a VLPI (switch to the HW mode),
we can also transfer the pending state from irq->pending_latch to
VPT (especially in migration, the pending states of VLPIs are restored
into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger
a VLPI to pending.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index ac029ba3d337..3b82ab80c2f3 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -449,6 +449,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
irq->host_irq   = virq;
	atomic_inc(&map.vpe->vlpi_count);
 
+   /* Transfer pending state */
+   if (irq->pending_latch) {
+   unsigned long flags;
+
+   ret = irq_set_irqchip_state(irq->host_irq,
+   IRQCHIP_STATE_PENDING,
+   irq->pending_latch);
+   WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+   /*
+* Clear pending_latch and communicate this state
+* change via vgic_queue_irq_unlock.
+*/
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+   irq->pending_latch = false;
+   vgic_queue_irq_unlock(kvm, irq, flags);
+   }
+
 out:
	mutex_unlock(&its->its_lock);
return ret;
-- 
2.19.1



[PATCH v4 0/6] KVM: arm64: Add VLPI migration support on GICv4.1

2021-03-13 Thread Shenming Lu
Hi,

In GICv4.1, migration has been supported except for (directly-injected)
VLPIs. And the GICv4.1 spec explicitly gives a way to get a VLPI's pending
state (which was crucially missing in GICv4.0). So we enable VLPI migration
on GICv4.1 in this series.

In order to support VLPI migration, we need to save and restore all
required configuration information and pending states of VLPIs. But
in fact, the configuration information of VLPIs has already been saved
(or will be reallocated on the dst host...) in vgic(kvm) migration.
So we only have to migrate the pending states of VLPIs specially.

Below is the related workflow in migration.

On the save path:
    In migration completion:
        pause all vCPUs
                |
        call each VM state change handler:
            pause other devices (just keep them from sending interrupts;
            e.g. the VFIO migration protocol has already realized it [1])
                |
            flush ITS tables into guest RAM
                |
            flush RDIST pending tables (also flush VLPI pending states here)
                |
            ...
On the resume path:
    load each device's state:
        restore ITS tables (include pending tables) from guest RAM
                |
        for other (PCI) devices (paused), if configured to have VLPIs,
        establish the forwarding paths of their VLPIs (and transfer
        the pending states from kvm's vgic to VPT here)

We have tested this series in VFIO migration, and found some related
issues in QEMU [2].

Links:
[1] vfio: UAPI for migration interface for device state:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a8a24f3f6e38103b77cf399c38eb54e1219d00d6
[2] vfio: Some fixes and optimizations for VFIO migration:

https://patchwork.ozlabs.org/project/qemu-devel/cover/20210310030233.1133-1-lushenm...@huawei.com/

History:

v3 -> v4
 - Nit fixes.
 - Add a CPU cache invalidation right after unmapping the vPE. (Patch 1)
 - Drop the setting of PTZ altogether. (Patch 2)
 - Bail out if spot !vgic_initialized(). (in Patch 4)
 - Communicate the state change (clear pending_latch) via
   vgic_queue_irq_unlock. (in Patch 5)

Thanks a lot for the suggestions from Marc!

v2 -> v3
 - Add the vgic initialized check to ensure that the allocation and enabling
   of the doorbells have already been done before unmapping the vPEs.
 - Check all get_vlpi_state related conditions in save_pending_tables in one 
place.
 - Nit fixes.

v1 -> v2:
 - Get the VLPI state from the KVM side.
 - Nit fixes.

Thanks,
Shenming


Marc Zyngier (1):
  irqchip/gic-v3-its: Add a cache invalidation right after vPE unmapping

Shenming Lu (4):
  irqchip/gic-v3-its: Drop the setting of PTZ altogether
  KVM: arm64: GICv4.1: Add function to get VLPI state
  KVM: arm64: GICv4.1: Try to save VLPI state in save_pending_tables
  KVM: arm64: GICv4.1: Give a chance to save VLPI state

Zenghui Yu (1):
  KVM: arm64: GICv4.1: Restore VLPI pending state to physical side

 .../virt/kvm/devices/arm-vgic-its.rst |  2 +-
 arch/arm64/kvm/vgic/vgic-its.c|  6 +-
 arch/arm64/kvm/vgic/vgic-v3.c | 66 +--
 arch/arm64/kvm/vgic/vgic-v4.c | 37 +++
 arch/arm64/kvm/vgic/vgic.h|  1 +
 drivers/irqchip/irq-gic-v3-its.c  | 21 +-
 6 files changed, 121 insertions(+), 12 deletions(-)

-- 
2.19.1



[PATCH v4 1/6] irqchip/gic-v3-its: Add a cache invalidation right after vPE unmapping

2021-03-13 Thread Shenming Lu
From: Marc Zyngier 

Since there may be a direct read from the CPU side to the VPT after
unmapping the vPE, we add cache coherency maintenance at the end
of its_vpe_irq_domain_deactivate() to ensure the validity of the
later VPT read.

Signed-off-by: Marc Zyngier 
Signed-off-by: Shenming Lu 
---
 drivers/irqchip/irq-gic-v3-its.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ed46e6057e33..4eb907f65bd0 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -4554,6 +4554,15 @@ static void its_vpe_irq_domain_deactivate(struct 
irq_domain *domain,
 
its_send_vmapp(its, vpe, false);
}
+
+   /*
+* There may be a direct read to the VPT after unmapping the
+* vPE, to guarantee the validity of this, we make the VPT
+* memory coherent with the CPU caches here.
+*/
+	if (find_4_1_its() && !atomic_read(&vpe->vmapp_count))
+   gic_flush_dcache_to_poc(page_address(vpe->vpt_page),
+   LPI_PENDBASE_SZ);
 }
 
 static const struct irq_domain_ops its_vpe_domain_ops = {
-- 
2.19.1



Re: [PATCH v3 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2021-03-12 Thread Shenming Lu
On 2021/3/12 20:02, Marc Zyngier wrote:
> On Fri, 12 Mar 2021 11:34:07 +,
> Shenming Lu  wrote:
>>
>> On 2021/3/12 19:10, Marc Zyngier wrote:
>>> On Fri, 12 Mar 2021 10:48:29 +,
>>> Shenming Lu  wrote:
>>>>
>>>> On 2021/3/12 17:05, Marc Zyngier wrote:
>>>>> On Thu, 11 Mar 2021 12:32:07 +,
>>>>> Shenming Lu  wrote:
>>>>>>
>>>>>> On 2021/3/11 17:14, Marc Zyngier wrote:
>>>>>>> On Wed, 27 Jan 2021 12:13:36 +,
>>>>>>> Shenming Lu  wrote:
>>>>>>>>
>>>>>>>> From: Zenghui Yu 
>>>>>>>>
>>>>>>>> When setting the forwarding path of a VLPI (switch to the HW mode),
>>>>>>>> we could also transfer the pending state from irq->pending_latch to
>>>>>>>> VPT (especially in migration, the pending states of VLPIs are restored
>>>>>>>> into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger
>>>>>>>> a VLPI to pending.
>>>>>>>>
>>>>>>>> Signed-off-by: Zenghui Yu 
>>>>>>>> Signed-off-by: Shenming Lu 
>>>>>>>> ---
>>>>>>>>  arch/arm64/kvm/vgic/vgic-v4.c | 14 ++
>>>>>>>>  1 file changed, 14 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c 
>>>>>>>> b/arch/arm64/kvm/vgic/vgic-v4.c
>>>>>>>> index ac029ba3d337..a3542af6f04a 100644
>>>>>>>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>>>>>>>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>>>>>>>> @@ -449,6 +449,20 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, 
>>>>>>>> int virq,
>>>>>>>>irq->host_irq   = virq;
>>>>>>>>	atomic_inc(&map.vpe->vlpi_count);
>>>>>>>>  
>>>>>>>> +  /* Transfer pending state */
>>>>>>>> +  if (irq->pending_latch) {
>>>>>>>> +  ret = irq_set_irqchip_state(irq->host_irq,
>>>>>>>> +  IRQCHIP_STATE_PENDING,
>>>>>>>> +  irq->pending_latch);
>>>>>>>> +  WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>>>>>>>> +
>>>>>>>> +  /*
>>>>>>>> +   * Let it be pruned from ap_list later and don't bother
>>>>>>>> +   * the List Register.
>>>>>>>> +   */
>>>>>>>> +  irq->pending_latch = false;
>>>>>>>
>>>>>>> NAK. If the interrupt is on the AP list, it must be pruned from it
>>>>>>> *immediately*. The only case where it can be !pending and still on the
>>>>>>> AP list is in interval between sync and prune. If we start messing
>>>>>>> with this, we can't reason about the state of this list anymore.
>>>>>>>
>>>>>>> Consider calling vgic_queue_irq_unlock() here.
>>>>>>
>>>>>> Thanks for giving a hint, but it seems that vgic_queue_irq_unlock() only
>>>>>> queues an IRQ after checking, did you mean vgic_prune_ap_list() instead?
>>>>>
>>>>> No, I really mean vgic_queue_irq_unlock(). It can be used to remove
>>>>> the pending state from an interrupt, and drop it from the AP
>>>>> list. This is exactly what happens when clearing the pending state of
>>>>> a level interrupt, for example.
>>>>
>>>> Hi, I have gone through vgic_queue_irq_unlock more than once, but
>>>> still can't find the place in it to drop an IRQ from the AP
>>>> list... Did I miss something ?...  Or could you help to point it
>>>> out? Thanks very much for this!
>>>
>>> NO, you are right. I think this is a missing optimisation. Please call
>>> the function anyway, as that's what is required to communicate a
>>> change of state in general.>
>>> I'll have a think about it.
>>
>> Maybe we could call vgic_prune_ap_list() if (irq->vcpu &&
>> !vgic_target_oracle(irq)) in vgic_queue_irq_unlock()...
> 
> The locking is pretty ugly in this case, and I don't want to reparse
> the whole AP list. It is basically doing the same work as the
> insertion, but with a list_del() instead of a list_add()...

make sense..

Thanks,
Shenming

> 
> We can live without it for now.
> 
>> OK, I will retest this series and send a v4 soon. :-)
> 
> Thanks,
> 
>   M.
> 


Re: [PATCH v3 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2021-03-12 Thread Shenming Lu
On 2021/3/12 19:10, Marc Zyngier wrote:
> On Fri, 12 Mar 2021 10:48:29 +,
> Shenming Lu  wrote:
>>
>> On 2021/3/12 17:05, Marc Zyngier wrote:
>>> On Thu, 11 Mar 2021 12:32:07 +,
>>> Shenming Lu  wrote:
>>>>
>>>> On 2021/3/11 17:14, Marc Zyngier wrote:
>>>>> On Wed, 27 Jan 2021 12:13:36 +,
>>>>> Shenming Lu  wrote:
>>>>>>
>>>>>> From: Zenghui Yu 
>>>>>>
>>>>>> When setting the forwarding path of a VLPI (switch to the HW mode),
>>>>>> we could also transfer the pending state from irq->pending_latch to
>>>>>> VPT (especially in migration, the pending states of VLPIs are restored
>>>>>> into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger
>>>>>> a VLPI to pending.
>>>>>>
>>>>>> Signed-off-by: Zenghui Yu 
>>>>>> Signed-off-by: Shenming Lu 
>>>>>> ---
>>>>>>  arch/arm64/kvm/vgic/vgic-v4.c | 14 ++
>>>>>>  1 file changed, 14 insertions(+)
>>>>>>
>>>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c 
>>>>>> b/arch/arm64/kvm/vgic/vgic-v4.c
>>>>>> index ac029ba3d337..a3542af6f04a 100644
>>>>>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>>>>>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>>>>>> @@ -449,6 +449,20 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>>>>>> virq,
>>>>>>  irq->host_irq   = virq;
>>>>>>  atomic_inc(&map.vpe->vlpi_count);
>>>>>>  
>>>>>> +/* Transfer pending state */
>>>>>> +if (irq->pending_latch) {
>>>>>> +ret = irq_set_irqchip_state(irq->host_irq,
>>>>>> +IRQCHIP_STATE_PENDING,
>>>>>> +irq->pending_latch);
>>>>>> +WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>>>>>> +
>>>>>> +/*
>>>>>> + * Let it be pruned from ap_list later and don't bother
>>>>>> + * the List Register.
>>>>>> + */
>>>>>> +irq->pending_latch = false;
>>>>>
>>>>> NAK. If the interrupt is on the AP list, it must be pruned from it
>>>>> *immediately*. The only case where it can be !pending and still on the
>>>>> AP list is in interval between sync and prune. If we start messing
>>>>> with this, we can't reason about the state of this list anymore.
>>>>>
>>>>> Consider calling vgic_queue_irq_unlock() here.
>>>>
>>>> Thanks for giving a hint, but it seems that vgic_queue_irq_unlock() only
>>>> queues an IRQ after checking, did you mean vgic_prune_ap_list() instead?
>>>
>>> No, I really mean vgic_queue_irq_unlock(). It can be used to remove
>>> the pending state from an interrupt, and drop it from the AP
>>> list. This is exactly what happens when clearing the pending state of
>>> a level interrupt, for example.
>>
>> Hi, I have gone through vgic_queue_irq_unlock more than once, but
>> still can't find the place in it to drop an IRQ from the AP
>> list... Did I miss something ?...  Or could you help to point it
>> out? Thanks very much for this!
> 
> NO, you are right. I think this is a missing optimisation. Please call
> the function anyway, as that's what is required to communicate a
> change of state in general.>
> I'll have a think about it.

Maybe we could call vgic_prune_ap_list() if (irq->vcpu && 
!vgic_target_oracle(irq)) in vgic_queue_irq_unlock()...

OK, I will retest this series and send a v4 soon. :-)

Thanks,
Shenming

> 
> Thanks,
> 
>   M.
> 


Re: [PATCH v3 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2021-03-12 Thread Shenming Lu
On 2021/3/12 17:05, Marc Zyngier wrote:
> On Thu, 11 Mar 2021 12:32:07 +,
> Shenming Lu  wrote:
>>
>> On 2021/3/11 17:14, Marc Zyngier wrote:
>>> On Wed, 27 Jan 2021 12:13:36 +,
>>> Shenming Lu  wrote:
>>>>
>>>> From: Zenghui Yu 
>>>>
>>>> When setting the forwarding path of a VLPI (switch to the HW mode),
>>>> we could also transfer the pending state from irq->pending_latch to
>>>> VPT (especially in migration, the pending states of VLPIs are restored
>>>> into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger
>>>> a VLPI to pending.
>>>>
>>>> Signed-off-by: Zenghui Yu 
>>>> Signed-off-by: Shenming Lu 
>>>> ---
>>>>  arch/arm64/kvm/vgic/vgic-v4.c | 14 ++
>>>>  1 file changed, 14 insertions(+)
>>>>
>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>>>> index ac029ba3d337..a3542af6f04a 100644
>>>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>>>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>>>> @@ -449,6 +449,20 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>>>> virq,
>>>>irq->host_irq   = virq;
>>>>	atomic_inc(&map.vpe->vlpi_count);
>>>>  
>>>> +  /* Transfer pending state */
>>>> +  if (irq->pending_latch) {
>>>> +  ret = irq_set_irqchip_state(irq->host_irq,
>>>> +  IRQCHIP_STATE_PENDING,
>>>> +  irq->pending_latch);
>>>> +  WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>>>> +
>>>> +  /*
>>>> +   * Let it be pruned from ap_list later and don't bother
>>>> +   * the List Register.
>>>> +   */
>>>> +  irq->pending_latch = false;
>>>
>>> NAK. If the interrupt is on the AP list, it must be pruned from it
>>> *immediately*. The only case where it can be !pending and still on the
>>> AP list is in interval between sync and prune. If we start messing
>>> with this, we can't reason about the state of this list anymore.
>>>
>>> Consider calling vgic_queue_irq_unlock() here.
>>
>> Thanks for giving a hint, but it seems that vgic_queue_irq_unlock() only
>> queues an IRQ after checking, did you mean vgic_prune_ap_list() instead?
> 
> No, I really mean vgic_queue_irq_unlock(). It can be used to remove
> the pending state from an interrupt, and drop it from the AP
> list. This is exactly what happens when clearing the pending state of
> a level interrupt, for example.

Hi, I have gone through vgic_queue_irq_unlock more than once, but still can't
find the place in it to drop an IRQ from the AP list... Did I miss something 
?...
Or could you help to point it out? Thanks very much for this!

Shenming

> 
>   M.
> 


Re: [PATCH v3 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2021-03-12 Thread Shenming Lu
On 2021/3/12 17:02, Marc Zyngier wrote:
> On Thu, 11 Mar 2021 12:31:48 +,
> Shenming Lu  wrote:
>>
>> On 2021/3/11 17:09, Marc Zyngier wrote:
> 
>>> I have asked that question in the past: is it actually safe to remap
>>> the vPEs and expect them to be runnable
>>
>> In my opinion, logically it can work, but there might be problems like the
>> one below that I didn't notice...
> 
> One thing is that you will have lost interrupts in the meantime
> (assuming your devices are still alive). How will you make up for
> that?

I think that devices should be paused for (not only) saving interrupt states,
and in fact, that's exactly what VFIO devices, for example, do...

> 
>>
>>>
>>> Also, the current code assumes that VMAPP.PTZ can be advertised if a
>>> VPT is mapped for the first time. Clearly, it is unlikely that the VPT
>>> will be only populated with 0s, so you'll end up with state corruption
>>> on the first remap.
>>
>> Oh, thanks for pointing it out.
>> And if we always signal PTZ when alloc = 1, does it mean that we
>> can't remap the vPE when the VPT is not empty, thus there is no
>> chance to get the VLPI state?  Could we just assume that the VPT is
>> not empty when first mapping the vPE?
> 
> I think we should drop the setting of PTZ altogether. It is a silly
> micro-optimisation, and if the HW can't parse the VPT efficiently when
> it is empty, then the HW is pretty bad, PTZ or not.

agree :-)

Thanks,
Shenming

> 
> Thanks,
> 
>   M.
> 


Re: [PATCH v3 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2021-03-11 Thread Shenming Lu
On 2021/3/11 17:14, Marc Zyngier wrote:
> On Wed, 27 Jan 2021 12:13:36 +,
> Shenming Lu  wrote:
>>
>> From: Zenghui Yu 
>>
>> When setting the forwarding path of a VLPI (switch to the HW mode),
>> we could also transfer the pending state from irq->pending_latch to
>> VPT (especially in migration, the pending states of VLPIs are restored
>> into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger
>> a VLPI to pending.
>>
>> Signed-off-by: Zenghui Yu 
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v4.c | 14 ++
>>  1 file changed, 14 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>> index ac029ba3d337..a3542af6f04a 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>> @@ -449,6 +449,20 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>> virq,
>>  irq->host_irq   = virq;
>>  atomic_inc(&map.vpe->vlpi_count);
>>  
>> +/* Transfer pending state */
>> +if (irq->pending_latch) {
>> +ret = irq_set_irqchip_state(irq->host_irq,
>> +IRQCHIP_STATE_PENDING,
>> +irq->pending_latch);
>> +WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>> +
>> +/*
>> + * Let it be pruned from ap_list later and don't bother
>> + * the List Register.
>> + */
>> +irq->pending_latch = false;
> 
> NAK. If the interrupt is on the AP list, it must be pruned from it
> *immediately*. The only case where it can be !pending and still on the
> AP list is in interval between sync and prune. If we start messing
> with this, we can't reason about the state of this list anymore.
> 
> Consider calling vgic_queue_irq_unlock() here.

Thanks for giving a hint, but it seems that vgic_queue_irq_unlock() only
queues an IRQ after checking, did you mean vgic_prune_ap_list() instead?

Thanks a lot for the comments! :-)
Shenming

> 
> Thanks,
> 
>   M.
> 


Re: [PATCH v3 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2021-03-11 Thread Shenming Lu
On 2021/3/11 17:09, Marc Zyngier wrote:
> On Wed, 27 Jan 2021 12:13:35 +,
> Shenming Lu  wrote:
>>
>> After pausing all vCPUs and devices capable of interrupting, in order
>> to save the information of all interrupts, besides flushing the pending
>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>> safely unmap the vPEs first.
>>
>> As for the saving of VSGIs, which needs the vPEs to be mapped and might
>> conflict with the saving of VLPIs, but since we will map the vPEs back
>> at the end of save_pending_tables and both savings require the kvm->lock
>> to be held (only happen serially), it will work fine.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v3.c | 61 +++
>>  1 file changed, 55 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>> index 52915b342351..06b1162b7a0a 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>> @@ -1,6 +1,8 @@
>>  // SPDX-License-Identifier: GPL-2.0-only
>>  
>>  #include 
>> +#include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -356,6 +358,32 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, 
>> struct vgic_irq *irq)
>>  return 0;
>>  }
>>  
>> +/*
>> + * The deactivation of the doorbell interrupt will trigger the
>> + * unmapping of the associated vPE.
>> + */
>> +static void unmap_all_vpes(struct vgic_dist *dist)
>> +{
>> +struct irq_desc *desc;
>> +int i;
>> +
>> +for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>> +}
>> +}
>> +
>> +static void map_all_vpes(struct vgic_dist *dist)
>> +{
>> +struct irq_desc *desc;
>> +int i;
>> +
>> +for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>> +}
>> +}
>> +
>>  /**
>>   * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
>>   * kvm lock and all vcpu lock must be held
>> @@ -365,14 +393,26 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>  struct vgic_dist *dist = &kvm->arch.vgic;
>>  struct vgic_irq *irq;
>>  gpa_t last_ptr = ~(gpa_t)0;
>> -int ret;
>> +bool vlpi_avail = false;
>> +int ret = 0;
>>  u8 val;
>>  
>> +/*
>> + * As a preparation for getting any VLPI states.
>> + * The vgic initialized check ensures that the allocation and
>> + * enabling of the doorbells have already been done.
>> + */
>> +if (kvm_vgic_global_state.has_gicv4_1 && 
>> !WARN_ON(!vgic_initialized(kvm))) {
> 
> Should we bail out if we ever spot !vgic_initialized()? In general, I
> find the double negation horrible to read).

Ok, I will change it.

> 
>> +unmap_all_vpes(dist);
>> +vlpi_avail = true;
>> +}
>> +
>>  list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
>>  int byte_offset, bit_nr;
>>  struct kvm_vcpu *vcpu;
>>  gpa_t pendbase, ptr;
>>  bool stored;
>> +bool is_pending = irq->pending_latch;
>>  
>>  vcpu = irq->target_vcpu;
>>  if (!vcpu)
>> @@ -387,24 +427,33 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>  if (ptr != last_ptr) {
>>  ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
>>  if (ret)
>> -return ret;
>> +goto out;
>>  last_ptr = ptr;
>>  }
>>  
>>  stored = val & (1U << bit_nr);
>> -if (stored == irq->pending_latch)
>> +
>> +if (irq->hw && vlpi_avail)
>> +vgic_v4_get_vlpi_state(irq, &is_pending);
> 
> Keep the 'is_pending = irq->pending_latch;' statement close to the VPT
> read, since they represent the same state.

Ok, make sense.

> 
>> +
>> +if (stored == is_pending)
>>  continue;
>>  
>> -

Re: [PATCH v3 1/4] KVM: arm64: GICv4.1: Add function to get VLPI state

2021-03-11 Thread Shenming Lu
On 2021/3/11 16:57, Marc Zyngier wrote:
> On Wed, 27 Jan 2021 12:13:34 +,
> Shenming Lu  wrote:
>>
>> With GICv4.1 and the vPE unmapped, which indicates the invalidation
>> of any VPT caches associated with the vPE, we can get the VLPI state
>> by peeking at the VPT. So we add a function for this.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v4.c | 19 +++
>>  arch/arm64/kvm/vgic/vgic.h|  1 +
>>  2 files changed, 20 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>> index 66508b03094f..ac029ba3d337 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>> @@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
>>  kvm_arm_resume_guest(kvm);
>>  }
>>  
>> +/*
>> + * Must be called with GICv4.1 and the vPE unmapped, which
>> + * indicates the invalidation of any VPT caches associated
>> + * with the vPE, thus we can get the VLPI state by peeking
>> + * at the VPT.
>> + */
>> +void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
>> +{
>> +struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
>> +int mask = BIT(irq->intid % BITS_PER_BYTE);
>> +void *va;
>> +u8 *ptr;
>> +
>> +va = page_address(vpe->vpt_page);
>> +ptr = va + irq->intid / BITS_PER_BYTE;
>> +
>> +*val = !!(*ptr & mask);
> 
> What guarantees that you can actually read anything valid? Yes, the
> VPT caches are clean. But that doesn't make them coherent with CPU
> caches.
> 
> You need some level of cache maintenance here.

Yeah, and you have come up with some code for this in v2:

diff --git a/drivers/irqchip/irq-gic-v3-its.c
b/drivers/irqchip/irq-gic-v3-its.c
index 7db602434ac5..2dbef127ca15 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -4552,6 +4552,10 @@ static void its_vpe_irq_domain_deactivate(struct
irq_domain *domain,

its_send_vmapp(its, vpe, false);
}
+
+   if (find_4_1_its() && !atomic_read(&vpe->vmapp_count))
+   gic_flush_dcache_to_poc(page_address(vpe->vpt_page),
+   LPI_PENDBASE_SZ);
  }

  static const struct irq_domain_ops its_vpe_domain_ops = {

Could I add it to this series? :-)

Thanks,
Shenming

> 
> Thanks,
> 
>   M.
> 


Re: [PATCH v3 0/4] KVM: arm64: Add VLPI migration support on GICv4.1

2021-03-10 Thread Shenming Lu
Hi,

Sorry to bother you again, I am really hoping for a response to this series. :-)

Thanks,
Shenming

On 2021/2/26 16:58, Shenming Lu wrote:
> Hi Marc,
> 
> Gentle ping. Does this series need any further modification? Wish you can 
> pick it up. :-)
> 
> Thanks,
> Shenming
> 
> On 2021/1/27 20:13, Shenming Lu wrote:
>> Hi Marc, sorry for the late commit.
>>
>> In GICv4.1, migration has been supported except for (directly-injected)
>> VLPI. And GICv4.1 Spec explicitly gives a way to get the VLPI's pending
>> state (which was crucially missing in GICv4.0). So we make VLPI migration
>> capable on GICv4.1 in this patch set.
>>
>> In order to support VLPI migration, we need to save and restore all
>> required configuration information and pending states of VLPIs. But
>> in fact, the configuration information of VLPIs has already been saved
>> (or will be reallocated on the dst host...) in vgic(kvm) migration.
>> So we only have to migrate the pending states of VLPIs specially.
>>
>> Below is the related workflow in migration.
>>
>> On the save path:
>>  In migration completion:
>>  pause all vCPUs
>>  |
>>  call each VM state change handler:
>>  pause other devices (just keep from sending interrupts, 
>> and
>>  such as VFIO migration protocol has already realized it 
>> [1])
>>  |
>>  flush ITS tables into guest RAM
>>  |
>>  flush RDIST pending tables (also flush VLPI state here)
>>  |
>>  ...
>> On the resume path:
>>  load each device's state:
>>  restore ITS tables (include pending tables) from guest RAM
>>  |
>>  for other (PCI) devices (paused), if configured to have VLPIs,
>>  establish the forwarding paths of their VLPIs (and transfer
>>  the pending states from kvm's vgic to VPT here)
>>
>> We have tested this series in VFIO migration, and found some related
>> issues in QEMU [2].
>>
>> Links:
>> [1] vfio: UAPI for migration interface for device state:
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a8a24f3f6e38103b77cf399c38eb54e1219d00d6
>> [2] vfio: Some fixes and optimizations for VFIO migration:
>> https://patchwork.ozlabs.org/cover/1413263/
>>
>> History:
>>
>> v2 -> v3
>>  - Add the vgic initialized check to ensure that the allocation and enabling
>>of the doorbells have already been done before unmapping the vPEs.
>>  - Check all get_vlpi_state related conditions in save_pending_tables in one 
>> place.
>>  - Nit fixes.
>>
>> v1 -> v2:
>>  - Get the VLPI state from the KVM side.
>>  - Nit fixes.
>>
>> Thanks,
>> Shenming
>>
>>
>> Shenming Lu (3):
>>   KVM: arm64: GICv4.1: Add function to get VLPI state
>>   KVM: arm64: GICv4.1: Try to save hw pending state in
>> save_pending_tables
>>   KVM: arm64: GICv4.1: Give a chance to save VLPI's pending state
>>
>> Zenghui Yu (1):
>>   KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side
>>
>>  .../virt/kvm/devices/arm-vgic-its.rst |  2 +-
>>  arch/arm64/kvm/vgic/vgic-its.c|  6 +-
>>  arch/arm64/kvm/vgic/vgic-v3.c | 61 +--
>>  arch/arm64/kvm/vgic/vgic-v4.c | 33 ++
>>  arch/arm64/kvm/vgic/vgic.h|  1 +
>>  5 files changed, 93 insertions(+), 10 deletions(-)
>>


Re: [RFC PATCH v2 1/6] iommu: Evolve to support more scenarios of using IOPF

2021-03-09 Thread Shenming Lu
Hi Baolu,

On 2021/3/10 10:09, Lu Baolu wrote:
> Hi Shenming,
> 
> On 3/9/21 2:22 PM, Shenming Lu wrote:
>> This patch follows the discussion here:
>>
>> https://lore.kernel.org/linux-acpi/YAaxjmJW+ZMvrhac@myrica/
>>
>> In order to support more scenarios of using IOPF (mainly consider
>> the nested extension), besides keeping IOMMU_DEV_FEAT_IOPF as a
>> general capability for whether delivering faults through the IOMMU,
>> we extend iommu_register_fault_handler() with flags and introduce
>> IOPF_REPORT_FLAT and IOPF_REPORT_NESTED to describe the page fault
>> reporting capability under a specific configuration.
>> IOPF_REPORT_NESTED needs additional info to indicate which level/stage
>> is concerned since the fault client may be interested in only one
>> level.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>   .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |  3 +-
>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   | 11 ++--
>>   drivers/iommu/io-pgfault.c    |  4 --
>>   drivers/iommu/iommu.c | 56 ++-
>>   include/linux/iommu.h | 21 ++-
>>   include/uapi/linux/iommu.h    |  3 +
>>   6 files changed, 85 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c 
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> index ee66d1f4cb81..5de9432349d4 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
>> @@ -482,7 +482,8 @@ static int arm_smmu_master_sva_enable_iopf(struct 
>> arm_smmu_master *master)
>>   if (ret)
>>   return ret;
>>   -    ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
>> +    ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf,
>> +  IOPF_REPORT_FLAT, dev);
>>   if (ret) {
>>   iopf_queue_remove_device(master->smmu->evtq.iopf, dev);
>>   return ret;
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 363744df8d51..f40529d0075d 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -1447,10 +1447,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device 
>> *smmu, u64 *evt)
>>   return -EOPNOTSUPP;
>>   }
>>   -    /* Stage-2 is always pinned at the moment */
>> -    if (evt[1] & EVTQ_1_S2)
>> -    return -EFAULT;
>> -
>>   if (evt[1] & EVTQ_1_RnW)
>>   perm |= IOMMU_FAULT_PERM_READ;
>>   else
>> @@ -1468,13 +1464,18 @@ static int arm_smmu_handle_evt(struct 
>> arm_smmu_device *smmu, u64 *evt)
>>   .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
>>   .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
>>   .perm = perm,
>> -    .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
>>   };
>>     if (ssid_valid) {
>>   flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
>>   flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]);
>>   }
>> +
>> +    if (evt[1] & EVTQ_1_S2) {
>> +    flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_L2;
>> +    flt->prm.addr = FIELD_GET(EVTQ_3_IPA, evt[3]);
>> +    } else
>> +    flt->prm.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]);
>>   } else {
>>   flt->type = IOMMU_FAULT_DMA_UNRECOV;
>>   flt->event = (struct iommu_fault_unrecoverable) {
>> diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
>> index 1df8c1dcae77..abf16e06bcf5 100644
>> --- a/drivers/iommu/io-pgfault.c
>> +++ b/drivers/iommu/io-pgfault.c
>> @@ -195,10 +195,6 @@ int iommu_queue_iopf(struct iommu_fault *fault, void 
>> *cookie)
>>     lockdep_assert_held(>lock);
>>   -    if (fault->type != IOMMU_FAULT_PAGE_REQ)
>> -    /* Not a recoverable page fault */
>> -    return -EOPNOTSUPP;
>> -
> 
> Any reasons why do you want to remove this check?

My thought was to make the reporting cap more detailed: IOPF_REPORT_ is only
for recoverable page faults (IOMMU_FAULT_PAGE_REQ), and we may add
UNRECOV_FAULT_REPORT_ later for unrecoverable faults (IOMMU_FAULT_DMA_UNRECOV)...

> 
>>   /*
>>    * As long as we're holding param->lock, the queue can't be unlinked
>>    * from the de

[RFC PATCH v2 6/6] vfio: Add nested IOPF support

2021-03-08 Thread Shenming Lu
To set up nested mode, drivers such as vfio_pci have to register
a handler to receive stage/level 1 faults from the IOMMU. But
currently each device can only have one iommu dev fault handler,
so if stage 2 IOPF is already enabled (VFIO_IOMMU_ENABLE_IOPF),
we choose to update the registered handler (a combined one) via
flags (set IOPF_REPORT_NESTED_L1_CONCERNED), and the handler then
further delivers the received stage 1 faults to the guest through
a newly added vfio_device_ops callback.

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio.c | 83 +
 drivers/vfio/vfio_iommu_type1.c | 37 +++
 include/linux/vfio.h|  9 
 3 files changed, 129 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 77b29bbd3027..c6a01d947d0d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2389,6 +2389,89 @@ int vfio_iommu_dev_fault_handler(struct iommu_fault 
*fault, void *data)
 }
 EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler);
 
+int vfio_iommu_dev_fault_handler_unregister_nested(struct device *dev)
+{
+   struct vfio_container *container;
+   struct vfio_group *group;
+   struct vfio_iommu_driver *driver;
+   int ret;
+
+   if (!dev)
+   return -EINVAL;
+
+   group = vfio_group_get_from_dev(dev);
+   if (!group)
+   return -ENODEV;
+
+   ret = vfio_group_add_container_user(group);
+   if (ret)
+   goto out;
+
+   container = group->container;
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->unregister_hdlr_nested))
+   ret = driver->ops->unregister_hdlr_nested(container->iommu_data,
+ dev);
+   else
+   ret = -ENOTTY;
+
+   vfio_group_try_dissolve_container(group);
+
+out:
+   vfio_group_put(group);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler_unregister_nested);
+
+/*
+ * Register/Update the VFIO page fault handler
+ * to receive nested stage/level 1 faults.
+ */
+int vfio_iommu_dev_fault_handler_register_nested(struct device *dev)
+{
+   struct vfio_container *container;
+   struct vfio_group *group;
+   struct vfio_iommu_driver *driver;
+   int ret;
+
+   if (!dev)
+   return -EINVAL;
+
+   group = vfio_group_get_from_dev(dev);
+   if (!group)
+   return -ENODEV;
+
+   ret = vfio_group_add_container_user(group);
+   if (ret)
+   goto out;
+
+   container = group->container;
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->register_hdlr_nested))
+   ret = driver->ops->register_hdlr_nested(container->iommu_data,
+   dev);
+   else
+   ret = -ENOTTY;
+
+   vfio_group_try_dissolve_container(group);
+
+out:
+   vfio_group_put(group);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler_register_nested);
+
+int vfio_transfer_dev_fault(struct device *dev, struct iommu_fault *fault)
+{
+   struct vfio_device *device = dev_get_drvdata(dev);
+
+   if (unlikely(!device->ops->transfer))
+   return -EOPNOTSUPP;
+
+   return device->ops->transfer(device->device_data, fault);
+}
+EXPORT_SYMBOL_GPL(vfio_transfer_dev_fault);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 8d14ced649a6..62ad4a47de4a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -3581,6 +3581,13 @@ static int vfio_iommu_type1_dma_map_iopf(void 
*iommu_data,
enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
struct iommu_page_response resp = {0};
 
+   /*
+* When configured in nested mode, further deliver
+* stage/level 1 faults to the guest.
+*/
+   if (iommu->nesting && !(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_L2))
+   return vfio_transfer_dev_fault(dev, fault);
+
	mutex_lock(&iommu->lock);
 
dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
@@ -3654,6 +3661,34 @@ static int vfio_iommu_type1_dma_map_iopf(void 
*iommu_data,
return 0;
 }
 
+static int vfio_iommu_type1_register_hdlr_nested(void *iommu_data,
+struct device *dev)
+{
+   struct vfio_iommu *iommu = iommu_data;
+
+   if (iommu->iopf_enabled)
+   return iommu_update_device_fault_handler(dev, ~0,
+   IOPF_REPORT_NESTED_L1_CONCERNED);
+   else
+   return iommu_register_device_fault_handler(dev,
+   vfio_iommu_dev_fault_handler,
+  

[RFC PATCH v2 4/6] vfio: VFIO_IOMMU_ENABLE_IOPF

2021-03-08 Thread Shenming Lu
Since enabling IOPF for devices may lead to a slow ramp up of
performance, we add a VFIO_IOMMU_ENABLE_IOPF ioctl to make it
configurable. And the IOPF enabling of a VFIO device includes
setting IOMMU_DEV_FEAT_IOPF and registering the VFIO page fault
handler. Note that VFIO_IOMMU_DISABLE_IOPF is not supported
since there may be inflight page faults when disabling.
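
For illustration, a rough sketch of the expected userspace usage (a
hypothetical helper; it assumes the new ioctl takes no argument, as
added to the VFIO uAPI by this patch):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Opt the container into IOPF; mappings are then no longer pinned up front. */
static int enable_container_iopf(int container_fd)
{
	if (ioctl(container_fd, VFIO_IOMMU_ENABLE_IOPF)) {
		perror("VFIO_IOMMU_ENABLE_IOPF");
		return -1;	/* fall back to fully pinned DMA mappings */
	}
	return 0;
}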

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 139 +++-
 include/uapi/linux/vfio.h   |   6 ++
 2 files changed, 142 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 167d52c1468b..3997473be4a7 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,6 +71,7 @@ struct vfio_iommu {
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
struct mmu_notifier mn;
+   struct mm_struct*mm;
unsigned intdma_avail;
unsigned intvaddr_invalid_count;
uint64_tpgsize_bitmap;
@@ -81,6 +82,7 @@ struct vfio_iommu {
booldirty_page_tracking;
boolpinned_page_dirty_scope;
boolcontainer_open;
+   booliopf_enabled;
 };
 
 struct vfio_domain {
@@ -2278,6 +2280,62 @@ static void vfio_iommu_iova_insert_copy(struct 
vfio_iommu *iommu,
list_splice_tail(iova_copy, iova);
 }
 
+static int dev_disable_iopf(struct device *dev, void *data)
+{
+   int *enabled_dev_cnt = data;
+
+   if (enabled_dev_cnt && *enabled_dev_cnt <= 0)
+   return -1;
+
+   iommu_unregister_device_fault_handler(dev);
+   iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
+
+   if (enabled_dev_cnt)
+   (*enabled_dev_cnt)--;
+
+   return 0;
+}
+
+static int dev_enable_iopf(struct device *dev, void *data)
+{
+   int *enabled_dev_cnt = data;
+   struct iommu_domain *domain;
+   int nested;
+   u32 flags;
+   int ret;
+
+   ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
+   if (ret)
+   return ret;
+
+   domain = iommu_get_domain_for_dev(dev);
+   if (!domain) {
+   ret = -ENODEV;
+   goto out_disable;
+   }
+
+	ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nested);
+   if (ret)
+   goto out_disable;
+
+   if (nested)
+   flags = IOPF_REPORT_NESTED | IOPF_REPORT_NESTED_L2_CONCERNED;
+   else
+   flags = IOPF_REPORT_FLAT;
+
+   ret = iommu_register_device_fault_handler(dev,
+   vfio_iommu_dev_fault_handler, flags, dev);
+   if (ret)
+   goto out_disable;
+
+   (*enabled_dev_cnt)++;
+   return 0;
+
+out_disable:
+   iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
+   return ret;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 struct iommu_group *iommu_group)
 {
@@ -2291,6 +2349,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_domain_geometry geo;
LIST_HEAD(iova_copy);
LIST_HEAD(group_resv_regions);
+   int iopf_enabled_dev_cnt = 0;
 
mutex_lock(>lock);
 
@@ -2368,6 +2427,13 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
if (ret)
goto out_domain;
 
+   if (iommu->iopf_enabled) {
+   ret = iommu_group_for_each_dev(iommu_group, 
_enabled_dev_cnt,
+  dev_enable_iopf);
+   if (ret)
+   goto out_detach;
+   }
+
/* Get aperture info */
iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, );
 
@@ -2449,9 +2515,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
vfio_test_domain_fgsp(domain);
 
/* replay mappings on new domains */
-   ret = vfio_iommu_replay(iommu, domain);
-   if (ret)
-   goto out_detach;
+   if (!iommu->iopf_enabled) {
+   ret = vfio_iommu_replay(iommu, domain);
+   if (ret)
+   goto out_detach;
+   }
 
if (resv_msi) {
ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
@@ -2482,6 +2550,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
iommu_domain_free(domain->domain);
vfio_iommu_iova_free(_copy);
vfio_iommu_resv_free(_resv_regions);
+   iommu_group_for_each_dev(iommu_group, _enabled_dev_cnt,
+dev_disable_iopf);
 out_free:
kfree(domain);
kfree(group);
@@ -2643,6 +2713,10 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
if (!group)
continue;
 
+ 

[RFC PATCH v2 2/6] vfio: Add an MMU notifier to avoid pinning

2021-03-08 Thread Shenming Lu
To avoid pinning pages while they are mapped in the IOMMU page tables,
we add an MMU notifier that tells us which addresses are no longer
valid, and we try to unmap them.
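The registration of this notifier is not visible in the diff below; as a
purely hypothetical sketch, it would presumably be hooked up against the mm
that backs the DMA mappings, roughly like:

/* Sketch only -- vfio_iommu_register_mn() is a hypothetical helper. */
static int vfio_iommu_register_mn(struct vfio_iommu *iommu)
{
	iommu->mn.ops = &vfio_iommu_type1_mn_ops;
	return mmu_notifier_register(&iommu->mn, current->mm);
}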

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 68 +
 1 file changed, 68 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4bb162c1d649..03ccc11057af 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson "
@@ -69,6 +70,7 @@ struct vfio_iommu {
struct mutexlock;
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
+   struct mmu_notifier mn;
unsigned intdma_avail;
unsigned intvaddr_invalid_count;
uint64_tpgsize_bitmap;
@@ -101,6 +103,7 @@ struct vfio_dma {
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
unsigned long   *bitmap;
+   unsigned long   *iopf_mapped_bitmap;
 };
 
 struct vfio_batch {
@@ -157,6 +160,10 @@ struct vfio_regions {
 #define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
 #define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
+#define IOPF_MAPPED_BITMAP_GET(dma, i) \
+ ((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]		\
+  >> ((i) % BITS_PER_LONG)) & 0x1)
+
 #define WAITED 1
 
 static int put_pfn(unsigned long pfn, int prot);
@@ -1149,6 +1156,35 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
return unlocked;
 }
 
+static void vfio_unmap_partial_iopf(struct vfio_iommu *iommu,
+   struct vfio_dma *dma,
+   dma_addr_t start, dma_addr_t end)
+{
+   unsigned long bit_offset;
+   size_t len;
+   struct vfio_domain *d;
+
+   while (start < end) {
+   bit_offset = (start - dma->iova) >> PAGE_SHIFT;
+
+   for (len = 0; start + len < end; len += PAGE_SIZE) {
+   if (!IOPF_MAPPED_BITMAP_GET(dma,
+   bit_offset + (len >> PAGE_SHIFT)))
+   break;
+   }
+
+   if (len) {
+   list_for_each_entry(d, >domain_list, next)
+   iommu_unmap(d->domain, start, len);
+
+   bitmap_clear(dma->iopf_mapped_bitmap,
+bit_offset, len >> PAGE_SHIFT);
+   }
+
+   start += (len + PAGE_SIZE);
+   }
+}
+
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
WARN_ON(!RB_EMPTY_ROOT(>pfn_list));
@@ -3096,6 +3132,38 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
return -EINVAL;
 }
 
+static void mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm,
+   unsigned long start, unsigned long end)
+{
+   struct vfio_iommu *iommu = container_of(mn, struct vfio_iommu, mn);
+   struct rb_node *n;
+
+   mutex_lock(>lock);
+
+   for (n = rb_first(>dma_list); n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   unsigned long start_n, end_n;
+
+   if (end <= dma->vaddr || start >= dma->vaddr + dma->size)
+   continue;
+
+   start_n = ALIGN_DOWN(max_t(unsigned long, start, dma->vaddr),
+PAGE_SIZE);
+   end_n = ALIGN(min_t(unsigned long, end, dma->vaddr + dma->size),
+ PAGE_SIZE);
+
+   vfio_unmap_partial_iopf(iommu, dma,
+   start_n - dma->vaddr + dma->iova,
+   end_n - dma->vaddr + dma->iova);
+   }
+
+   mutex_unlock(>lock);
+}
+
+static const struct mmu_notifier_ops vfio_iommu_type1_mn_ops = {
+   .invalidate_range   = mn_invalidate_range,
+};
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
-- 
2.19.1



[RFC PATCH v2 5/6] vfio: No need to statically pin and map if IOPF enabled

2021-03-08 Thread Shenming Lu
If IOPF is enabled for the VFIO container, there is no need to
statically pin and map the entire DMA range; we can do it on
demand. And when removing a vfio_dma, we unmap according to the
IOPF mapped bitmap.

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 35 -
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3997473be4a7..8d14ced649a6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -165,6 +165,7 @@ struct vfio_regions {
 #define IOPF_MAPPED_BITMAP_GET(dma, i) \
  ((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]		\
   >> ((i) % BITS_PER_LONG)) & 0x1)
+#define IOPF_MAPPED_BITMAP_BYTES(n)DIRTY_BITMAP_BYTES(n)
 
 #define WAITED 1
 
@@ -877,7 +878,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 * already pinned and accounted. Accouting should be done if there is no
 * iommu capable domain in the container.
 */
-   do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+   do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) ||
+   iommu->iopf_enabled;
 
for (i = 0; i < npage; i++) {
struct vfio_pfn *vpfn;
@@ -966,7 +968,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
 
mutex_lock(>lock);
 
-   do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
+   do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) ||
+   iommu->iopf_enabled;
for (i = 0; i < npage; i++) {
struct vfio_dma *dma;
dma_addr_t iova;
@@ -1087,7 +1090,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
if (!dma->size)
return 0;
 
-   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) || iommu->iopf_enabled)
return 0;
 
/*
@@ -1187,11 +1190,20 @@ static void vfio_unmap_partial_iopf(struct vfio_iommu *iommu,
}
 }
 
+static void vfio_dma_clean_iopf(struct vfio_iommu *iommu, struct vfio_dma *dma)
+{
+   vfio_unmap_partial_iopf(iommu, dma, dma->iova, dma->iova + dma->size);
+
+   kfree(dma->iopf_mapped_bitmap);
+}
+
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
WARN_ON(!RB_EMPTY_ROOT(>pfn_list));
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
+   if (iommu->iopf_enabled)
+   vfio_dma_clean_iopf(iommu, dma);
put_task_struct(dma->task);
vfio_dma_bitmap_free(dma);
if (dma->vaddr_invalid) {
@@ -1655,6 +1667,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
goto out_unlock;
}
 
+   if (iommu->iopf_enabled) {
+   dma->iopf_mapped_bitmap = kvzalloc(IOPF_MAPPED_BITMAP_BYTES(
+   size >> PAGE_SHIFT), GFP_KERNEL);
+   if (!dma->iopf_mapped_bitmap) {
+   ret = -ENOMEM;
+   kfree(dma);
+   goto out_unlock;
+   }
+   }
+
iommu->dma_avail--;
dma->iova = iova;
dma->vaddr = vaddr;
@@ -1694,8 +1716,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
/* Insert zero-sized and grow as we map chunks of it */
vfio_link_dma(iommu, dma);
 
-   /* Don't pin and map if container doesn't contain IOMMU capable domain*/
-   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+   /*
+* Don't pin and map if container doesn't contain IOMMU capable domain,
+* or IOPF enabled for the container.
+*/
+   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) || iommu->iopf_enabled)
dma->size = size;
else
ret = vfio_pin_map_dma(iommu, dma, size);
-- 
2.19.1



[RFC PATCH v2 1/6] iommu: Evolve to support more scenarios of using IOPF

2021-03-08 Thread Shenming Lu
This patch follows the discussion here:

https://lore.kernel.org/linux-acpi/YAaxjmJW+ZMvrhac@myrica/

In order to support more scenarios of using IOPF (mainly considering
the nested extension), besides keeping IOMMU_DEV_FEAT_IOPF as a
general capability indicating whether faults are delivered through
the IOMMU, we extend iommu_register_device_fault_handler() with flags
and introduce IOPF_REPORT_FLAT and IOPF_REPORT_NESTED to describe the
page fault reporting capability under a specific configuration.
IOPF_REPORT_NESTED needs additional info to indicate which level/stage
is concerned since the fault client may be interested in only one
level.

Signed-off-by: Shenming Lu 
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |  3 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   | 11 ++--
 drivers/iommu/io-pgfault.c|  4 --
 drivers/iommu/iommu.c | 56 ++-
 include/linux/iommu.h | 21 ++-
 include/uapi/linux/iommu.h|  3 +
 6 files changed, 85 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index ee66d1f4cb81..5de9432349d4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -482,7 +482,8 @@ static int arm_smmu_master_sva_enable_iopf(struct arm_smmu_master *master)
if (ret)
return ret;
 
-   ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
+   ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf,
+ IOPF_REPORT_FLAT, dev);
if (ret) {
iopf_queue_remove_device(master->smmu->evtq.iopf, dev);
return ret;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 363744df8d51..f40529d0075d 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1447,10 +1447,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
return -EOPNOTSUPP;
}
 
-   /* Stage-2 is always pinned at the moment */
-   if (evt[1] & EVTQ_1_S2)
-   return -EFAULT;
-
if (evt[1] & EVTQ_1_RnW)
perm |= IOMMU_FAULT_PERM_READ;
else
@@ -1468,13 +1464,18 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
.grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
.perm = perm,
-   .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
};
 
if (ssid_valid) {
flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]);
}
+
+   if (evt[1] & EVTQ_1_S2) {
+   flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_L2;
+   flt->prm.addr = FIELD_GET(EVTQ_3_IPA, evt[3]);
+   } else
+   flt->prm.addr = FIELD_GET(EVTQ_2_ADDR, evt[2]);
} else {
flt->type = IOMMU_FAULT_DMA_UNRECOV;
flt->event = (struct iommu_fault_unrecoverable) {
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 1df8c1dcae77..abf16e06bcf5 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -195,10 +195,6 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
 
lockdep_assert_held(>lock);
 
-   if (fault->type != IOMMU_FAULT_PAGE_REQ)
-   /* Not a recoverable page fault */
-   return -EOPNOTSUPP;
-
/*
 * As long as we're holding param->lock, the queue can't be unlinked
 * from the device and therefore cannot disappear.
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d0b0a15dba84..cb1d93b00f7d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1056,6 +1056,40 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
 }
 EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
 
+/*
+ * iommu_update_device_fault_handler - Update the device fault handler via flags
+ * @dev: the device
+ * @mask: bits to preserve (bits not set in @mask are cleared)
+ * @set: bits to set
+ *
+ * Update the device fault handler installed by
+ * iommu_register_device_fault_handler().
+ *
+ * Return 0 on success, or an error.
+ */
+int iommu_update_device_fault_handler(struct device *dev, u32 mask, u32 set)
+{
+   struct dev_iommu *param = dev->iommu;
+   int ret = 0;
+
+   if (!param)
+   return -EINVAL;
+
+   mutex_lock(>lock);
+
+   if (param->fault_param) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   para

[RFC PATCH v2 3/6] vfio: Add a page fault handler

2021-03-08 Thread Shenming Lu
VFIO manages the DMA mapping itself. To support IOPF for VFIO
devices, we add a VFIO page fault handler to serve the page faults
reported by the IOMMU driver. Besides, we can pre-map more pages
than requested at once so that fewer page faults need to be handled.
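As a rough sketch of the pre-mapping idea (illustrative names only; the
actual fault path is in the diff below, which is partly truncated in this
archive), the number of pages to pre-map would be clamped to the end of
the containing vfio_dma region:

/*
 * Sketch only: starting from the faulting IOVA, map up to IOPF_PREMAP_LEN
 * pages, but never past the end of the vfio_dma that contains the fault.
 */
static size_t iopf_premap_length(dma_addr_t fault_iova,
				 dma_addr_t dma_iova, size_t dma_size)
{
	size_t premap_len = (size_t)IOPF_PREMAP_LEN << PAGE_SHIFT;
	size_t left = dma_iova + dma_size - fault_iova;

	return premap_len < left ? premap_len : left;
}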

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio.c |  35 +++
 drivers/vfio/vfio_iommu_type1.c | 167 
 include/linux/vfio.h|   5 +
 3 files changed, 207 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 38779e6fd80c..77b29bbd3027 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2354,6 +2354,41 @@ struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
 }
 EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
 
+int vfio_iommu_dev_fault_handler(struct iommu_fault *fault, void *data)
+{
+   struct device *dev = (struct device *)data;
+   struct vfio_container *container;
+   struct vfio_group *group;
+   struct vfio_iommu_driver *driver;
+   int ret;
+
+   if (!dev)
+   return -EINVAL;
+
+   group = vfio_group_get_from_dev(dev);
+   if (!group)
+   return -ENODEV;
+
+   ret = vfio_group_add_container_user(group);
+   if (ret)
+   goto out;
+
+   container = group->container;
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->dma_map_iopf))
+   ret = driver->ops->dma_map_iopf(container->iommu_data,
+   fault, dev);
+   else
+   ret = -ENOTTY;
+
+   vfio_group_try_dissolve_container(group);
+
+out:
+   vfio_group_put(group);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 03ccc11057af..167d52c1468b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -3330,6 +3330,172 @@ static void vfio_iommu_type1_notify(void *iommu_data,
wake_up_all(>vaddr_wait);
 }
 
+/*
+ * To optimize for fewer page fault handlings, try to
+ * pre-map more pages than requested.
+ */
+#define IOPF_PREMAP_LEN512
+
+static void unpin_pages_iopf(struct vfio_dma *dma,
+unsigned long pfn, unsigned long npages)
+{
+   while (npages--)
+   put_pfn(pfn++, dma->prot);
+}
+
+/*
+ * Return 0 on success or a negative error code, the
+ * number of pages contiguously pinned is in @*pinned.
+ */
+static int pin_pages_iopf(struct vfio_dma *dma, unsigned long vaddr,
+ unsigned long npages, unsigned long *pfn_base,
+ unsigned long *pinned, struct vfio_batch *batch)
+{
+   struct mm_struct *mm;
+   unsigned long pfn;
+   int ret = 0;
+   *pinned = 0;
+
+   mm = get_task_mm(dma->task);
+   if (!mm)
+   return -ENODEV;
+
+   if (batch->size) {
+   *pfn_base = page_to_pfn(batch->pages[batch->offset]);
+   pfn = *pfn_base;
+   } else
+   *pfn_base = 0;
+
+   while (npages) {
+   if (!batch->size) {
+   unsigned long req_pages = min_t(unsigned long, npages,
+   batch->capacity);
+
+   ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
+, batch->pages);
+   if (ret < 0)
+   goto out;
+
+   batch->size = ret;
+   batch->offset = 0;
+   ret = 0;
+
+   if (!*pfn_base)
+   *pfn_base = pfn;
+   }
+
+   while (true) {
+   if (pfn != *pfn_base + *pinned)
+   goto out;
+
+   (*pinned)++;
+   npages--;
+   vaddr += PAGE_SIZE;
+   batch->offset++;
+   batch->size--;
+
+   if (!batch->size)
+   break;
+
+   pfn = page_to_pfn(batch->pages[batch->offset]);
+   }
+
+   if (unlikely(disable_hugepages))
+   break;
+   }
+
+out:
+   mmput(mm);
+   return ret;
+}
+
+static int vfio_iommu_type1_dma_map_iopf(void *iommu_data,
+struct iommu_fault *fault,
+struct device *dev)
+{
+   struct vfio_iommu *iommu = iommu_data;
+   dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
+   struct vfio_dma *dma;
+   int access_flags = 0;
+   size_t premap_len, map_len, mapped_len = 0;
+ 

[RFC PATCH v2 0/6] Add IOPF support for VFIO passthrough

2021-03-08 Thread Shenming Lu
Hi,

The static pinning and mapping problem in VFIO and possible solutions
have been discussed a lot [1, 2]. One of the solutions is to add I/O
page fault support for VFIO devices. Different from those relatively
complicated software approaches such as presenting a vIOMMU that provides
the DMA buffer information (might include para-virtualized optimizations),
IOPF mainly depends on the hardware faulting capability, such as the PCIe
PRI extension or Arm SMMU stall model. What's more, the IOPF support in
the IOMMU driver is being implemented in SVA [3]. So we add IOPF support
for VFIO passthrough based on the IOPF part of SVA in this series.

We have measured its performance with UADK [4] (passthrough an accelerator
to a VM) on Hisilicon Kunpeng920 board:

Run hisi_sec_test...
 - with varying message lengths and sending times
 - with/without stage 2 IOPF enabled

when msg_len = 1MB and PREMAP_LEN (in patch 3) = 1:
                    speed (KB/s)
 times   w/o IOPF   with IOPF (num of faults)   degradation
 1       325596     119152  (518)               36.6%
 100     7524985    5804659 (1058)              77.1%
 1000    8661817    8440209 (1071)              97.4%
 5000    8804512    8724368 (1216)              99.1%

Note that the degradation column is the with-IOPF speed as a percentage of
the without-IOPF speed. If we use the same region to send messages, page
faults occur almost only on first access, so the more times we send, the
smaller the degradation.

when msg_len = 10MB and PREMAP_LEN = 512:
                    speed (KB/s)
 times   w/o IOPF   with IOPF (num of faults)   degradation
 1       1012758    682257  (13)                67.4%
 100     8680688    8374154 (26)                96.5%
 1000    8860861    8719918 (26)                98.4%

We see that pre-mapping can help.

And we also measured the performance of host SVA with the same params:

when msg_len = 1MB:
                    speed (KB/s)
 times   w/o IOPF   with IOPF (num of faults)   degradation
 1       951672     163866  (512)               17.2%
 100     8691961    4529971 (1024)              52.1%
 1000    9158721    8376346 (1024)              91.5%
 5000    9184532    9008739 (1024)              98.1%

Besides, the avg time spent in vfio_iommu_dev_fault_handler() (in patch 3)
is a little less than that in iopf_handle_group() (in SVA) (1.6 us vs 2.0 us).

History:

v1 -> v2
 - Numerous improvements following the suggestions. Thanks a lot to all
   of you.

Yet TODO:
 - Add support for PRI.
 - Consider selective-faulting. (suggested by Kevin)
 ...

Links:
[1] Lesokhin I, et al. Page Fault Support for Network Controllers. In ASPLOS,
2016.
[2] Tian K, et al. coIOMMU: A Virtual IOMMU with Cooperative DMA Buffer Tracking
for Efficient Memory Management in Direct I/O. In USENIX ATC, 2020.
[3] 
https://patchwork.kernel.org/project/linux-arm-kernel/cover/20210302092644.2553014-1-jean-phili...@linaro.org/
[4] https://github.com/Linaro/uadk

Any comments and suggestions are very welcome. :-)

Thanks,
Shenming


Shenming Lu (6):
  iommu: Evolve to support more scenarios of using IOPF
  vfio: Add an MMU notifier to avoid pinning
  vfio: Add a page fault handler
  vfio: VFIO_IOMMU_ENABLE_IOPF
  vfio: No need to statically pin and map if IOPF enabled
  vfio: Add nested IOPF support

 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |   3 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  11 +-
 drivers/iommu/io-pgfault.c|   4 -
 drivers/iommu/iommu.c |  56 ++-
 drivers/vfio/vfio.c   | 118 +
 drivers/vfio/vfio_iommu_type1.c   | 446 +-
 include/linux/iommu.h |  21 +-
 include/linux/vfio.h  |  14 +
 include/uapi/linux/iommu.h|   3 +
 include/uapi/linux/vfio.h |   6 +
 10 files changed, 661 insertions(+), 21 deletions(-)

-- 
2.19.1



Re: [PATCH v3 0/4] KVM: arm64: Add VLPI migration support on GICv4.1

2021-02-26 Thread Shenming Lu
Hi Marc,

Gentle ping. Does this series need any further modification? I hope you can
pick it up. :-)

Thanks,
Shenming

On 2021/1/27 20:13, Shenming Lu wrote:
> Hi Marc, sorry for the late commit.
> 
> In GICv4.1, migration has been supported except for (directly-injected)
> VLPI. And GICv4.1 Spec explicitly gives a way to get the VLPI's pending
> state (which was crucially missing in GICv4.0). So we make VLPI migration
> capable on GICv4.1 in this patch set.
> 
> In order to support VLPI migration, we need to save and restore all
> required configuration information and pending states of VLPIs. But
> in fact, the configuration information of VLPIs has already been saved
> (or will be reallocated on the dst host...) in vgic(kvm) migration.
> So we only have to migrate the pending states of VLPIs specially.
> 
> Below is the related workflow in migration.
> 
> On the save path:
>   In migration completion:
>   pause all vCPUs
>   |
>   call each VM state change handler:
>   pause other devices (just keep from sending interrupts, 
> and
>   such as VFIO migration protocol has already realized it 
> [1])
>   |
>   flush ITS tables into guest RAM
>   |
>   flush RDIST pending tables (also flush VLPI state here)
>   |
>   ...
> On the resume path:
>   load each device's state:
>   restore ITS tables (include pending tables) from guest RAM
>   |
>   for other (PCI) devices (paused), if configured to have VLPIs,
>   establish the forwarding paths of their VLPIs (and transfer
>   the pending states from kvm's vgic to VPT here)
> 
> We have tested this series in VFIO migration, and found some related
> issues in QEMU [2].
> 
> Links:
> [1] vfio: UAPI for migration interface for device state:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a8a24f3f6e38103b77cf399c38eb54e1219d00d6
> [2] vfio: Some fixes and optimizations for VFIO migration:
> https://patchwork.ozlabs.org/cover/1413263/
> 
> History:
> 
> v2 -> v3
>  - Add the vgic initialized check to ensure that the allocation and enabling
>of the doorbells have already been done before unmapping the vPEs.
>  - Check all get_vlpi_state related conditions in save_pending_tables in one 
> place.
>  - Nit fixes.
> 
> v1 -> v2:
>  - Get the VLPI state from the KVM side.
>  - Nit fixes.
> 
> Thanks,
> Shenming
> 
> 
> Shenming Lu (3):
>   KVM: arm64: GICv4.1: Add function to get VLPI state
>   KVM: arm64: GICv4.1: Try to save hw pending state in
> save_pending_tables
>   KVM: arm64: GICv4.1: Give a chance to save VLPI's pending state
> 
> Zenghui Yu (1):
>   KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side
> 
>  .../virt/kvm/devices/arm-vgic-its.rst |  2 +-
>  arch/arm64/kvm/vgic/vgic-its.c|  6 +-
>  arch/arm64/kvm/vgic/vgic-v3.c | 61 +--
>  arch/arm64/kvm/vgic/vgic-v4.c | 33 ++
>  arch/arm64/kvm/vgic/vgic.h|  1 +
>  5 files changed, 93 insertions(+), 10 deletions(-)
> 


Re: [PATCH v11 04/13] vfio/pci: Add VFIO_REGION_TYPE_NESTED region type

2021-02-23 Thread Shenming Lu
> +static int vfio_pci_dma_fault_init(struct vfio_pci_device *vdev)
> +{
> + struct vfio_region_dma_fault *header;
> + struct iommu_domain *domain;
> + size_t size;
> + bool nested;
> + int ret;
> +
> + domain = iommu_get_domain_for_dev(>pdev->dev);
> + ret = iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, );
> + if (ret || !nested)
> + return ret;

Hi Eric,

It seems that the type of nested should be int; using bool might trigger
a panic in arm_smmu_domain_get_attr().

Thanks,
Shenming


Re: [RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough

2021-02-10 Thread Shenming Lu
On 2021/2/9 19:06, Liu, Yi L wrote:
>> From: Tian, Kevin 
>> Sent: Thursday, February 4, 2021 2:52 PM
>>
>>> From: Shenming Lu 
>>> Sent: Tuesday, February 2, 2021 2:42 PM
>>>
>>> On 2021/2/1 15:56, Tian, Kevin wrote:
>>>>> From: Alex Williamson 
>>>>> Sent: Saturday, January 30, 2021 6:58 AM
>>>>>
>>>>> On Mon, 25 Jan 2021 17:03:58 +0800
>>>>> Shenming Lu  wrote:
>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> The static pinning and mapping problem in VFIO and possible
>> solutions
>>>>>> have been discussed a lot [1, 2]. One of the solutions is to add I/O
>>>>>> page fault support for VFIO devices. Different from those relatively
>>>>>> complicated software approaches such as presenting a vIOMMU that
>>>>> provides
>>>>>> the DMA buffer information (might include para-virtualized
>>> optimizations),
>>>>>> IOPF mainly depends on the hardware faulting capability, such as the
>>> PCIe
>>>>>> PRI extension or Arm SMMU stall model. What's more, the IOPF
>> support
>>> in
>>>>>> the IOMMU driver is being implemented in SVA [3]. So do we
>> consider to
>>>>>> add IOPF support for VFIO passthrough based on the IOPF part of SVA
>> at
>>>>>> present?
>>>>>>
>>>>>> We have implemented a basic demo only for one stage of translation
>>> (GPA
>>>>>> -> HPA in virtualization, note that it can be configured at either 
>>>>>> stage),
>>>>>> and tested on Hisilicon Kunpeng920 board. The nested mode is more
>>>>> complicated
>>>>>> since VFIO only handles the second stage page faults (same as the
>> non-
>>>>> nested
>>>>>> case), while the first stage page faults need to be further delivered to
>>>>>> the guest, which is being implemented in [4] on ARM. My thought on
>> this
>>>>>> is to report the page faults to VFIO regardless of the occured stage
>> (try
>>>>>> to carry the stage information), and handle respectively according to
>> the
>>>>>> configured mode in VFIO. Or the IOMMU driver might evolve to
>> support
>>>>> more...
>>>>>>
>>>>>> Might TODO:
>>>>>>  - Optimize the faulting path, and measure the performance (it might
>> still
>>>>>>be a big issue).
>>>>>>  - Add support for PRI.
>>>>>>  - Add a MMU notifier to avoid pinning.
>>>>>>  - Add support for the nested mode.
>>>>>> ...
>>>>>>
>>>>>> Any comments and suggestions are very welcome. :-)
>>>>>
>>>>> I expect performance to be pretty bad here, the lookup involved per
>>>>> fault is excessive.  There are cases where a user is not going to be
>>>>> willing to have a slow ramp up of performance for their devices as they
>>>>> fault in pages, so we might need to considering making this
>>>>> configurable through the vfio interface.  Our page mapping also only
>>>>
>>>> There is another factor to be considered. The presence of IOMMU_
>>>> DEV_FEAT_IOPF just indicates the device capability of triggering I/O
>>>> page fault through the IOMMU, but not exactly means that the device
>>>> can tolerate I/O page fault for arbitrary DMA requests.
>>>
>>> Yes, so I add a iopf_enabled field in VFIO to indicate the whole path
>> faulting
>>> capability and set it to true after registering a VFIO page fault handler.
>>>
>>>> In reality, many
>>>> devices allow I/O faulting only in selective contexts. However, there
>>>> is no standard way (e.g. PCISIG) for the device to report whether
>>>> arbitrary I/O fault is allowed. Then we may have to maintain device
>>>> specific knowledge in software, e.g. in an opt-in table to list devices
>>>> which allows arbitrary faults. For devices which only support selective
>>>> faulting, a mediator (either through vendor extensions on vfio-pci-core
>>>> or a mdev wrapper) might be necessary to help lock down non-faultable
>>>> mappings and then enable faulting on the rest mappings.
>>>
>>> For devices which only support selective faulting, they could tell it

Re: [RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough

2021-02-07 Thread Shenming Lu
On 2021/2/7 16:20, Tian, Kevin wrote:
>> From: Jean-Philippe Brucker 
>> Sent: Friday, February 5, 2021 6:37 PM
>>
>> Hi,
>>
>> On Thu, Feb 04, 2021 at 06:52:10AM +, Tian, Kevin wrote:
>>> The static pinning and mapping problem in VFIO and possible
>> solutions
>>> have been discussed a lot [1, 2]. One of the solutions is to add I/O
>>> page fault support for VFIO devices. Different from those relatively
>>> complicated software approaches such as presenting a vIOMMU that
>> provides
>>> the DMA buffer information (might include para-virtualized
 optimizations),
>>
>> I'm curious about the performance difference between this and the
>> map/unmap vIOMMU, as well as the coIOMMU. This is probably a lot faster
>> but those don't depend on IOPF which is a pretty rare feature at the
>> moment.

Yeah, I will give the performance data later.

>>
>> [...]
> In reality, many
> devices allow I/O faulting only in selective contexts. However, there
> is no standard way (e.g. PCISIG) for the device to report whether
> arbitrary I/O fault is allowed. Then we may have to maintain device
> specific knowledge in software, e.g. in an opt-in table to list devices
> which allows arbitrary faults. For devices which only support selective
> faulting, a mediator (either through vendor extensions on vfio-pci-core
> or a mdev wrapper) might be necessary to help lock down non-
>> faultable
> mappings and then enable faulting on the rest mappings.

 For devices which only support selective faulting, they could tell it to 
 the
 IOMMU driver and let it filter out non-faultable faults? Do I get it wrong?
>>>
>>> Not exactly to IOMMU driver. There is already a vfio_pin_pages() for
>>> selectively page-pinning. The matter is that 'they' imply some device
>>> specific logic to decide which pages must be pinned and such knowledge
>>> is outside of VFIO.
>>>
>>> From enabling p.o.v we could possibly do it in phased approach. First
>>> handles devices which tolerate arbitrary DMA faults, and then extends
>>> to devices with selective-faulting. The former is simpler, but with one
>>> main open whether we want to maintain such device IDs in a static
>>> table in VFIO or rely on some hints from other components (e.g. PF
>>> driver in VF assignment case). Let's see how Alex thinks about it.
>>
>> Do you think selective-faulting will be the norm, or only a problem for
>> initial IOPF implementations?  To me it's the selective-faulting kind of
>> device that will be the odd one out, but that's pure speculation. Either
>> way maintaining a device list seems like a pain.
> 
> I would think it's norm for quite some time (e.g. multiple years), as from
> what I learned turning a complex accelerator to an implementation 
> tolerating arbitrary DMA fault is way complex (in every critical path) and
> not cost effective (tracking in-fly requests). It might be OK for some 
> purposely-built devices in specific usage but for most it has to be an 
> evolving path toward the 100%-faultable goal...
> 
>>
>> [...]
>>> Yes, it's in plan but just not happened yet. We are still focusing on guest
>>> SVA part thus only the 1st-level page fault (+Yi/Jacob). It's always
>> welcomed
>>> to collaborate/help if you have time. 
>>
>> By the way the current fault report API is missing a way to invalidate
>> partial faults: when the IOMMU device's PRI queue overflows, it may
>> auto-respond to page request groups that were already partially reported
>> by the IOMMU driver. Upon detecting an overflow, the IOMMU driver needs
>> to
>> tell all fault consumers to discard their partial groups.
>> iopf_queue_discard_partial() [1] does this for the internal IOPF handler
>> but we have nothing for the lower-level fault handler at the moment. And
>> it gets more complicated when injecting IOPFs to guests, we'd need a
>> mechanism to recall partial groups all the way through kernel->userspace
>> and userspace->guest.
> 
> I didn't know how to recall partial groups through emulated vIOMMUs
> (at least for virtual VT-d). Possibly it could be supported by virtio-iommu.
> But in any case I consider it more like an optimization instead of a 
> functional
> requirement (and could be avoided in below Shenming's suggestion).
> 
>>
>> Shenming suggests [2] to also use the IOPF handler for IOPFs managed by
>> device drivers. It's worth considering in my opinion because we could hold
>> partial groups within the kernel and only report full groups to device
>> drivers (and guests). In addition we'd consolidate tracking of IOPFs,
>> since they're done both by iommu_report_device_fault() and the IOPF
>> handler at the moment.
> 
> I also think it's the right thing to do. In concept w/ or w/o DEV_FEAT_IOPF
> just reflects how IOPFs are delivered to the system software. In the end 
> IOPFs are all about permission violations in the IOMMU page tables thus
> we should try to reuse/consolidate the IOMMU fault 

Re: [RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough

2021-02-01 Thread Shenming Lu
On 2021/2/1 15:56, Tian, Kevin wrote:
>> From: Alex Williamson 
>> Sent: Saturday, January 30, 2021 6:58 AM
>>
>> On Mon, 25 Jan 2021 17:03:58 +0800
>> Shenming Lu  wrote:
>>
>>> Hi,
>>>
>>> The static pinning and mapping problem in VFIO and possible solutions
>>> have been discussed a lot [1, 2]. One of the solutions is to add I/O
>>> page fault support for VFIO devices. Different from those relatively
>>> complicated software approaches such as presenting a vIOMMU that
>> provides
>>> the DMA buffer information (might include para-virtualized optimizations),
>>> IOPF mainly depends on the hardware faulting capability, such as the PCIe
>>> PRI extension or Arm SMMU stall model. What's more, the IOPF support in
>>> the IOMMU driver is being implemented in SVA [3]. So do we consider to
>>> add IOPF support for VFIO passthrough based on the IOPF part of SVA at
>>> present?
>>>
>>> We have implemented a basic demo only for one stage of translation (GPA
>>> -> HPA in virtualization, note that it can be configured at either stage),
>>> and tested on Hisilicon Kunpeng920 board. The nested mode is more
>> complicated
>>> since VFIO only handles the second stage page faults (same as the non-
>> nested
>>> case), while the first stage page faults need to be further delivered to
>>> the guest, which is being implemented in [4] on ARM. My thought on this
>>> is to report the page faults to VFIO regardless of the occured stage (try
>>> to carry the stage information), and handle respectively according to the
>>> configured mode in VFIO. Or the IOMMU driver might evolve to support
>> more...
>>>
>>> Might TODO:
>>>  - Optimize the faulting path, and measure the performance (it might still
>>>be a big issue).
>>>  - Add support for PRI.
>>>  - Add a MMU notifier to avoid pinning.
>>>  - Add support for the nested mode.
>>> ...
>>>
>>> Any comments and suggestions are very welcome. :-)
>>
>> I expect performance to be pretty bad here, the lookup involved per
>> fault is excessive.  There are cases where a user is not going to be
>> willing to have a slow ramp up of performance for their devices as they
>> fault in pages, so we might need to considering making this
>> configurable through the vfio interface.  Our page mapping also only
> 
> There is another factor to be considered. The presence of IOMMU_
> DEV_FEAT_IOPF just indicates the device capability of triggering I/O 
> page fault through the IOMMU, but not exactly means that the device 
> can tolerate I/O page fault for arbitrary DMA requests.

Yes, so I add a iopf_enabled field in VFIO to indicate the whole path faulting
capability and set it to true after registering a VFIO page fault handler.

> In reality, many 
> devices allow I/O faulting only in selective contexts. However, there
> is no standard way (e.g. PCISIG) for the device to report whether 
> arbitrary I/O fault is allowed. Then we may have to maintain device
> specific knowledge in software, e.g. in an opt-in table to list devices
> which allows arbitrary faults. For devices which only support selective 
> faulting, a mediator (either through vendor extensions on vfio-pci-core
> or a mdev wrapper) might be necessary to help lock down non-faultable 
> mappings and then enable faulting on the rest mappings.

For devices which only support selective faulting, they could tell it to the
IOMMU driver and let it filter out non-faultable faults? Do I get it wrong?

> 
>> grows here, should mappings expire or do we need a least recently
>> mapped tracker to avoid exceeding the user's locked memory limit?  How
>> does a user know what to set for a locked memory limit?  The behavior
>> here would lead to cases where an idle system might be ok, but as soon
>> as load increases with more inflight DMA, we start seeing
>> "unpredictable" I/O faults from the user perspective.  Seems like there
>> are lots of outstanding considerations and I'd also like to hear from
>> the SVA folks about how this meshes with their work.  Thanks,
>>
> 
> The main overlap between this feature and SVA is the IOPF reporting
> framework, which currently still has gap to support both in nested
> mode, as discussed here:
> 
> https://lore.kernel.org/linux-acpi/YAaxjmJW+ZMvrhac@myrica/
> 
> Once that gap is resolved in the future, the VFIO fault handler just 
> adopts different actions according to the fault-level: 1st level faults
> are forwarded to userspace thru the vSVA path while 2nd-level faults
> are fixed (or warned if not intended) by VFIO itself thru the IOMMU
> mapping interface.

I understand what you mean is:
From the perspective of VFIO, first, we need to set FEAT_IOPF, and then
register its own handler with a flag to indicate FLAT or NESTED and which
level is concerned, thus the VFIO handler can handle the page faults
directly according to the carried level information.

Is there any plan for evolving (implementing) the IOMMU driver to support
this? Or could we help with this?  :-)

Thanks,
Shenming

> 
> Thanks
> Kevin
> 


Re: [RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough

2021-01-30 Thread Shenming Lu
On 2021/1/30 6:57, Alex Williamson wrote:
> On Mon, 25 Jan 2021 17:03:58 +0800
> Shenming Lu  wrote:
> 
>> Hi,
>>
>> The static pinning and mapping problem in VFIO and possible solutions
>> have been discussed a lot [1, 2]. One of the solutions is to add I/O
>> page fault support for VFIO devices. Different from those relatively
>> complicated software approaches such as presenting a vIOMMU that provides
>> the DMA buffer information (might include para-virtualized optimizations),
>> IOPF mainly depends on the hardware faulting capability, such as the PCIe
>> PRI extension or Arm SMMU stall model. What's more, the IOPF support in
>> the IOMMU driver is being implemented in SVA [3]. So do we consider to
>> add IOPF support for VFIO passthrough based on the IOPF part of SVA at
>> present?
>>
>> We have implemented a basic demo only for one stage of translation (GPA
>> -> HPA in virtualization, note that it can be configured at either stage),  
>> and tested on Hisilicon Kunpeng920 board. The nested mode is more complicated
>> since VFIO only handles the second stage page faults (same as the non-nested
>> case), while the first stage page faults need to be further delivered to
>> the guest, which is being implemented in [4] on ARM. My thought on this
>> is to report the page faults to VFIO regardless of the occured stage (try
>> to carry the stage information), and handle respectively according to the
>> configured mode in VFIO. Or the IOMMU driver might evolve to support more...
>>
>> Might TODO:
>>  - Optimize the faulting path, and measure the performance (it might still
>>be a big issue).
>>  - Add support for PRI.
>>  - Add a MMU notifier to avoid pinning.
>>  - Add support for the nested mode.
>> ...
>>
>> Any comments and suggestions are very welcome. :-)
> 
> I expect performance to be pretty bad here, the lookup involved per
> fault is excessive.

We might consider pre-pinning more pages as a further optimization.

> There are cases where a user is not going to be
> willing to have a slow ramp up of performance for their devices as they
> fault in pages, so we might need to considering making this
> configurable through the vfio interface.

Yeah, makes sense, I will try to implement this: maybe add an ioctl called
VFIO_IOMMU_ENABLE_IOPF for the Type1 VFIO IOMMU...

> Our page mapping also only
> grows here, should mappings expire or do we need a least recently
> mapped tracker to avoid exceeding the user's locked memory limit?  How
> does a user know what to set for a locked memory limit?

Yeah, we can add an LRU (mapped) tracker to release pages when a memory
limit is exceeded, maybe with a thread that periodically checks this.
And as for the memory limit, maybe we could give the user some levels
(10% (default)/30%/50%/70%/unlimited of the total user memory (mapping size))
to choose from via the VFIO_IOMMU_ENABLE_IOPF ioctl...
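Purely as a hypothetical sketch of that idea (none of these names exist in
this series), the tracker could look roughly like this, with a worker
unmapping from the head of the list whenever the chosen limit is exceeded:

/*
 * Sketch only: each IOPF-mapped page is appended to the tail of an LRU
 * list; a periodic worker unmaps from the head while over the limit.
 */
struct iopf_lru_entry {
	struct list_head	link;
	dma_addr_t		iova;
};

struct iopf_lru {
	struct list_head	head;		/* least recently mapped first */
	unsigned long		nr_pages;
	unsigned long		limit;		/* derived from the chosen level */
};

static void iopf_lru_add(struct iopf_lru *lru, struct iopf_lru_entry *e)
{
	list_add_tail(&e->link, &lru->head);
	lru->nr_pages++;
}

static void iopf_lru_shrink(struct iopf_lru *lru,
			    void (*unmap_one)(dma_addr_t iova))
{
	while (lru->nr_pages > lru->limit && !list_empty(&lru->head)) {
		struct iopf_lru_entry *e;

		e = list_first_entry(&lru->head, struct iopf_lru_entry, link);
		list_del(&e->link);
		lru->nr_pages--;
		unmap_one(e->iova);
		kfree(e);
	}
}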

> The behavior
> here would lead to cases where an idle system might be ok, but as soon
> as load increases with more inflight DMA, we start seeing
> "unpredictable" I/O faults from the user perspective.

"unpredictable" I/O faults? We might see more problems after more testing...

Thanks,
Shenming

> Seems like there
> are lots of outstanding considerations and I'd also like to hear from
> the SVA folks about how this meshes with their work.  Thanks,
> 
> Alex
> 
> .
>


Re: [RFC PATCH v1 3/4] vfio: Try to enable IOPF for VFIO devices

2021-01-30 Thread Shenming Lu
On 2021/1/30 6:42, Alex Williamson wrote:
> On Mon, 25 Jan 2021 17:04:01 +0800
> Shenming Lu  wrote:
> 
>> If IOMMU_DEV_FEAT_IOPF is set for the VFIO device, which means that
>> the delivering of page faults of this device from the IOMMU is enabled,
>> we register the VFIO page fault handler to complete the whole faulting
>> path (HW+SW). And add a iopf_enabled field in struct vfio_device to
>> record it.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  drivers/vfio/vfio.c | 20 
>>  1 file changed, 20 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>> index ff7797260d0f..fd885d99ee0f 100644
>> --- a/drivers/vfio/vfio.c
>> +++ b/drivers/vfio/vfio.c
>> @@ -97,6 +97,7 @@ struct vfio_device {
>>  struct vfio_group   *group;
>>  struct list_headgroup_next;
>>  void*device_data;
>> +booliopf_enabled;
>>  };
>>  
>>  #ifdef CONFIG_VFIO_NOIOMMU
>> @@ -532,6 +533,21 @@ static struct vfio_group 
>> *vfio_group_get_from_dev(struct device *dev)
>>  /**
>>   * Device objects - create, release, get, put, search
>>   */
>> +
>> +static void vfio_device_enable_iopf(struct vfio_device *device)
>> +{
>> +struct device *dev = device->dev;
>> +
>> +if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_IOPF))
>> +return;
>> +
>> +if (WARN_ON(iommu_register_device_fault_handler(dev,
>> +vfio_iommu_dev_fault_handler, dev)))
> 
> The layering here is wrong, vfio-core doesn't manage the IOMMU, we have
> backend IOMMU drivers for that.  We can't even assume we have IOMMU API
> support here, that's what the type1 backend handles.  Thanks,

Thanks for pointing it out, I will correct it: maybe do the enabling via
the VFIO_IOMMU_ENABLE_IOPF ioctl mentioned in the cover and suggest the
user to call it before VFIO_IOMMU_MAP_DMA, also move the iopf_enabled field
from struct vfio_device to struct vfio_iommu...

Thanks,
Shenming

> 
> Alex
> 
>> +return;
>> +
>> +device->iopf_enabled = true;
>> +}
>> +
>>  static
>>  struct vfio_device *vfio_group_create_device(struct vfio_group *group,
>>   struct device *dev,
>> @@ -549,6 +565,8 @@ struct vfio_device *vfio_group_create_device(struct 
>> vfio_group *group,
>>  device->group = group;
>>  device->ops = ops;
>>  device->device_data = device_data;
>> +/* By default try to enable IOPF */
>> +vfio_device_enable_iopf(device);
>>  dev_set_drvdata(dev, device);
>>  
>>  /* No need to get group_lock, caller has group reference */
>> @@ -573,6 +591,8 @@ static void vfio_device_release(struct kref *kref)
>>  mutex_unlock(>device_lock);
>>  
>>  dev_set_drvdata(device->dev, NULL);
>> +if (device->iopf_enabled)
>> +WARN_ON(iommu_unregister_device_fault_handler(device->dev));
>>  
>>  kfree(device);
>>  
> 
> .
> 


Re: [RFC PATCH v1 1/4] vfio/type1: Add a bitmap to track IOPF mapped pages

2021-01-30 Thread Shenming Lu
On 2021/1/30 6:58, Alex Williamson wrote:
> On Mon, 25 Jan 2021 17:03:59 +0800
> Shenming Lu  wrote:
> 
>> When IOPF enabled, the pages are pinned and mapped on demand, we add
>> a bitmap to track them.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 12 
>>  1 file changed, 12 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c 
>> b/drivers/vfio/vfio_iommu_type1.c
>> index 0b4dedaa9128..f1d4de5ab094 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -95,6 +95,7 @@ struct vfio_dma {
>>  struct task_struct  *task;
>>  struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
>>  unsigned long   *bitmap;
>> +unsigned long   *iommu_mapped_bitmap;   /* IOPF mapped bitmap */
>>  };
>>  
>>  struct vfio_group {
>> @@ -143,6 +144,8 @@ struct vfio_regions {
>>  #define DIRTY_BITMAP_PAGES_MAX   ((u64)INT_MAX)
>>  #define DIRTY_BITMAP_SIZE_MAX
>> DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
>>  
>> +#define IOMMU_MAPPED_BITMAP_BYTES(n) DIRTY_BITMAP_BYTES(n)
>> +
>>  static int put_pfn(unsigned long pfn, int prot);
>>  
>>  static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu 
>> *iommu,
>> @@ -949,6 +952,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, 
>> struct vfio_dma *dma)
>>  vfio_unlink_dma(iommu, dma);
>>  put_task_struct(dma->task);
>>  vfio_dma_bitmap_free(dma);
>> +kfree(dma->iommu_mapped_bitmap);
>>  kfree(dma);
>>  iommu->dma_avail++;
>>  }
>> @@ -1354,6 +1358,14 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>>  goto out_unlock;
>>  }
>>  
>> +dma->iommu_mapped_bitmap = kvzalloc(IOMMU_MAPPED_BITMAP_BYTES(size / 
>> PAGE_SIZE),
>> +GFP_KERNEL);
> 
> This is a lot of bloat for all the platforms that don't support this
> feature.  Thanks,

Yes, I will make this dedicated to IOPF.

Thanks,
Shenming

> 
> Alex
> 
>> +if (!dma->iommu_mapped_bitmap) {
>> +ret = -ENOMEM;
>> +kfree(dma);
>> +goto out_unlock;
>> +}
>> +
>>  iommu->dma_avail--;
>>  dma->iova = iova;
>>  dma->vaddr = vaddr;
> 
> .
> 


Re: [RFC PATCH v1 2/4] vfio: Add a page fault handler

2021-01-27 Thread Shenming Lu
On 2021/1/28 1:42, Christoph Hellwig wrote:
> On Mon, Jan 25, 2021 at 05:04:00PM +0800, Shenming Lu wrote:
>> +EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler);
> 
> This function is only used in vfio.c itself, so it should not be
> exported, but rather marked static.
> .
> 

Yeah, it makes sense. Thanks,

Shenming


[PATCH v3 0/4] KVM: arm64: Add VLPI migration support on GICv4.1

2021-01-27 Thread Shenming Lu
Hi Marc, sorry for the late commit.

In GICv4.1, migration has been supported except for (directly-injected)
VLPI. And GICv4.1 Spec explicitly gives a way to get the VLPI's pending
state (which was crucially missing in GICv4.0). So we make VLPI migration
capable on GICv4.1 in this patch set.

In order to support VLPI migration, we need to save and restore all
required configuration information and pending states of VLPIs. But
in fact, the configuration information of VLPIs has already been saved
(or will be reallocated on the dst host...) in vgic(kvm) migration.
So we only have to migrate the pending states of VLPIs specially.

Below is the related workflow in migration.

On the save path:
In migration completion:
pause all vCPUs
|
call each VM state change handler:
pause other devices (just keep from sending interrupts, 
and
such as VFIO migration protocol has already realized it 
[1])
|
flush ITS tables into guest RAM
|
flush RDIST pending tables (also flush VLPI state here)
|
...
On the resume path:
load each device's state:
restore ITS tables (include pending tables) from guest RAM
|
for other (PCI) devices (paused), if configured to have VLPIs,
establish the forwarding paths of their VLPIs (and transfer
the pending states from kvm's vgic to VPT here)

We have tested this series in VFIO migration, and found some related
issues in QEMU [2].

Links:
[1] vfio: UAPI for migration interface for device state:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a8a24f3f6e38103b77cf399c38eb54e1219d00d6
[2] vfio: Some fixes and optimizations for VFIO migration:
https://patchwork.ozlabs.org/cover/1413263/

History:

v2 -> v3
 - Add the vgic initialized check to ensure that the allocation and enabling
   of the doorbells have already been done before unmapping the vPEs.
 - Check all get_vlpi_state related conditions in save_pending_tables in one 
place.
 - Nit fixes.

v1 -> v2:
 - Get the VLPI state from the KVM side.
 - Nit fixes.

Thanks,
Shenming


Shenming Lu (3):
  KVM: arm64: GICv4.1: Add function to get VLPI state
  KVM: arm64: GICv4.1: Try to save hw pending state in
save_pending_tables
  KVM: arm64: GICv4.1: Give a chance to save VLPI's pending state

Zenghui Yu (1):
  KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

 .../virt/kvm/devices/arm-vgic-its.rst |  2 +-
 arch/arm64/kvm/vgic/vgic-its.c|  6 +-
 arch/arm64/kvm/vgic/vgic-v3.c | 61 +--
 arch/arm64/kvm/vgic/vgic-v4.c | 33 ++
 arch/arm64/kvm/vgic/vgic.h|  1 +
 5 files changed, 93 insertions(+), 10 deletions(-)

-- 
2.19.1



[PATCH v3 1/4] KVM: arm64: GICv4.1: Add function to get VLPI state

2021-01-27 Thread Shenming Lu
With GICv4.1 and the vPE unmapped, which indicates the invalidation
of any VPT caches associated with the vPE, we can get the VLPI state
by peeking at the VPT. So we add a function for this.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 19 +++
 arch/arm64/kvm/vgic/vgic.h|  1 +
 2 files changed, 20 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 66508b03094f..ac029ba3d337 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
kvm_arm_resume_guest(kvm);
 }
 
+/*
+ * Must be called with GICv4.1 and the vPE unmapped, which
+ * indicates the invalidation of any VPT caches associated
+ * with the vPE, thus we can get the VLPI state by peeking
+ * at the VPT.
+ */
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
+{
+   struct its_vpe *vpe = >target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+   int mask = BIT(irq->intid % BITS_PER_BYTE);
+   void *va;
+   u8 *ptr;
+
+   va = page_address(vpe->vpt_page);
+   ptr = va + irq->intid / BITS_PER_BYTE;
+
+   *val = !!(*ptr & mask);
+}
+
 /**
  * vgic_v4_init - Initialize the GICv4 data structures
  * @kvm:   Pointer to the VM being initialized
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 64fcd750..d8cfd360838c 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -317,5 +317,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm);
 int vgic_v4_init(struct kvm *kvm);
 void vgic_v4_teardown(struct kvm *kvm);
 void vgic_v4_configure_vsgis(struct kvm *kvm);
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
 
 #endif
-- 
2.19.1



[PATCH v3 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2021-01-27 Thread Shenming Lu
After pausing all vCPUs and devices capable of interrupting, in order
to save the information of all interrupts, besides flushing the pending
states in kvm’s vgic, we also try to flush the states of VLPIs in the
virtual pending tables into guest RAM, but we need to have GICv4.1 and
safely unmap the vPEs first.

Saving the VSGIs needs the vPEs to be mapped and thus might conflict with
the saving of VLPIs, but since we map the vPEs back at the end of
save_pending_tables and both savings require the kvm->lock to be held
(so they can only happen serially), it works fine.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v3.c | 61 +++
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 52915b342351..06b1162b7a0a 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -356,6 +358,32 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
return 0;
 }
 
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+   }
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+   }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,14 +393,26 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
struct vgic_dist *dist = >arch.vgic;
struct vgic_irq *irq;
gpa_t last_ptr = ~(gpa_t)0;
-   int ret;
+   bool vlpi_avail = false;
+   int ret = 0;
u8 val;
 
+   /*
+* As a preparation for getting any VLPI states.
+* The vgic initialized check ensures that the allocation and
+* enabling of the doorbells have already been done.
+*/
+   if (kvm_vgic_global_state.has_gicv4_1 && !WARN_ON(!vgic_initialized(kvm))) {
+   unmap_all_vpes(dist);
+   vlpi_avail = true;
+   }
+
list_for_each_entry(irq, >lpi_list_head, lpi_list) {
int byte_offset, bit_nr;
struct kvm_vcpu *vcpu;
gpa_t pendbase, ptr;
bool stored;
+   bool is_pending = irq->pending_latch;
 
vcpu = irq->target_vcpu;
if (!vcpu)
@@ -387,24 +427,33 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
if (ptr != last_ptr) {
ret = kvm_read_guest_lock(kvm, ptr, , 1);
if (ret)
-   return ret;
+   goto out;
last_ptr = ptr;
}
 
stored = val & (1U << bit_nr);
-   if (stored == irq->pending_latch)
+
+   if (irq->hw && vlpi_avail)
+   vgic_v4_get_vlpi_state(irq, _pending);
+
+   if (stored == is_pending)
continue;
 
-   if (irq->pending_latch)
+   if (is_pending)
val |= 1 << bit_nr;
else
val &= ~(1 << bit_nr);
 
ret = kvm_write_guest_lock(kvm, ptr, , 1);
if (ret)
-   return ret;
+   goto out;
}
-   return 0;
+
+out:
+   if (vlpi_avail)
+   map_all_vpes(dist);
+
+   return ret;
 }
 
 /**
-- 
2.19.1



[PATCH v3 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2021-01-27 Thread Shenming Lu
From: Zenghui Yu 

When setting the forwarding path of a VLPI (switching to HW mode),
we could also transfer the pending state from irq->pending_latch to
the VPT (especially in migration, where the pending states of VLPIs are
restored into kvm's vgic first). We currently send "INT+VSYNC" to make
a VLPI pending.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index ac029ba3d337..a3542af6f04a 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -449,6 +449,20 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
irq->host_irq   = virq;
atomic_inc(>vlpi_count);
 
+   /* Transfer pending state */
+   if (irq->pending_latch) {
+   ret = irq_set_irqchip_state(irq->host_irq,
+   IRQCHIP_STATE_PENDING,
+   irq->pending_latch);
+   WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+   /*
+* Let it be pruned from ap_list later and don't bother
+* the List Register.
+*/
+   irq->pending_latch = false;
+   }
+
 out:
mutex_unlock(>its_lock);
return ret;
-- 
2.19.1



[PATCH v3 4/4] KVM: arm64: GICv4.1: Give a chance to save VLPI's pending state

2021-01-27 Thread Shenming Lu
Before GICv4.1, we don't have direct access to the VLPI's pending
state. So we simply let the save operation fail early when encountering
any VLPI.

But now, on GICv4.1, we don't have to return -EACCES directly. So
let's change the hard-coded behavior and give a chance to save the
VLPI's pending state (and preserve the UAPI).

Signed-off-by: Shenming Lu 
---
 Documentation/virt/kvm/devices/arm-vgic-its.rst | 2 +-
 arch/arm64/kvm/vgic/vgic-its.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst b/Documentation/virt/kvm/devices/arm-vgic-its.rst
index 6c304fd2b1b4..d257eddbae29 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-its.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
 -EFAULT  Invalid guest ram access
 -EBUSY   One or more VCPUS are running
 -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-state is not available
+state is not available without GICv4.1
 ===  ==
 
 KVM_DEV_ARM_VGIC_GRP_ITS_REGS
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 40cbaca81333..ec7543a9617c 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, 
struct its_device *device)
/*
 * If an LPI carries the HW bit, this means that this
 * interrupt is controlled by GICv4, and we do not
-* have direct access to that state. Let's simply fail
-* the save operation...
+* have direct access to that state without GICv4.1.
+* Let's simply fail the save operation...
 */
-   if (ite->irq->hw)
+   if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
return -EACCES;
 
ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
-- 
2.19.1



[RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough

2021-01-26 Thread Shenming Lu
Hi,

The static pinning and mapping problem in VFIO and possible solutions
have been discussed a lot [1, 2]. One of the solutions is to add I/O
page fault support for VFIO devices. Unlike the relatively complicated
software approaches, such as presenting a vIOMMU that provides the DMA
buffer information (possibly with para-virtualized optimizations), IOPF
mainly depends on the hardware faulting capability, such as the PCIe PRI
extension or the Arm SMMU stall model. What's more, the IOPF support in
the IOMMU driver is being implemented in SVA [3]. So should we consider
adding IOPF support for VFIO passthrough based on the IOPF part of SVA
at present?
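
As a purely illustrative sketch (not part of this series; the handler name
and the omitted pin/map step are placeholders, and the actual patches in this
series use vfio_iommu_dev_fault_handler()), hooking a VFIO device into the
SVA IOPF reporting path could look roughly like this:

    static int vfio_iopf_handler(struct iommu_fault *fault, void *data)
    {
            struct device *dev = data;
            struct iommu_page_response resp = {
                    .version = IOMMU_PAGE_RESP_VERSION_1,
                    .code    = IOMMU_PAGE_RESP_INVALID,
            };

            if (fault->type != IOMMU_FAULT_PAGE_REQ)
                    return -EOPNOTSUPP;

            resp.grpid = fault->prm.grpid;

            /*
             * Look up the faulting IOVA in the VFIO DMA mappings, pin the
             * backing page and map it into the IOMMU on demand here...
             */
            resp.code = IOMMU_PAGE_RESP_SUCCESS;

            return iommu_page_response(dev, &resp);
    }

    /* At device registration time, if the IOMMU can deliver faults: */
    if (iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_IOPF))
            WARN_ON(iommu_register_device_fault_handler(dev,
                                                        vfio_iopf_handler,
                                                        dev));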

We have implemented a basic demo only for one stage of translation (GPA
-> HPA in virtualization; note that it can be configured at either stage),
and tested it on a Hisilicon Kunpeng920 board. The nested mode is more
complicated since VFIO only handles the second stage page faults (same as
the non-nested case), while the first stage page faults need to be further
delivered to the guest, which is being implemented in [4] on ARM. My thought
on this is to report the page faults to VFIO regardless of the stage they
occur at (trying to carry the stage information), and handle them according
to the configured mode in VFIO. Or the IOMMU driver might evolve to support
more...

Might TODO:
 - Optimize the faulting path, and measure the performance (it might still
   be a big issue).
 - Add support for PRI.
 - Add a MMU notifier to avoid pinning.
 - Add support for the nested mode.
...

Any comments and suggestions are very welcome. :-)

Links:
[1] Lesokhin I, et al. Page Fault Support for Network Controllers. In ASPLOS,
2016.
[2] Tian K, et al. coIOMMU: A Virtual IOMMU with Cooperative DMA Buffer Tracking
for Efficient Memory Management in Direct I/O. In USENIX ATC, 2020.
[3] iommu: I/O page faults for SMMUv3:

https://patchwork.kernel.org/project/linux-arm-kernel/cover/20210121123623.2060416-1-jean-phili...@linaro.org/
[4] SMMUv3 Nested Stage Setup (VFIO part):

https://patchwork.kernel.org/project/kvm/cover/20201116110030.32335-1-eric.au...@redhat.com/

Thanks,
Shenming


Shenming Lu (4):
  vfio/type1: Add a bitmap to track IOPF mapped pages
  vfio: Add a page fault handler
  vfio: Try to enable IOPF for VFIO devices
  vfio: Allow to pin and map dynamically

 drivers/vfio/vfio.c |  75 ++
 drivers/vfio/vfio_iommu_type1.c | 131 +++-
 include/linux/vfio.h|   6 ++
 3 files changed, 211 insertions(+), 1 deletion(-)

-- 
2.19.1



[RFC PATCH v1 1/4] vfio/type1: Add a bitmap to track IOPF mapped pages

2021-01-25 Thread Shenming Lu
When IOPF is enabled, pages are pinned and mapped on demand, so we add
a bitmap to track them.

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio_iommu_type1.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0b4dedaa9128..f1d4de5ab094 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -95,6 +95,7 @@ struct vfio_dma {
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
unsigned long   *bitmap;
+   unsigned long   *iommu_mapped_bitmap;   /* IOPF mapped bitmap */
 };
 
 struct vfio_group {
@@ -143,6 +144,8 @@ struct vfio_regions {
 #define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
 #define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
+#define IOMMU_MAPPED_BITMAP_BYTES(n) DIRTY_BITMAP_BYTES(n)
+
 static int put_pfn(unsigned long pfn, int prot);
 
 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
@@ -949,6 +952,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, 
struct vfio_dma *dma)
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
vfio_dma_bitmap_free(dma);
+   kfree(dma->iommu_mapped_bitmap);
kfree(dma);
iommu->dma_avail++;
 }
@@ -1354,6 +1358,14 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
goto out_unlock;
}
 
+   dma->iommu_mapped_bitmap = kvzalloc(IOMMU_MAPPED_BITMAP_BYTES(size / PAGE_SIZE),
+   GFP_KERNEL);
+   if (!dma->iommu_mapped_bitmap) {
+   ret = -ENOMEM;
+   kfree(dma);
+   goto out_unlock;
+   }
+
iommu->dma_avail--;
dma->iova = iova;
dma->vaddr = vaddr;
-- 
2.19.1



[RFC PATCH v1 4/4] vfio: Allow to pin and map dynamically

2021-01-25 Thread Shenming Lu
If IOPF is enabled for the whole VFIO container, there is no need to
statically pin and map the entire DMA range; we can do it on demand.
And we unmap and unpin according to the IOPF mapped bitmap when removing
the DMA mapping.

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio.c | 20 +++
 drivers/vfio/vfio_iommu_type1.c | 61 -
 include/linux/vfio.h|  1 +
 3 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index fd885d99ee0f..466959f4d661 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2404,6 +2404,26 @@ int vfio_iommu_dev_fault_handler(struct iommu_fault 
*fault, void *data)
 }
 EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler);
 
+/*
+ * Return 0 if enabled.
+ */
+int vfio_device_iopf_enabled(struct device *dev, void *data)
+{
+   struct vfio_device *device;
+   int ret = 0;
+
+   device = vfio_device_get_from_dev(dev);
+   if (!device)
+   return -ENODEV;
+
+   if (!device->iopf_enabled)
+   ret = 1;
+
+   vfio_device_put(device);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_device_iopf_enabled);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index ac6f00c97897..da84155513e4 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -864,6 +864,43 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
return unmapped;
 }
 
+static long vfio_clear_iommu_mapped_bitmap(struct vfio_iommu *iommu,
+  struct vfio_dma *dma,
+  bool do_accounting)
+{
+   dma_addr_t iova = dma->iova;
+   size_t size = dma->size;
+   uint64_t i, npages = size / PAGE_SIZE;
+   long unlocked = 0;
+
+   for (i = 0; i < npages; i++, iova += PAGE_SIZE) {
+   if (IOMMU_MAPPED_BITMAP_GET(dma, i)) {
+   struct vfio_domain *d;
+   phys_addr_t phys;
+
+   d = list_first_entry(>domain_list,
+struct vfio_domain, next);
+   phys = iommu_iova_to_phys(d->domain, iova);
+   if (WARN_ON(!phys))
+   continue;
+
+   list_for_each_entry(d, >domain_list, next) {
+   iommu_unmap(d->domain, iova, PAGE_SIZE);
+   cond_resched();
+   }
+   vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
+   1, do_accounting);
+
+   bitmap_clear(dma->iommu_mapped_bitmap, i, 1);
+   unlocked++;
+   }
+   }
+
+   if (do_accounting)
+   return 0;
+   return unlocked;
+}
+
 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 bool do_accounting)
 {
@@ -880,6 +917,10 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, 
struct vfio_dma *dma,
if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
return 0;
 
+   if (!dma->iommu_mapped)
+   return vfio_clear_iommu_mapped_bitmap(iommu, dma,
+ do_accounting);
+
/*
 * We use the IOMMU to track the physical addresses, otherwise we'd
 * need a much more complicated tracking system.  Unfortunately that
@@ -1302,6 +1343,23 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu 
*iommu,
return list_empty(iova);
 }
 
+static bool vfio_iommu_iopf_enabled(struct vfio_iommu *iommu)
+{
+   struct vfio_domain *d;
+
+   list_for_each_entry(d, >domain_list, next) {
+   struct vfio_group *g;
+
+   list_for_each_entry(g, >group_list, next) {
+   if (iommu_group_for_each_dev(g->iommu_group, NULL,
+vfio_device_iopf_enabled))
+   return false;
+   }
+   }
+
+   return true;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
   struct vfio_iommu_type1_dma_map *map)
 {
@@ -1408,7 +1466,8 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
vfio_link_dma(iommu, dma);
 
/* Don't pin and map if container doesn't contain IOMMU capable domain*/
-   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+   if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) ||
+   vfio_iommu_iopf_enabled(iommu))
dma->size = size;
else
ret = vfio_pin_map_dma(iommu, dma, size);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 6d535f029f21..cea1e9fd4bb4 100644
--- a/include/linux/vfio.h
+++ b/include/linu

[RFC PATCH v1 2/4] vfio: Add a page fault handler

2021-01-25 Thread Shenming Lu
VFIO manages the passthrough DMA mapping itself. In order to support
IOPF for VFIO devices, we need to add a VFIO page fault handler to
serve the page faults reported by the IOMMU driver.

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio.c | 35 
 drivers/vfio/vfio_iommu_type1.c | 58 +
 include/linux/vfio.h|  5 +++
 3 files changed, 98 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 4ad8a35667a7..ff7797260d0f 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2349,6 +2349,41 @@ struct iommu_domain *vfio_group_iommu_domain(struct 
vfio_group *group)
 }
 EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
 
+int vfio_iommu_dev_fault_handler(struct iommu_fault *fault, void *data)
+{
+   struct device *dev = (struct device *)data;
+   struct vfio_container *container;
+   struct vfio_group *group;
+   struct vfio_iommu_driver *driver;
+   int ret;
+
+   if (!dev)
+   return -EINVAL;
+
+   group = vfio_group_get_from_dev(dev);
+   if (!group)
+   return -ENODEV;
+
+   ret = vfio_group_add_container_user(group);
+   if (ret)
+   goto out;
+
+   container = group->container;
+   driver = container->iommu_driver;
+   if (likely(driver && driver->ops->dynamic_dma_map))
+   ret = driver->ops->dynamic_dma_map(container->iommu_data,
+  fault, dev);
+   else
+   ret = -ENOTTY;
+
+   vfio_group_try_dissolve_container(group);
+
+out:
+   vfio_group_put(group);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index f1d4de5ab094..ac6f00c97897 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -145,6 +145,8 @@ struct vfio_regions {
 #define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
 #define IOMMU_MAPPED_BITMAP_BYTES(n) DIRTY_BITMAP_BYTES(n)
+#define IOMMU_MAPPED_BITMAP_GET(dma, i)((dma->iommu_mapped_bitmap[i / BITS_PER_LONG]   \
+>> (i % BITS_PER_LONG)) & 0x1)
 
 static int put_pfn(unsigned long pfn, int prot);
 
@@ -2992,6 +2994,61 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, 
dma_addr_t user_iova,
return ret;
 }
 
+static int vfio_iommu_type1_dynamic_dma_map(void *iommu_data,
+   struct iommu_fault *fault,
+   struct device *dev)
+{
+   struct vfio_iommu *iommu = iommu_data;
+   dma_addr_t iova = ALIGN_DOWN(fault->prm.addr, PAGE_SIZE);
+   struct vfio_dma *dma;
+   int access_flags = 0;
+   unsigned long bit_offset, vaddr, pfn;
+   enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
+   struct iommu_page_response resp = {0};
+
+   if (fault->type != IOMMU_FAULT_PAGE_REQ)
+   return -EOPNOTSUPP;
+
+   mutex_lock(>lock);
+
+   dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+   if (!dma)
+   goto out_invalid;
+
+   if (fault->prm.perm & IOMMU_FAULT_PERM_READ)
+   access_flags |= IOMMU_READ;
+   if (fault->prm.perm & IOMMU_FAULT_PERM_WRITE)
+   access_flags |= IOMMU_WRITE;
+   if ((dma->prot & access_flags) != access_flags)
+   goto out_invalid;
+
+   bit_offset = (iova - dma->iova) >> PAGE_SHIFT;
+   if (IOMMU_MAPPED_BITMAP_GET(dma, bit_offset))
+   goto out_success;
+
+   vaddr = iova - dma->iova + dma->vaddr;
+   if (vfio_pin_page_external(dma, vaddr, , true))
+   goto out_invalid;
+
+   if (vfio_iommu_map(iommu, iova, pfn, 1, dma->prot)) {
+   vfio_unpin_page_external(dma, iova, true);
+   goto out_invalid;
+   }
+
+   bitmap_set(dma->iommu_mapped_bitmap, bit_offset, 1);
+
+out_success:
+   status = IOMMU_PAGE_RESP_SUCCESS;
+
+out_invalid:
+   mutex_unlock(>lock);
+   resp.version= IOMMU_PAGE_RESP_VERSION_1;
+   resp.grpid  = fault->prm.grpid;
+   resp.code   = status;
+   iommu_page_response(dev, );
+   return 0;
+}
+
 static struct iommu_domain *
 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
struct iommu_group *iommu_group)
@@ -3028,6 +3085,7 @@ static const struct vfio_iommu_driver_ops 
vfio_iommu_driver_ops_type1 = {
.register_notifier  = vfio_iommu_type1_register_notifier,
.unregister_notifier= vfio_iommu_type1_unregister_notifier,
.dma_rw = vfio_iommu_type1_dma_rw,
+   .dynamic_dma_map= vfio_iommu_type1_dynamic_dma_map,
.grou

[RFC PATCH v1 3/4] vfio: Try to enable IOPF for VFIO devices

2021-01-25 Thread Shenming Lu
If IOMMU_DEV_FEAT_IOPF is set for the VFIO device, which means that
the delivery of this device's page faults from the IOMMU is enabled,
we register the VFIO page fault handler to complete the whole faulting
path (HW+SW). And add an iopf_enabled field in struct vfio_device to
record it.

Signed-off-by: Shenming Lu 
---
 drivers/vfio/vfio.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index ff7797260d0f..fd885d99ee0f 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -97,6 +97,7 @@ struct vfio_device {
struct vfio_group   *group;
struct list_headgroup_next;
void*device_data;
+   booliopf_enabled;
 };
 
 #ifdef CONFIG_VFIO_NOIOMMU
@@ -532,6 +533,21 @@ static struct vfio_group *vfio_group_get_from_dev(struct 
device *dev)
 /**
  * Device objects - create, release, get, put, search
  */
+
+static void vfio_device_enable_iopf(struct vfio_device *device)
+{
+   struct device *dev = device->dev;
+
+   if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_IOPF))
+   return;
+
+   if (WARN_ON(iommu_register_device_fault_handler(dev,
+   vfio_iommu_dev_fault_handler, dev)))
+   return;
+
+   device->iopf_enabled = true;
+}
+
 static
 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 struct device *dev,
@@ -549,6 +565,8 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
device->group = group;
device->ops = ops;
device->device_data = device_data;
+   /* By default try to enable IOPF */
+   vfio_device_enable_iopf(device);
dev_set_drvdata(dev, device);
 
/* No need to get group_lock, caller has group reference */
@@ -573,6 +591,8 @@ static void vfio_device_release(struct kref *kref)
mutex_unlock(>device_lock);
 
dev_set_drvdata(device->dev, NULL);
+   if (device->iopf_enabled)
+   WARN_ON(iommu_unregister_device_fault_handler(device->dev));
 
kfree(device);
 
-- 
2.19.1



Re: [RFC PATCH v2 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2021-01-05 Thread Shenming Lu
On 2021/1/5 21:47, Marc Zyngier wrote:
> On 2021-01-05 13:02, Shenming Lu wrote:
>> On 2021/1/5 17:13, Marc Zyngier wrote:
>>> On 2021-01-04 08:16, Shenming Lu wrote:
>>>> After pausing all vCPUs and devices capable of interrupting, in order
>>>> to save the information of all interrupts, besides flushing the pending
>>>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>>>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>>>> safely unmap the vPEs first.
>>>>
>>>> Signed-off-by: Shenming Lu 
>>>> ---
>>>>  arch/arm64/kvm/vgic/vgic-v3.c | 58 +++
>>>>  1 file changed, 52 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>>>> index 9cdf39a94a63..a58c94127cb0 100644
>>>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>>>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>>>> @@ -1,6 +1,8 @@
>>>>  // SPDX-License-Identifier: GPL-2.0-only
>>>>
>>>>  #include 
>>>> +#include 
>>>> +#include 
>>>>  #include 
>>>>  #include 
>>>>  #include 
>>>> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>>>> *kvm, struct vgic_irq *irq)
>>>>  return 0;
>>>>  }
>>>>
>>>> +/*
>>>> + * The deactivation of the doorbell interrupt will trigger the
>>>> + * unmapping of the associated vPE.
>>>> + */
>>>> +static void unmap_all_vpes(struct vgic_dist *dist)
>>>> +{
>>>> +    struct irq_desc *desc;
>>>> +    int i;
>>>> +
>>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>>> +    return;
>>>> +
>>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>>> +    irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>>>> +    }
>>>> +}
>>>> +
>>>> +static void map_all_vpes(struct vgic_dist *dist)
>>>> +{
>>>> +    struct irq_desc *desc;
>>>> +    int i;
>>>> +
>>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>>> +    return;
>>>> +
>>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>>> +    irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>>>> +    }
>>>> +}
>>>> +
>>>>  /**
>>>>   * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
>>>>   * kvm lock and all vcpu lock must be held
>>>> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>>>  struct vgic_dist *dist = >arch.vgic;
>>>>  struct vgic_irq *irq;
>>>>  gpa_t last_ptr = ~(gpa_t)0;
>>>> -    int ret;
>>>> +    int ret = 0;
>>>>  u8 val;
>>>>
>>>> +    /* As a preparation for getting any VLPI states. */
>>>> +    unmap_all_vpes(dist);
>>>
>>> What if the VPEs are not mapped yet? Is it possible to snapshot a VM
>>> that has not run at all?
>>
>> What I see in QEMU is that the saving of the pending tables would only be
>> called when stopping the VM and it needs the current VM state to be RUNNING.
> 
> Sure, but that's what QEMU does, and a different userspace could well do
> something different. It looks to me that I should be able to start (or
> even restore) a guest, and snapshot it immediately. Here, I'm pretty
> sure this wouldn't do the right thing (I have the suspicion that the
> doorbells are not allocated, and that we'll end-up with an Oops at unmap
> time, though I haven't investigated it to be sure).
>

If we can't rely on the userspace, could we check whether it is allowed
(at the right time) before the unmapping? Maybe have a look at vmapp_count?
Although I think snapshotting a VM that has not been started is almost impossible...

>>>
>>>> +
>>>>  list_for_each_entry(irq, >lpi_list_head, lpi_list) {
>>>>  int byte_offset, bit_nr;
>>>>  struct kvm_vcpu *vcpu;
>>>>  gpa_t pendbase, ptr;
>>>>  bool stored;
>>>> +    bool is_pending = irq->pending_latch;
>>>>
>>>>  vcpu = irq->

Re: [RFC PATCH v2 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2021-01-05 Thread Shenming Lu
On 2021/1/5 19:40, Marc Zyngier wrote:
> On 2021-01-05 09:13, Marc Zyngier wrote:
>> On 2021-01-04 08:16, Shenming Lu wrote:
>>> After pausing all vCPUs and devices capable of interrupting, in order
>>> to save the information of all interrupts, besides flushing the pending
>>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>>> safely unmap the vPEs first.
>>>
>>> Signed-off-by: Shenming Lu 
>>> ---
>>>  arch/arm64/kvm/vgic/vgic-v3.c | 58 +++
>>>  1 file changed, 52 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>>> index 9cdf39a94a63..a58c94127cb0 100644
>>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>>> @@ -1,6 +1,8 @@
>>>  // SPDX-License-Identifier: GPL-2.0-only
>>>
>>>  #include 
>>> +#include 
>>> +#include 
>>>  #include 
>>>  #include 
>>>  #include 
>>> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>>> *kvm, struct vgic_irq *irq)
>>>  return 0;
>>>  }
>>>
>>> +/*
>>> + * The deactivation of the doorbell interrupt will trigger the
>>> + * unmapping of the associated vPE.
>>> + */
>>> +static void unmap_all_vpes(struct vgic_dist *dist)
>>> +{
>>> +    struct irq_desc *desc;
>>> +    int i;
>>> +
>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>> +    return;
>>> +
>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>> +    irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>>> +    }
>>> +}
>>> +
>>> +static void map_all_vpes(struct vgic_dist *dist)
>>> +{
>>> +    struct irq_desc *desc;
>>> +    int i;
>>> +
>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>> +    return;
>>> +
>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>> +    irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>>> +    }
>>> +}
>>> +
>>>  /**
>>>   * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
>>>   * kvm lock and all vcpu lock must be held
>>> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>>  struct vgic_dist *dist = >arch.vgic;
>>>  struct vgic_irq *irq;
>>>  gpa_t last_ptr = ~(gpa_t)0;
>>> -    int ret;
>>> +    int ret = 0;
>>>  u8 val;
>>>
>>> +    /* As a preparation for getting any VLPI states. */
>>> +    unmap_all_vpes(dist);
>>
>> What if the VPEs are not mapped yet? Is it possible to snapshot a VM
>> that has not run at all?
> 
> More questions: what happens to vSGIs that were mapped to the VPEs?
> Can they safely be restarted? The spec is not saying much on the subject.

Since we have already paused all vCPUs, there would be no more vSGIs generated,
and also no vSGI would be delivered to the vPE. And the unmapping of the
vPE would not affect the (already) stored vSGI states... I think they could
be safely restarted.

> 
> Once the unmap has taken place, it won't be possible to read their state
> via GICR_VSGIRPEND, and only the memory state can be used. This probably
> needs to be tracked as well.

Yes, since we will map the vPEs back, could we assume that the saving of the
vLPI and vSGI states happens serially? In fact that's what QEMU does.

> 
> Thanks,
> 
>     M.


Re: [RFC PATCH v2 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2021-01-05 Thread Shenming Lu
On 2021/1/5 17:25, Marc Zyngier wrote:
> On 2021-01-04 08:16, Shenming Lu wrote:
>> From: Zenghui Yu 
>>
>> When setting the forwarding path of a VLPI (switch to the HW mode),
>> we could also transfer the pending state from irq->pending_latch to
>> VPT (especially in migration, the pending states of VLPIs are restored
>> into kvm’s vgic first). And we currently send "INT+VSYNC" to trigger
>> a VLPI to pending.
>>
>> Signed-off-by: Zenghui Yu 
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v4.c | 12 
>>  1 file changed, 12 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>> index f211a7c32704..7945d6d09cdd 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>> @@ -454,6 +454,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>> virq,
>>  irq->host_irq    = virq;
>>  atomic_inc(>vlpi_count);
>>
>> +    /* Transfer pending state */
>> +    ret = irq_set_irqchip_state(irq->host_irq,
>> +    IRQCHIP_STATE_PENDING,
>> +    irq->pending_latch);
>> +    WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
> 
> Why do this if pending_latch is 0, which is likely to be
> the overwhelming case?

Yes, there is no need to do this if pending_latch is 0.

> 
>> +
>> +    /*
>> + * Let it be pruned from ap_list later and don't bother
>> + * the List Register.
>> + */
>> +    irq->pending_latch = false;
> 
> What guarantees the pruning? Pruning only happens on vcpu exit,
> which means we may have the same interrupt via both the LR and
> the stream interface, which I don't believe is legal (it is
> like having two LRs holding the same interrupt).

Since the irq's pending_latch is set to false here, it will not be
populated to the LR in vgic_flush_lr_state() (vgic_target_oracle()
will return NULL).
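
For reference, the relevant logic is roughly the following (my simplified
paraphrase from memory, not a verbatim copy of vgic.c):

    static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
    {
        /* An active IRQ must stay on the vcpu it is currently on */
        if (irq->active)
            return irq->vcpu ? irq->vcpu : irq->target_vcpu;

        /* Only pending *and* enabled IRQs are directed to a target vcpu */
        if (irq->enabled && irq_is_pending(irq))
            return irq->target_vcpu;

        /* Otherwise the IRQ should not be queued to any vcpu (no LR used) */
        return NULL;
    }

With pending_latch cleared, irq_is_pending() is false for an (edge-configured)
LPI, so the oracle returns NULL and the IRQ is not populated into an LR.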

> 
>> +
>>  out:
>>  mutex_unlock(>its_lock);
>>  return ret;
> 
> Thanks,
> 
>     M.


Re: [RFC PATCH v2 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2021-01-05 Thread Shenming Lu
On 2021/1/5 17:13, Marc Zyngier wrote:
> On 2021-01-04 08:16, Shenming Lu wrote:
>> After pausing all vCPUs and devices capable of interrupting, in order
>> to save the information of all interrupts, besides flushing the pending
>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>> safely unmap the vPEs first.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v3.c | 58 +++
>>  1 file changed, 52 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>> index 9cdf39a94a63..a58c94127cb0 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>> @@ -1,6 +1,8 @@
>>  // SPDX-License-Identifier: GPL-2.0-only
>>
>>  #include 
>> +#include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>> *kvm, struct vgic_irq *irq)
>>  return 0;
>>  }
>>
>> +/*
>> + * The deactivation of the doorbell interrupt will trigger the
>> + * unmapping of the associated vPE.
>> + */
>> +static void unmap_all_vpes(struct vgic_dist *dist)
>> +{
>> +    struct irq_desc *desc;
>> +    int i;
>> +
>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>> +    return;
>> +
>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +    irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>> +    }
>> +}
>> +
>> +static void map_all_vpes(struct vgic_dist *dist)
>> +{
>> +    struct irq_desc *desc;
>> +    int i;
>> +
>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>> +    return;
>> +
>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +    irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>> +    }
>> +}
>> +
>>  /**
>>   * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
>>   * kvm lock and all vcpu lock must be held
>> @@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>  struct vgic_dist *dist = >arch.vgic;
>>  struct vgic_irq *irq;
>>  gpa_t last_ptr = ~(gpa_t)0;
>> -    int ret;
>> +    int ret = 0;
>>  u8 val;
>>
>> +    /* As a preparation for getting any VLPI states. */
>> +    unmap_all_vpes(dist);
> 
> What if the VPEs are not mapped yet? Is it possible to snapshot a VM
> that has not run at all?

What I see in QEMU is that the saving of the pending tables would only be
called when stopping the VM and it needs the current VM state to be RUNNING.

> 
>> +
>>  list_for_each_entry(irq, >lpi_list_head, lpi_list) {
>>  int byte_offset, bit_nr;
>>  struct kvm_vcpu *vcpu;
>>  gpa_t pendbase, ptr;
>>  bool stored;
>> +    bool is_pending = irq->pending_latch;
>>
>>  vcpu = irq->target_vcpu;
>>  if (!vcpu)
>> @@ -387,24 +425,32 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>  if (ptr != last_ptr) {
>>  ret = kvm_read_guest_lock(kvm, ptr, , 1);
>>  if (ret)
>> -    return ret;
>> +    goto out;
>>  last_ptr = ptr;
>>  }
>>
>>  stored = val & (1U << bit_nr);
>> -    if (stored == irq->pending_latch)
>> +
>> +    if (irq->hw)
>> +    vgic_v4_get_vlpi_state(irq, _pending);
> 
> You don't check the return value here, so I wonder why the checks
> in vgic_v4_get_vlpi_state().

Since I have already checked the condition and reported it in save_its_tables
(patch 4), I just check in get_vlpi_state and don't report it again here.

> 
> Another thing that worries me is that vgic_v4_get_vlpi_state() doesn't
> have any cache invalidation, and can end-up hitting in the CPU cache
> (there is no guarantee of coherency between the GIC and the CPU, only
> that the GIC will have flushed its caches).
> 
> I'd expect this to happen at unmap time, though, in order to avoid
> repeated single byte invalidations.

Ok, I will add a cache invalidation at unmap time.

> 
>> +
>> +    if (stored == is_pending)
>>  continue;
>>
>> -    if (irq->pending_latch)
>> +    if (is_pending)
>>  val |= 1 << bit_nr;
>>  else
>>  val &= ~(1 << bit_nr);
>>
>>  ret = kvm_write_guest_lock(kvm, ptr, , 1);
>>  if (ret)
>> -    return ret;
>> +    goto out;
>>  }
>> -    return 0;
>> +
>> +out:
>> +    map_all_vpes(dist);
>> +
>> +    return ret;
>>  }
>>
>>  /**
> 
> Thanks,
> 
>     M.


[RFC PATCH v2 0/4] KVM: arm64: Add VLPI migration support on GICv4.1

2021-01-04 Thread Shenming Lu
In GICv4.1, migration has been supported except for (directly-injected)
VLPIs. And the GICv4.1 spec explicitly gives a way to get the VLPI's pending
state (which was crucially missing in GICv4.0). So we make VLPI migration
possible on GICv4.1 in this patch set.

In order to support VLPI migration, we need to save and restore all
required configuration information and pending states of VLPIs. But
in fact, the configuration information of VLPIs has already been saved
(or will be reallocated on the dst host...) in vgic(kvm) migration.
So we only have to migrate the pending states of VLPIs specially.

Below is the related workflow in migration.

On the save path:
In migration completion:
pause all vCPUs
|
call each VM state change handler:
pause other devices (just keep from sending interrupts, 
and
such as VFIO migration protocol has already realized it 
[1])
|
flush ITS tables into guest RAM
|
flush RDIST pending tables (also flush VLPI state here)
|
...
On the resume path:
load each device's state:
restore ITS tables (include pending tables) from guest RAM
|
for other (PCI) devices (paused), if configured to have VLPIs,
establish the forwarding paths of their VLPIs (and transfer
the pending states from kvm's vgic to VPT here)

We have tested this series in VFIO migration, and found some related
issues in QEMU [2].

Links:
[1] vfio: UAPI for migration interface for device state:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=a8a24f3f6e38103b77cf399c38eb54e1219d00d6
[2] vfio: Some fixes and optimizations for VFIO migration:
https://patchwork.ozlabs.org/cover/1413263/

History:

v1 -> v2:
 - Get the VLPI state from the KVM side.
 - Nit fixes.

Since there seems to be no better place to transfer the pending states
in patch 3, we just keep it unchanged.

Thanks,
Shenming


Shenming Lu (3):
  KVM: arm64: GICv4.1: Add function to get VLPI state
  KVM: arm64: GICv4.1: Try to save hw pending state in
save_pending_tables
  KVM: arm64: GICv4.1: Give a chance to save VLPI's pending state

Zenghui Yu (1):
  KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

 .../virt/kvm/devices/arm-vgic-its.rst |  2 +-
 arch/arm64/kvm/vgic/vgic-its.c|  6 +-
 arch/arm64/kvm/vgic/vgic-v3.c | 58 +--
 arch/arm64/kvm/vgic/vgic-v4.c | 36 
 arch/arm64/kvm/vgic/vgic.h|  1 +
 5 files changed, 93 insertions(+), 10 deletions(-)

-- 
2.19.1



[RFC PATCH v2 1/4] KVM: arm64: GICv4.1: Add function to get VLPI state

2021-01-04 Thread Shenming Lu
With GICv4.1 and the vPE unmapped, which indicates the invalidation
of any VPT caches associated with the vPE, we can get the VLPI state
by peeking at the VPT. So we add a function for this.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 24 
 arch/arm64/kvm/vgic/vgic.h|  1 +
 2 files changed, 25 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 66508b03094f..f211a7c32704 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -203,6 +203,30 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
kvm_arm_resume_guest(kvm);
 }
 
+/*
+ * Must be called with GICv4.1 and the vPE unmapped, which
+ * indicates the invalidation of any VPT caches associated
+ * with the vPE, thus we can get the VLPI state by peeking
+ * at the VPT.
+ */
+int vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
+{
+   struct its_vpe *vpe = >target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+   int mask = BIT(irq->intid % BITS_PER_BYTE);
+   void *va;
+   u8 *ptr;
+
+   if (!irq->hw || !kvm_vgic_global_state.has_gicv4_1)
+   return -EINVAL;
+
+   va = page_address(vpe->vpt_page);
+   ptr = va + irq->intid / BITS_PER_BYTE;
+
+   *val = !!(*ptr & mask);
+
+   return 0;
+}
+
 /**
  * vgic_v4_init - Initialize the GICv4 data structures
  * @kvm:   Pointer to the VM being initialized
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 64fcd750..9c9b43e0d0b0 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -317,5 +317,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm);
 int vgic_v4_init(struct kvm *kvm);
 void vgic_v4_teardown(struct kvm *kvm);
 void vgic_v4_configure_vsgis(struct kvm *kvm);
+int vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
 
 #endif
-- 
2.19.1



[RFC PATCH v2 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2021-01-04 Thread Shenming Lu
After pausing all vCPUs and devices capable of interrupting, in order
to save the information of all interrupts, besides flushing the pending
states in kvm's vgic, we also try to flush the states of VLPIs in the
virtual pending tables into guest RAM. But we need to have GICv4.1 and
safely unmap the vPEs first.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v3.c | 58 +++
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 9cdf39a94a63..a58c94127cb0 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -356,6 +358,38 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, 
struct vgic_irq *irq)
return 0;
 }
 
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   if (!kvm_vgic_global_state.has_gicv4_1)
+   return;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+   }
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   if (!kvm_vgic_global_state.has_gicv4_1)
+   return;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+   }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,14 +399,18 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
struct vgic_dist *dist = >arch.vgic;
struct vgic_irq *irq;
gpa_t last_ptr = ~(gpa_t)0;
-   int ret;
+   int ret = 0;
u8 val;
 
+   /* As a preparation for getting any VLPI states. */
+   unmap_all_vpes(dist);
+
list_for_each_entry(irq, >lpi_list_head, lpi_list) {
int byte_offset, bit_nr;
struct kvm_vcpu *vcpu;
gpa_t pendbase, ptr;
bool stored;
+   bool is_pending = irq->pending_latch;
 
vcpu = irq->target_vcpu;
if (!vcpu)
@@ -387,24 +425,32 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
if (ptr != last_ptr) {
ret = kvm_read_guest_lock(kvm, ptr, , 1);
if (ret)
-   return ret;
+   goto out;
last_ptr = ptr;
}
 
stored = val & (1U << bit_nr);
-   if (stored == irq->pending_latch)
+
+   if (irq->hw)
+   vgic_v4_get_vlpi_state(irq, _pending);
+
+   if (stored == is_pending)
continue;
 
-   if (irq->pending_latch)
+   if (is_pending)
val |= 1 << bit_nr;
else
val &= ~(1 << bit_nr);
 
ret = kvm_write_guest_lock(kvm, ptr, , 1);
if (ret)
-   return ret;
+   goto out;
}
-   return 0;
+
+out:
+   map_all_vpes(dist);
+
+   return ret;
 }
 
 /**
-- 
2.19.1



[RFC PATCH v2 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2021-01-04 Thread Shenming Lu
From: Zenghui Yu 

When setting the forwarding path of a VLPI (switching to the HW mode),
we can also transfer the pending state from irq->pending_latch to the
VPT (especially in migration, where the pending states of VLPIs are first
restored into kvm's vgic). And we currently send "INT+VSYNC" to make a
VLPI pending.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index f211a7c32704..7945d6d09cdd 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -454,6 +454,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
irq->host_irq   = virq;
atomic_inc(>vlpi_count);
 
+   /* Transfer pending state */
+   ret = irq_set_irqchip_state(irq->host_irq,
+   IRQCHIP_STATE_PENDING,
+   irq->pending_latch);
+   WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+   /*
+* Let it be pruned from ap_list later and don't bother
+* the List Register.
+*/
+   irq->pending_latch = false;
+
 out:
mutex_unlock(>its_lock);
return ret;
-- 
2.19.1



[RFC PATCH v2 4/4] KVM: arm64: GICv4.1: Give a chance to save VLPI's pending state

2021-01-04 Thread Shenming Lu
Before GICv4.1, we do not have direct access to the VLPI's pending
state, so we simply let the save operation fail early when encountering
any VLPI.

But on GICv4.1 we no longer have to return -EACCES directly. So let's
change the hard-coded behavior and give the VLPI's pending state a
chance to be saved (while preserving the UAPI).

Signed-off-by: Shenming Lu 
---
 Documentation/virt/kvm/devices/arm-vgic-its.rst | 2 +-
 arch/arm64/kvm/vgic/vgic-its.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst 
b/Documentation/virt/kvm/devices/arm-vgic-its.rst
index 6c304fd2b1b4..d257eddbae29 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-its.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
 -EFAULT  Invalid guest ram access
 -EBUSY   One or more VCPUS are running
 -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-state is not available
+state is not available without GICv4.1
 ===  ==
 
 KVM_DEV_ARM_VGIC_GRP_ITS_REGS
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 40cbaca81333..ec7543a9617c 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, 
struct its_device *device)
/*
 * If an LPI carries the HW bit, this means that this
 * interrupt is controlled by GICv4, and we do not
-* have direct access to that state. Let's simply fail
-* the save operation...
+* have direct access to that state without GICv4.1.
+* Let's simply fail the save operation...
 */
-   if (ite->irq->hw)
+   if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
return -EACCES;
 
ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
-- 
2.19.1



Re: [PATCH RFC] KVM: arm64: vgic: Decouple the check of the EnableLPIs bit from the ITS LPI translation

2020-12-31 Thread Shenming Lu
On 2020/12/31 20:22, Marc Zyngier wrote:
> On 2020-12-31 11:58, Shenming Lu wrote:
>> On 2020/12/31 16:57, Marc Zyngier wrote:
>>> Hi Shemming,
>>>
>>> On 2020-12-31 06:28, Shenming Lu wrote:
>>>> When the EnableLPIs bit is set to 0, any ITS LPI requests in the
>>>> Redistributor would be ignored. And this check is independent from
>>>> the ITS LPI translation. So it might be better to move the check
>>>> of the EnableLPIs bit out of the LPI resolving, and also add it
>>>> to the path that uses the translation cache.
>>>
>>> But by doing that, you are moving the overhead of checking for
>>> EnableLPIs from the slow path (translation walk) to the fast
>>> path (cache hit), which seems counter-productive.
>>
>> Oh, I didn't notice the overhead of the checking, I thought it would
>> be negligible...
> 
> It probably doesn't show on a modern box, but some of the slower
> systems might see it. Overall, this is a design decision to keep
> the translation cache as simple and straightforward as possible:
> if anything affects the output of the cache, we invalidate it,
> and that's it.

Ok, got it.

> 
>>
>>>
>>>> Besides it seems that
>>>> by this the invalidating of the translation cache caused by the LPI
>>>> disabling is unnecessary.
>>>>
>>>> Not sure if I have missed something... Thanks.
>>>
>>> I am certainly missing the purpose of this patch.
>>>
>>> The effect of EnableLPIs being zero is to drop the result of any
>>> translation (a new pending bit) on the floor. Given that, it is
>>> immaterial whether this causes a new translation or hits in the
>>> cache, as the result is still to not pend a new interrupt.
>>>
>>> I get the feeling that you are trying to optimise for the unusual
>>> case where EnableLPIs is 0 *and* you have a screaming device
>>> injecting tons of interrupt. If that is the case, I don't think
>>> this is worth it.
>>
>> In fact, I just found (imagining) that if the EnableLPIs bit is 0,
>> the kvm_vgic_v4_set_forwarding() would fail when performing the LPI
>> translation, but indeed we don't try to pend any interrupts there...
>>
>> By the way, it seems that the LPI disabling would not affect the
>> injection of VLPIs...
> 
> Yes, good point. We could unmap the VPE from all ITS, which would result
> in all translations to be discarded, but this has the really bad side
> effect of *also* preventing the delivery of vSGIs, which isn't what
> you'd expect.
> 
> Overall, I don't think there is a good way to support this, and maybe
> we should just prevent EnableLPIs to be turned off when using direct
> injection. After all, the architecture does allow that for GICv3
> implementations, which is what we emulate.

Agreed, if there is no good way, we could just make the EnableLPIs clearing
unsupported...

Thanks(Happy 2021),
Shenming

> 
> Thanks,
> 
>     M.


Re: [PATCH RFC] KVM: arm64: vgic: Decouple the check of the EnableLPIs bit from the ITS LPI translation

2020-12-31 Thread Shenming Lu
On 2020/12/31 16:57, Marc Zyngier wrote:
> Hi Shemming,
> 
> On 2020-12-31 06:28, Shenming Lu wrote:
>> When the EnableLPIs bit is set to 0, any ITS LPI requests in the
>> Redistributor would be ignored. And this check is independent from
>> the ITS LPI translation. So it might be better to move the check
>> of the EnableLPIs bit out of the LPI resolving, and also add it
>> to the path that uses the translation cache.
> 
> But by doing that, you are moving the overhead of checking for
> EnableLPIs from the slow path (translation walk) to the fast
> path (cache hit), which seems counter-productive.

Oh, I didn't notice the overhead of the checking, I thought it would
be negligible...

> 
>> Besides it seems that
>> by this the invalidating of the translation cache caused by the LPI
>> disabling is unnecessary.
>>
>> Not sure if I have missed something... Thanks.
> 
> I am certainly missing the purpose of this patch.
> 
> The effect of EnableLPIs being zero is to drop the result of any
> translation (a new pending bit) on the floor. Given that, it is
> immaterial whether this causes a new translation or hits in the
> cache, as the result is still to not pend a new interrupt.
> 
> I get the feeling that you are trying to optimise for the unusual
> case where EnableLPIs is 0 *and* you have a screaming device
> injecting tons of interrupt. If that is the case, I don't think
> this is worth it.

In fact, I just found (imagining) that if the EnableLPIs bit is 0,
the kvm_vgic_v4_set_forwarding() would fail when performing the LPI
translation, but indeed we don't try to pend any interrupts there...

By the way, it seems that the LPI disabling would not affect the
injection of VLPIs...

Thanks,
Shenming

> 
> Thanks,
> 
>     M.
> 
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-its.c | 9 +
>>  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 4 +---
>>  2 files changed, 6 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
>> index 40cbaca81333..f53446bc154e 100644
>> --- a/arch/arm64/kvm/vgic/vgic-its.c
>> +++ b/arch/arm64/kvm/vgic/vgic-its.c
>> @@ -683,9 +683,6 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct
>> vgic_its *its,
>>  if (!vcpu)
>>  return E_ITS_INT_UNMAPPED_INTERRUPT;
>>
>> -    if (!vcpu->arch.vgic_cpu.lpis_enabled)
>> -    return -EBUSY;
>> -
>>  vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
>>
>>  *irq = ite->irq;
>> @@ -738,6 +735,9 @@ static int vgic_its_trigger_msi(struct kvm *kvm,
>> struct vgic_its *its,
>>  if (err)
>>  return err;
>>
>> +    if (!irq->target_vcpu->arch.vgic_cpu.lpis_enabled)
>> +    return -EBUSY;
>> +
>>  if (irq->hw)
>>  return irq_set_irqchip_state(irq->host_irq,
>>   IRQCHIP_STATE_PENDING, true);
>> @@ -757,7 +757,8 @@ int vgic_its_inject_cached_translation(struct kvm
>> *kvm, struct kvm_msi *msi)
>>
>>  db = (u64)msi->address_hi << 32 | msi->address_lo;
>>  irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
>> -    if (!irq)
>> +
>> +    if (!irq || !irq->target_vcpu->arch.vgic_cpu.lpis_enabled)
>>  return -EWOULDBLOCK;
>>
>>  raw_spin_lock_irqsave(>irq_lock, flags);
>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> index 15a6c98ee92f..7b0749f7660d 100644
>> --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> @@ -242,10 +242,8 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu 
>> *vcpu,
>>
>>  vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
>>
>> -    if (was_enabled && !vgic_cpu->lpis_enabled) {
>> +    if (was_enabled && !vgic_cpu->lpis_enabled)
>>  vgic_flush_pending_lpis(vcpu);
>> -    vgic_its_invalidate_cache(vcpu->kvm);
>> -    }
>>
>>  if (!was_enabled && vgic_cpu->lpis_enabled)
>>  vgic_enable_lpis(vcpu);
> 


[PATCH RFC] KVM: arm64: vgic: Decouple the check of the EnableLPIs bit from the ITS LPI translation

2020-12-30 Thread Shenming Lu
When the EnableLPIs bit is set to 0, any ITS LPI requests in the
Redistributor are ignored. And this check is independent from the
ITS LPI translation. So it might be better to move the check of the
EnableLPIs bit out of the LPI resolving, and also add it to the path
that uses the translation cache. Besides, it seems that with this
change, invalidating the translation cache when LPIs are disabled
becomes unnecessary.

Not sure if I have missed something... Thanks.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-its.c | 9 +
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 4 +---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 40cbaca81333..f53446bc154e 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -683,9 +683,6 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its 
*its,
if (!vcpu)
return E_ITS_INT_UNMAPPED_INTERRUPT;
 
-   if (!vcpu->arch.vgic_cpu.lpis_enabled)
-   return -EBUSY;
-
vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
 
*irq = ite->irq;
@@ -738,6 +735,9 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct 
vgic_its *its,
if (err)
return err;
 
+   if (!irq->target_vcpu->arch.vgic_cpu.lpis_enabled)
+   return -EBUSY;
+
if (irq->hw)
return irq_set_irqchip_state(irq->host_irq,
 IRQCHIP_STATE_PENDING, true);
@@ -757,7 +757,8 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, 
struct kvm_msi *msi)
 
db = (u64)msi->address_hi << 32 | msi->address_lo;
irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
-   if (!irq)
+
+   if (!irq || !irq->target_vcpu->arch.vgic_cpu.lpis_enabled)
return -EWOULDBLOCK;
 
raw_spin_lock_irqsave(>irq_lock, flags);
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 15a6c98ee92f..7b0749f7660d 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -242,10 +242,8 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
 
vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
 
-   if (was_enabled && !vgic_cpu->lpis_enabled) {
+   if (was_enabled && !vgic_cpu->lpis_enabled)
vgic_flush_pending_lpis(vcpu);
-   vgic_its_invalidate_cache(vcpu->kvm);
-   }
 
if (!was_enabled && vgic_cpu->lpis_enabled)
vgic_enable_lpis(vcpu);
-- 
2.19.1



Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-12-16 Thread Shenming Lu
On 2020/12/16 18:35, Auger Eric wrote:
> Hi Shenming,
> 
> On 12/1/20 1:15 PM, Shenming Lu wrote:
>> On 2020/12/1 19:50, Marc Zyngier wrote:
>>> On 2020-12-01 11:40, Shenming Lu wrote:
>>>> On 2020/12/1 18:55, Marc Zyngier wrote:
>>>>> On 2020-11-30 07:23, Shenming Lu wrote:
>>>>>
>>>>> Hi Shenming,
>>>>>
>>>>>> We are pondering over this problem these days, but still don't get a
>>>>>> good solution...
>>>>>> Could you give us some advice on this?
>>>>>>
>>>>>> Or could we move the restoring of the pending states (include the sync
>>>>>> from guest RAM and the transfer to HW) to the GIC VM state change 
>>>>>> handler,
>>>>>> which is completely corresponding to save_pending_tables (more 
>>>>>> symmetric?)
>>>>>> and don't expose GICv4...
>>>>>
>>>>> What is "the GIC VM state change handler"? Is that a QEMU thing?
>>>>
>>>> Yeah, it is a a QEMU thing...
>>>>
>>>>> We don't really have that concept in KVM, so I'd appreciate if you could
>>>>> be a bit more explicit on this.
>>>>
>>>> My thought is to add a new interface (to QEMU) for the restoring of
>>>> the pending states, which is completely corresponding to
>>>> KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES...
>>>> And it is called from the GIC VM state change handler in QEMU, which
>>>> is happening after the restoring (call kvm_vgic_v4_set_forwarding())
>>>> but before the starting (running) of the VFIO device.
>>>
>>> Right, that makes sense. I still wonder how much the GIC save/restore
>>> stuff differs from other architectures that implement similar features,
>>> such as x86 with VT-D.
>>
>> I am not familiar with it...
>>
>>>
>>> It is obviously too late to change the userspace interface, but I wonder
>>> whether we missed something at the time.
>>
>> The interface seems to be really asymmetrical?...
> 
> in qemu d5aa0c229a ("hw/intc/arm_gicv3_kvm: Implement pending table
> save") commit message, it is traced:
> 
> "There is no explicit restore as the tables are implicitly sync'ed
> on ITS table restore and on LPI enable at redistributor level."
> 
> At that time there was no real justification behind adding the RESTORE
> fellow attr.
> 
> Maybe a stupid question but isn't it possible to unset the forwarding
> when saving and rely on VFIO to automatically restore it when resuming
> on destination?

It seems that the unset_forwarding would not be called when saving, it would
be called after migration completion...
As for the resuming/set_forwarding, I still wonder: is it really improper to
transfer the pending states from vgic to VPT in set_forwarding (not only in
migration)?...  -_-

Thanks,
Shenming

> 
> Thanks
> 
> Eric
> 
> 
>>
>> Or is there a possibility that we could know which irq is hw before the VFIO
>> device calls kvm_vgic_v4_set_forwarding()?
>>
>> Thanks,
>> Shenming
>>
>>>
>>> Thanks,
>>>
>>>     M.
>>
> 
> .
> 


[irqchip: irq/irqchip-next] irqchip/gic-v4.1: Reduce the delay when polling GICR_VPENDBASER.Dirty

2020-12-11 Thread irqchip-bot for Shenming Lu
The following commit has been merged into the irq/irqchip-next branch of 
irqchip:

Commit-ID: 0b39498230ae53e6af981141be99f4c7d5144de6
Gitweb:
https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms/0b39498230ae53e6af981141be99f4c7d5144de6
Author:Shenming Lu 
AuthorDate:Sat, 28 Nov 2020 22:18:56 +08:00
Committer: Marc Zyngier 
CommitterDate: Fri, 11 Dec 2020 14:47:10 

irqchip/gic-v4.1: Reduce the delay when polling GICR_VPENDBASER.Dirty

The 10us delay of the poll on the GICR_VPENDBASER.Dirty bit is too
high, which might greatly affect the total scheduling latency of a
vCPU in our measurement. So we reduce it to 1 to lessen the impact.

Signed-off-by: Shenming Lu 
Signed-off-by: Marc Zyngier 
Link: https://lore.kernel.org/r/20201128141857.983-2-lushenm...@huawei.com
---
 drivers/irqchip/irq-gic-v3-its.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 4069c21..d74ef41 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3808,7 +3808,7 @@ static void its_wait_vpt_parse_complete(void)
WARN_ON_ONCE(readq_relaxed_poll_timeout_atomic(vlpi_base + GICR_VPENDBASER,
   val,
   !(val & GICR_VPENDBASER_Dirty),
-  10, 500));
+  1, 500));
 }
 
 static void its_vpe_schedule(struct its_vpe *vpe)


Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-12-08 Thread Shenming Lu
On 2020/12/1 20:15, Shenming Lu wrote:
> On 2020/12/1 19:50, Marc Zyngier wrote:
>> On 2020-12-01 11:40, Shenming Lu wrote:
>>> On 2020/12/1 18:55, Marc Zyngier wrote:
>>>> On 2020-11-30 07:23, Shenming Lu wrote:
>>>>
>>>> Hi Shenming,
>>>>
>>>>> We are pondering over this problem these days, but still don't get a
>>>>> good solution...
>>>>> Could you give us some advice on this?
>>>>>
>>>>> Or could we move the restoring of the pending states (include the sync
>>>>> from guest RAM and the transfer to HW) to the GIC VM state change handler,
>>>>> which is completely corresponding to save_pending_tables (more symmetric?)
>>>>> and don't expose GICv4...
>>>>
>>>> What is "the GIC VM state change handler"? Is that a QEMU thing?
>>>
>>> Yeah, it is a a QEMU thing...
>>>
>>>> We don't really have that concept in KVM, so I'd appreciate if you could
>>>> be a bit more explicit on this.
>>>
>>> My thought is to add a new interface (to QEMU) for the restoring of
>>> the pending states, which is completely corresponding to
>>> KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES...
>>> And it is called from the GIC VM state change handler in QEMU, which
>>> is happening after the restoring (call kvm_vgic_v4_set_forwarding())
>>> but before the starting (running) of the VFIO device.
>>
>> Right, that makes sense. I still wonder how much the GIC save/restore
>> stuff differs from other architectures that implement similar features,
>> such as x86 with VT-D.
> 
> I am not familiar with it...
> 
>>
>> It is obviously too late to change the userspace interface, but I wonder
>> whether we missed something at the time.
> 
> The interface seems to be really asymmetrical?...
> 
> Or is there a possibility that we could know which irq is hw before the VFIO
> device calls kvm_vgic_v4_set_forwarding()?
> 
> Thanks,
> Shenming
> 
>>
>> Thanks,
>>
>>     M.
> .
> 

Hi Marc,

I am learning VT-d Posted Interrupt (PI) these days.

As far as I can tell, the posted interrupts are first recorded in the Posted
Interrupt Request (*PIR*) field of the Posted Interrupt Descriptor (a temporary
storage area (a data structure in memory) which is specific to PI), and when
the vCPU is running, a notification event (host vector) is generated and sent
to the CPU that the target vCPU is currently scheduled on, which causes the
CPU to transfer the posted interrupts in the PIR field to the *Virtual-APIC
page* of the vCPU (a data structure in kvm where the virtual interrupts
delivered through kvm are put; it is also accessed by the VMX microcode, and
its layout matches the register layout seen by the guest) and directly deliver
them to the vCPU.

So they only have to sync the PIR field to the Virtual-APIC page when saving
for migration, and do nothing for the resuming...
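
To make that concrete, a purely illustrative sketch of the save-side sync
(not the actual KVM x86 code; the structure layout here is a simplified
assumption) could be:

    #define PIR_WORDS (256 / 64)

    struct pi_desc {
            u64 pir[PIR_WORDS]; /* Posted Interrupt Requests, one bit per vector */
            /* control fields (ON, SN, NV, NDST, ...) omitted */
    };

    /* Fold not-yet-delivered posted interrupts into the virtual APIC IRR */
    static void sync_pir_to_virtual_apic(struct pi_desc *pi, u64 *vapic_irr)
    {
            int i;

            for (i = 0; i < PIR_WORDS; i++)
                    vapic_irr[i] |= xchg(&pi->pir[i], 0);
    }

The point is only that the save side reduces to OR-ing the PIR bits into the
IRR; nothing needs to be transferred back on resume.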

Besides, on x86 the setting of the IRQ bypass is independent of the VM interrupt
setup...

Not sure if I have missed something.

In addition, I found that on x86 the vAPIC is enabled at the end of the
migration (just before the VM starts). So I am wondering if we could move the
call to *vgic_enable_lpis()* back there, and transfer the pending state to the
VPT at that point if the irq is hw (and I think the semantics of this function
should include that transfer). In fact, this function depends on the vgic
(lpi_list) having been restored first...

After some exploration, there seems to be no perfect place to transfer the
pending states to HW that is both compatible with the existing interface and
fits the current architecture, so we will have to choose one of the imperfect
options?

Thanks,
Shenming


Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-12-01 Thread Shenming Lu
On 2020/12/1 19:50, Marc Zyngier wrote:
> On 2020-12-01 11:40, Shenming Lu wrote:
>> On 2020/12/1 18:55, Marc Zyngier wrote:
>>> On 2020-11-30 07:23, Shenming Lu wrote:
>>>
>>> Hi Shenming,
>>>
>>>> We are pondering over this problem these days, but still don't get a
>>>> good solution...
>>>> Could you give us some advice on this?
>>>>
>>>> Or could we move the restoring of the pending states (include the sync
>>>> from guest RAM and the transfer to HW) to the GIC VM state change handler,
>>>> which is completely corresponding to save_pending_tables (more symmetric?)
>>>> and don't expose GICv4...
>>>
>>> What is "the GIC VM state change handler"? Is that a QEMU thing?
>>
>> Yeah, it is a QEMU thing...
>>
>>> We don't really have that concept in KVM, so I'd appreciate if you could
>>> be a bit more explicit on this.
>>
>> My thought is to add a new interface (to QEMU) for the restoring of
>> the pending states, which is completely corresponding to
>> KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES...
>> And it is called from the GIC VM state change handler in QEMU, which
>> is happening after the restoring (call kvm_vgic_v4_set_forwarding())
>> but before the starting (running) of the VFIO device.
> 
> Right, that makes sense. I still wonder how much the GIC save/restore
> stuff differs from other architectures that implement similar features,
> such as x86 with VT-D.

I am not familiar with it...

> 
> It is obviously too late to change the userspace interface, but I wonder
> whether we missed something at the time.

The interface does seem really asymmetrical?...

Or is there a possibility that we could know which irq is hw before the VFIO
device calls kvm_vgic_v4_set_forwarding()?

Thanks,
Shenming

> 
> Thanks,
> 
>     M.


Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-12-01 Thread Shenming Lu
On 2020/12/1 18:55, Marc Zyngier wrote:
> On 2020-11-30 07:23, Shenming Lu wrote:
> 
> Hi Shenming,
> 
>> We are pondering over this problem these days, but still don't get a
>> good solution...
>> Could you give us some advice on this?
>>
>> Or could we move the restoring of the pending states (include the sync
>> from guest RAM and the transfer to HW) to the GIC VM state change handler,
>> which is completely corresponding to save_pending_tables (more symmetric?)
>> and don't expose GICv4...
> 
> What is "the GIC VM state change handler"? Is that a QEMU thing?

Yeah, it is a QEMU thing...

> We don't really have that concept in KVM, so I'd appreciate if you could
> be a bit more explicit on this.

My thought is to add a new interface (to QEMU) for restoring the pending
states, which would correspond exactly to KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES...
It would be called from the GIC VM state change handler in QEMU, which runs
after the restore (which calls kvm_vgic_v4_set_forwarding()) but before the
VFIO device is started (running).

Thanks,
Shenming

> 
> Thanks,
> 
>     M.


Re: [PATCH v2 2/2] KVM: arm64: Delay the execution of the polling on the GICR_VPENDBASER.Dirty bit

2020-11-30 Thread Shenming Lu
On 2020/11/30 19:22, Marc Zyngier wrote:
> On 2020-11-28 14:18, Shenming Lu wrote:
>> In order to further reduce the impact of the wait delay of the
>> VPT analysis, we can delay the execution of the polling on the
>> GICR_VPENDBASER.Dirty bit (call it from kvm_vgic_flush_hwstate()
>> corresponding to vPE resident), let the GIC and the CPU work in
>> parallel on the entry path.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v4.c  | 16 
>>  arch/arm64/kvm/vgic/vgic.c |  3 +++
>>  drivers/irqchip/irq-gic-v3-its.c   | 16 
>>  drivers/irqchip/irq-gic-v4.c   | 11 +++
>>  include/kvm/arm_vgic.h |  3 +++
>>  include/linux/irqchip/arm-gic-v4.h |  4 
>>  6 files changed, 49 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>> index b5fa73c9fd35..b0da74809187 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>> @@ -353,6 +353,22 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
>>  return err;
>>  }
>>
>> +void vgic_v4_commit(struct kvm_vcpu *vcpu)
>> +{
>> +    struct its_vpe *vpe = >arch.vgic_cpu.vgic_v3.its_vpe;
>> +
>> +    /*
>> + * No need to wait for the vPE to be ready across a shallow guest
>> + * exit, as only a vcpu_put will invalidate it.
>> + */
>> +    if (vpe->vpe_ready)
>> +    return;
>> +
>> +    its_commit_vpe(vpe);
>> +
>> +    vpe->vpe_ready = true;
> 
> This should be written as:
> 
> if (!ready)
>  commit();
> 
> and ready being driven by the commit() call itself.
> 
>> +}
>> +
>>  static struct vgic_its *vgic_get_its(struct kvm *kvm,
>>   struct kvm_kernel_irq_routing_entry *irq_entry)
>>  {
>> diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
>> index c3643b7f101b..1c597c9885fa 100644
>> --- a/arch/arm64/kvm/vgic/vgic.c
>> +++ b/arch/arm64/kvm/vgic/vgic.c
>> @@ -915,6 +915,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
>>
>>  if (can_access_vgic_from_kernel())
>>  vgic_restore_state(vcpu);
>> +
>> +    if (vgic_supports_direct_msis(vcpu->kvm))
>> +    vgic_v4_commit(vcpu);
>>  }
>>
>>  void kvm_vgic_load(struct kvm_vcpu *vcpu)
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c 
>> b/drivers/irqchip/irq-gic-v3-its.c
>> index 22f427135c6b..f30aba14933e 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -3842,8 +3842,6 @@ static void its_vpe_schedule(struct its_vpe *vpe)
>>  val |= vpe->idai ? GICR_VPENDBASER_IDAI : 0;
>>  val |= GICR_VPENDBASER_Valid;
>>  gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
>> -
>> -    its_wait_vpt_parse_complete();
>>  }
>>
>>  static void its_vpe_deschedule(struct its_vpe *vpe)
>> @@ -3855,6 +3853,8 @@ static void its_vpe_deschedule(struct its_vpe *vpe)
>>
>>  vpe->idai = !!(val & GICR_VPENDBASER_IDAI);
>>  vpe->pending_last = !!(val & GICR_VPENDBASER_PendingLast);
>> +
>> +    vpe->vpe_ready = false;
> 
> This should be set from the its_make_vpe_non_resident() call.
> 
>>  }
>>
>>  static void its_vpe_invall(struct its_vpe *vpe)
>> @@ -3891,6 +3891,10 @@ static int its_vpe_set_vcpu_affinity(struct
>> irq_data *d, void *vcpu_info)
>>  its_vpe_deschedule(vpe);
>>  return 0;
>>
>> +    case COMMIT_VPE:
>> +    its_wait_vpt_parse_complete();
>> +    return 0;
>> +
>>  case INVALL_VPE:
>>  its_vpe_invall(vpe);
>>  return 0;
>> @@ -4052,8 +4056,6 @@ static void its_vpe_4_1_schedule(struct its_vpe *vpe,
>>  val |= FIELD_PREP(GICR_VPENDBASER_4_1_VPEID, vpe->vpe_id);
>>
>>  gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
>> -
>> -    its_wait_vpt_parse_complete();
>>  }
>>
>>  static void its_vpe_4_1_deschedule(struct its_vpe *vpe,
>> @@ -4091,6 +4093,8 @@ static void its_vpe_4_1_deschedule(struct its_vpe *vpe,
>>  GICR_VPENDBASER_PendingLast);
>>  vpe->pending_last = true;
>>  }
>> +
>> +    vpe->vpe_ready = false;
>>  }
>>
>>  static void its_vpe_4_1_invall(struct its_vpe *vpe)
>> @@ -4128,6 +4132,10 @@ static int its_vpe_4_1_set_vcpu_affinity(struct
>> irq_data *d, void *vcpu_info)
>>    

Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-11-29 Thread Shenming Lu
On 2020/11/24 21:12, Shenming Lu wrote:
> On 2020/11/24 16:44, Marc Zyngier wrote:
>> On 2020-11-24 08:10, Shenming Lu wrote:
>>> On 2020/11/23 17:27, Marc Zyngier wrote:
>>>> On 2020-11-23 06:54, Shenming Lu wrote:
>>>>> From: Zenghui Yu 
>>>>>
>>>>> When setting the forwarding path of a VLPI, it is more consistent to
>>>>
>>>> I'm not sure it is more consistent. It is a *new* behaviour, because it 
>>>> only
>>>> matters for migration, which has been so far unsupported.
>>>
>>> Alright, consistent may not be accurate...
>>> But I have doubt that whether there is really no need to transfer the
>>> pending states
>>> from kvm'vgic to VPT in set_forwarding regardless of migration, and the 
>>> similar
>>> for unset_forwarding.
>>
>> If you have to transfer that state outside of the a save/restore, it means 
>> that
>> you have missed the programming of the PCI endpoint. This is an established
>> restriction that the MSI programming must occur *after* the translation has
>> been established using MAPI/MAPTI (see the large comment at the beginning of
>> vgic-v4.c).
>>
>> If you want to revisit this, fair enough. But you will need a lot more than
>> just opportunistically transfer the pending state.
> 
> Thanks, I will look at what you mentioned.
> 
>>
>>>
>>>>
>>>>> also transfer the pending state from irq->pending_latch to VPT (especially
>>>>> in migration, the pending states of VLPIs are restored into kvm’s vgic
>>>>> first). And we currently send "INT+VSYNC" to trigger a VLPI to pending.
>>>>>
>>>>> Signed-off-by: Zenghui Yu 
>>>>> Signed-off-by: Shenming Lu 
>>>>> ---
>>>>>  arch/arm64/kvm/vgic/vgic-v4.c | 12 
>>>>>  1 file changed, 12 insertions(+)
>>>>>
>>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>>>>> index b5fa73c9fd35..cc3ab9cea182 100644
>>>>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>>>>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>>>>> @@ -418,6 +418,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>>>>> virq,
>>>>>  irq->host_irq    = virq;
>>>>>  atomic_inc(>vlpi_count);
>>>>>
>>>>> +    /* Transfer pending state */
>>>>> +    ret = irq_set_irqchip_state(irq->host_irq,
>>>>> +    IRQCHIP_STATE_PENDING,
>>>>> +    irq->pending_latch);
>>>>> +    WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>>>>> +
>>>>> +    /*
>>>>> + * Let it be pruned from ap_list later and don't bother
>>>>> + * the List Register.
>>>>> + */
>>>>> +    irq->pending_latch = false;
>>>>
>>>> It occurs to me that calling into irq_set_irqchip_state() for a large
>>>> number of interrupts can take a significant amount of time. It is also
>>>> odd that you dump the VPT with the VPE unmapped, but rely on the VPE
>>>> being mapped for the opposite operation.
>>>>
>>>> Shouldn't these be symmetric, all performed while the VPE is unmapped?
>>>> It would also save a lot of ITS traffic.
>>>>
>>>
>>> My thought was to use the existing interface directly without unmapping...
>>>
>>> If you want to unmap the vPE and poke the VPT here, as I said in the cover
>>> letter, set/unset_forwarding might also be called when all devices are 
>>> running
>>> at normal run time, in which case the unmapping of the vPE is not allowed...
>>
>> No, I'm suggesting that you don't do anything here, but instead as a 
>> by-product
>> of restoring the ITS tables. What goes wrong if you use the
>> KVM_DEV_ARM_ITS_RESTORE_TABLE backend instead?
> 
> There is an issue if we do it in the restoring of the ITS tables: the 
> transferring
> of the pending state needs the irq to be marked as hw before, which is done 
> by the
> pass-through device, but the configuring of the forwarding path of the VLPI 
> depends
> on the restoring of the vgic first... It is a circular dependency.
> 

Hi Marc,

We have been pondering this problem for days, but still haven't found a good
solution...
Could you give us some advice on this?

Or could we move the restoring of the pending states (including the sync from
guest RAM and the transfer to HW) to the GIC VM state change handler, which
would correspond exactly to save_pending_tables (more symmetric?) and keep
GICv4 unexposed...

Thanks,
Shenming

>>
>>> Another possible solution is to add a new dedicated interface to QEMU
>>> to transfer
>>> these pending states to HW in GIC VM state change handler corresponding to
>>> save_pending_tables?
>>
>> Userspace has no way to know we use GICv4, and I intend to keep it
>> completely out of the loop. The API is already pretty tortuous, and
>> I really don't want to add any extra complexity to it.
>>
>> Thanks,
>>
>>     M.


[PATCH v2 2/2] KVM: arm64: Delay the execution of the polling on the GICR_VPENDBASER.Dirty bit

2020-11-28 Thread Shenming Lu
In order to further reduce the impact of the wait delay of the
VPT analysis, we can delay the execution of the polling on the
GICR_VPENDBASER.Dirty bit (call it from kvm_vgic_flush_hwstate()
corresponding to vPE resident), let the GIC and the CPU work in
parallel on the entry path.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c  | 16 
 arch/arm64/kvm/vgic/vgic.c |  3 +++
 drivers/irqchip/irq-gic-v3-its.c   | 16 
 drivers/irqchip/irq-gic-v4.c   | 11 +++
 include/kvm/arm_vgic.h |  3 +++
 include/linux/irqchip/arm-gic-v4.h |  4 
 6 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index b5fa73c9fd35..b0da74809187 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -353,6 +353,22 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
return err;
 }
 
+void vgic_v4_commit(struct kvm_vcpu *vcpu)
+{
+   struct its_vpe *vpe = >arch.vgic_cpu.vgic_v3.its_vpe;
+
+   /*
+* No need to wait for the vPE to be ready across a shallow guest
+* exit, as only a vcpu_put will invalidate it.
+*/
+   if (vpe->vpe_ready)
+   return;
+
+   its_commit_vpe(vpe);
+
+   vpe->vpe_ready = true;
+}
+
 static struct vgic_its *vgic_get_its(struct kvm *kvm,
 struct kvm_kernel_irq_routing_entry 
*irq_entry)
 {
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index c3643b7f101b..1c597c9885fa 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -915,6 +915,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
+
+   if (vgic_supports_direct_msis(vcpu->kvm))
+   vgic_v4_commit(vcpu);
 }
 
 void kvm_vgic_load(struct kvm_vcpu *vcpu)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 22f427135c6b..f30aba14933e 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3842,8 +3842,6 @@ static void its_vpe_schedule(struct its_vpe *vpe)
val |= vpe->idai ? GICR_VPENDBASER_IDAI : 0;
val |= GICR_VPENDBASER_Valid;
gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
-
-   its_wait_vpt_parse_complete();
 }
 
 static void its_vpe_deschedule(struct its_vpe *vpe)
@@ -3855,6 +3853,8 @@ static void its_vpe_deschedule(struct its_vpe *vpe)
 
vpe->idai = !!(val & GICR_VPENDBASER_IDAI);
vpe->pending_last = !!(val & GICR_VPENDBASER_PendingLast);
+
+   vpe->vpe_ready = false;
 }
 
 static void its_vpe_invall(struct its_vpe *vpe)
@@ -3891,6 +3891,10 @@ static int its_vpe_set_vcpu_affinity(struct irq_data *d, 
void *vcpu_info)
its_vpe_deschedule(vpe);
return 0;
 
+   case COMMIT_VPE:
+   its_wait_vpt_parse_complete();
+   return 0;
+
case INVALL_VPE:
its_vpe_invall(vpe);
return 0;
@@ -4052,8 +4056,6 @@ static void its_vpe_4_1_schedule(struct its_vpe *vpe,
val |= FIELD_PREP(GICR_VPENDBASER_4_1_VPEID, vpe->vpe_id);
 
gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
-
-   its_wait_vpt_parse_complete();
 }
 
 static void its_vpe_4_1_deschedule(struct its_vpe *vpe,
@@ -4091,6 +4093,8 @@ static void its_vpe_4_1_deschedule(struct its_vpe *vpe,
GICR_VPENDBASER_PendingLast);
vpe->pending_last = true;
}
+
+   vpe->vpe_ready = false;
 }
 
 static void its_vpe_4_1_invall(struct its_vpe *vpe)
@@ -4128,6 +4132,10 @@ static int its_vpe_4_1_set_vcpu_affinity(struct irq_data 
*d, void *vcpu_info)
its_vpe_4_1_deschedule(vpe, info);
return 0;
 
+   case COMMIT_VPE:
+   its_wait_vpt_parse_complete();
+   return 0;
+
case INVALL_VPE:
its_vpe_4_1_invall(vpe);
return 0;
diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c
index 0c18714ae13e..6cea71a4e68b 100644
--- a/drivers/irqchip/irq-gic-v4.c
+++ b/drivers/irqchip/irq-gic-v4.c
@@ -258,6 +258,17 @@ int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, 
bool g1en)
return ret;
 }
 
+int its_commit_vpe(struct its_vpe *vpe)
+{
+   struct its_cmd_info info = {
+   .cmd_type = COMMIT_VPE,
+   };
+
+   WARN_ON(preemptible());
+
+   return its_send_vpe_cmd(vpe, );
+}
+
 int its_invall_vpe(struct its_vpe *vpe)
 {
struct its_cmd_info info = {
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index a8d8fdcd3723..f2170df6cf7c 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -401,7 +401,10 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq,
 int kvm_vgic_v4_

[PATCH v2 0/2] KVM: arm64: Optimize the wait for the completion of the VPT analysis

2020-11-28 Thread Shenming Lu
Right after a vPE is made resident, the code starts polling the
GICR_VPENDBASER.Dirty bit until it becomes 0, with delay_us set to 10.
But in our measurements it takes only hundreds of nanoseconds, or 1~2
microseconds, to finish parsing the VPT in most cases. What's more, we
found that the MMIO delay on a GICv4.1 system (HiSilicon) is about 10
times higher than on a GICv4.0 system in kvm-unit-tests (data below).

                    | GICv4.1 emulator | GICv4.0 emulator
mmio_read_user (ns) |            12811 |             1598

After analysis, this is mainly caused by the 10 us delay_us, so it can
really hurt performance.

To avoid this, we can set the delay_us to 1, which is more appropriate
for this situation and still a reasonable universal value. Besides, we
can defer the execution of the polling, giving the GIC a chance to work
in parallel with the CPU on the entry path.
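
For reference, the structure being tuned is essentially a delay_us/timeout_us
poll loop like the user-space sketch below (names are illustrative; the real
code uses readq_relaxed_poll_timeout_atomic()). The per-iteration delay is
pure added latency once the GIC has already finished, which is why both
lowering it and starting the poll later help:

#include <stdbool.h>
#include <unistd.h>

/*
 * Illustrative model of a delay_us/timeout_us poll loop. If the condition
 * becomes true after ~1-2 us, a 10 us per-iteration delay adds ~10 us of
 * pure waiting to every vPE residency.
 */
static int poll_until(bool (*done)(void), unsigned int delay_us,
		      unsigned int timeout_us)
{
	unsigned int waited = 0;

	while (!done()) {
		if (waited >= timeout_us)
			return -1;		/* timed out */
		usleep(delay_us);		/* the cost being tuned */
		waited += delay_us;
	}
	return 0;
}

static bool done_immediately(void)
{
	return true;
}

int main(void)
{
	return poll_until(done_immediately, 1, 500);
}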

Shenming Lu (2):
  irqchip/gic-v4.1: Reduce the delay time of the poll on the
GICR_VPENDBASER.Dirty bit
  KVM: arm64: Delay the execution of the polling on the
GICR_VPENDBASER.Dirty bit

 arch/arm64/kvm/vgic/vgic-v4.c  | 16 
 arch/arm64/kvm/vgic/vgic.c |  3 +++
 drivers/irqchip/irq-gic-v3-its.c   | 18 +-
 drivers/irqchip/irq-gic-v4.c   | 11 +++
 include/kvm/arm_vgic.h |  3 +++
 include/linux/irqchip/arm-gic-v4.h |  4 
 6 files changed, 50 insertions(+), 5 deletions(-)

-- 
2.23.0



[PATCH v2 1/2] irqchip/gic-v4.1: Reduce the delay time of the poll on the GICR_VPENDBASER.Dirty bit

2020-11-28 Thread Shenming Lu
The 10 us delay_us of the poll on the GICR_VPENDBASER.Dirty bit is too
high and, in our measurements, can noticeably affect the total scheduling
latency of a vCPU. So reduce it to 1 us to lessen the impact.

Signed-off-by: Shenming Lu 
---
 drivers/irqchip/irq-gic-v3-its.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 0fec31931e11..22f427135c6b 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3809,7 +3809,7 @@ static void its_wait_vpt_parse_complete(void)
WARN_ON_ONCE(readq_relaxed_poll_timeout_atomic(vlpi_base + 
GICR_VPENDBASER,
   val,
   !(val & 
GICR_VPENDBASER_Dirty),
-  10, 500));
+  1, 500));
 }
 
 static void its_vpe_schedule(struct its_vpe *vpe)
-- 
2.23.0



Re: [PATCH] irqchip/gic-v4.1: Optimize the wait for the completion of the analysis of the VPT

2020-11-27 Thread Shenming Lu
On 2020/11/28 3:35, Marc Zyngier wrote:
> Shenming,
> 
> Somehow this patch ended up in the wrong folder.
> Apologies for the delay reviewing it.>
> On 2020-09-23 07:35, Shenming Lu wrote:
>> Right after a vPE is made resident, the code starts polling the
>> GICR_VPENDBASER.Dirty bit until it becomes 0, where the delay_us
>> is set to 10. But in our measurement, it takes only hundreds of
>> nanoseconds, or 1~2 microseconds, to finish parsing the VPT in most
>> cases. And we also measured the time from vcpu_load() (include it)
>> to __guest_enter() on Kunpeng 920. On average, it takes 2.55 microseconds
>> (not first run && the VPT is empty). So 10 microseconds delay might
>> really hurt performance.
>>
>> To avoid this, we can set the delay_us to 1, which is more appropriate
>> in this situation and universal. Besides, we can delay the execution
>> of its_wait_vpt_parse_complete() (call it from kvm_vgic_flush_hwstate()
>> corresponding to vPE resident), giving the GIC a chance to work in
>> parallel with the CPU on the entry path.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v4.c  | 18 ++
>>  arch/arm64/kvm/vgic/vgic.c |  2 ++
>>  drivers/irqchip/irq-gic-v3-its.c   | 14 +++---
>>  drivers/irqchip/irq-gic-v4.c   | 11 +++
>>  include/kvm/arm_vgic.h |  3 +++
>>  include/linux/irqchip/arm-gic-v4.h |  4 
>>  6 files changed, 49 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>> index b5fa73c9fd35..1d5d2d6894d3 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>> @@ -353,6 +353,24 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
>>  return err;
>>  }
>>
>> +void vgic_v4_wait_vpt(struct kvm_vcpu *vcpu)
> 
> I'd like something a bit more abstract as a name.
> 
> vgic_v4_commit() seems more appropriate, and could be used for other
> purposes.

Yes, it looks more appropriate.

> 
>> +{
>> +    struct its_vpe *vpe;
>> +
>> +    if (kvm_vgic_global_state.type == VGIC_V2 ||
> 
> Why do you test for GICv2? Isn't the vgic_supports_direct_msis() test enough?
> And the test should be moved to kvm_vgic_flush_hwstate(), as we already have
> similar checks there.

Yes, the test for GICv2 is unnecessary; I will correct it.

> 
>> !vgic_supports_direct_msis(vcpu->kvm))
>> +    return;
>> +
>> +    vpe = >arch.vgic_cpu.vgic_v3.its_vpe;
>> +
>> +    if (vpe->vpt_ready)
>> +    return;
>> +
>> +    if (its_wait_vpt(vpe))
>> +    return;
> 
> How can that happen?

Yes, it seems that its_wait_vpt() would always return 0.

> 
>> +
>> +    vpe->vpt_ready = true;
> 
> This is nasty. You need to explain what happens with this state (you are
> trying not to access VPENDBASER across a shallow exit, as only a vcpu_put

Ok, I will add a comment here.

> will invalidate the GIC state). And something like vpe_ready is more
> generic (we try not to have too much of the GICv4 gunk in the KVM code).

Yes, that's better.

> 
>> +}
>> +
>>  static struct vgic_its *vgic_get_its(struct kvm *kvm,
>>   struct kvm_kernel_irq_routing_entry *irq_entry)
>>  {
>> diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
>> index c3643b7f101b..ed810a80cda2 100644
>> --- a/arch/arm64/kvm/vgic/vgic.c
>> +++ b/arch/arm64/kvm/vgic/vgic.c
>> @@ -915,6 +915,8 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
>>
>>  if (can_access_vgic_from_kernel())
>>  vgic_restore_state(vcpu);
>> +
>> +    vgic_v4_wait_vpt(vcpu);
>>  }
>>
>>  void kvm_vgic_load(struct kvm_vcpu *vcpu)
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c 
>> b/drivers/irqchip/irq-gic-v3-its.c
>> index 548de7538632..b7cbc9bcab9d 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -3803,7 +3803,7 @@ static void its_wait_vpt_parse_complete(void)
>>  WARN_ON_ONCE(readq_relaxed_poll_timeout_atomic(vlpi_base + 
>> GICR_VPENDBASER,
>>     val,
>>     !(val & GICR_VPENDBASER_Dirty),
>> -   10, 500));
>> +   1, 500));
> 
> This really should be in a separate patch.

Ok, I will separate it.

> 
>>  }
>>
>>  static void its_vpe_schedule(struct its_vpe *vpe)
>> @@ -3837,7 +3837,7 @@ static void i

Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-11-24 Thread Shenming Lu
On 2020/11/24 16:44, Marc Zyngier wrote:
> On 2020-11-24 08:10, Shenming Lu wrote:
>> On 2020/11/23 17:27, Marc Zyngier wrote:
>>> On 2020-11-23 06:54, Shenming Lu wrote:
>>>> From: Zenghui Yu 
>>>>
>>>> When setting the forwarding path of a VLPI, it is more consistent to
>>>
>>> I'm not sure it is more consistent. It is a *new* behaviour, because it only
>>> matters for migration, which has been so far unsupported.
>>
>> Alright, consistent may not be accurate...
>> But I have doubt that whether there is really no need to transfer the
>> pending states
>> from kvm'vgic to VPT in set_forwarding regardless of migration, and the 
>> similar
>> for unset_forwarding.
> 
> If you have to transfer that state outside of the a save/restore, it means 
> that
> you have missed the programming of the PCI endpoint. This is an established
> restriction that the MSI programming must occur *after* the translation has
> been established using MAPI/MAPTI (see the large comment at the beginning of
> vgic-v4.c).
> 
> If you want to revisit this, fair enough. But you will need a lot more than
> just opportunistically transfer the pending state.

Thanks, I will look at what you mentioned.

> 
>>
>>>
>>>> also transfer the pending state from irq->pending_latch to VPT (especially
>>>> in migration, the pending states of VLPIs are restored into kvm’s vgic
>>>> first). And we currently send "INT+VSYNC" to trigger a VLPI to pending.
>>>>
>>>> Signed-off-by: Zenghui Yu 
>>>> Signed-off-by: Shenming Lu 
>>>> ---
>>>>  arch/arm64/kvm/vgic/vgic-v4.c | 12 
>>>>  1 file changed, 12 insertions(+)
>>>>
>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>>>> index b5fa73c9fd35..cc3ab9cea182 100644
>>>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>>>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>>>> @@ -418,6 +418,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>>>> virq,
>>>>  irq->host_irq    = virq;
>>>>  atomic_inc(>vlpi_count);
>>>>
>>>> +    /* Transfer pending state */
>>>> +    ret = irq_set_irqchip_state(irq->host_irq,
>>>> +    IRQCHIP_STATE_PENDING,
>>>> +    irq->pending_latch);
>>>> +    WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>>>> +
>>>> +    /*
>>>> + * Let it be pruned from ap_list later and don't bother
>>>> + * the List Register.
>>>> + */
>>>> +    irq->pending_latch = false;
>>>
>>> It occurs to me that calling into irq_set_irqchip_state() for a large
>>> number of interrupts can take a significant amount of time. It is also
>>> odd that you dump the VPT with the VPE unmapped, but rely on the VPE
>>> being mapped for the opposite operation.
>>>
>>> Shouldn't these be symmetric, all performed while the VPE is unmapped?
>>> It would also save a lot of ITS traffic.
>>>
>>
>> My thought was to use the existing interface directly without unmapping...
>>
>> If you want to unmap the vPE and poke the VPT here, as I said in the cover
>> letter, set/unset_forwarding might also be called when all devices are 
>> running
>> at normal run time, in which case the unmapping of the vPE is not allowed...
> 
> No, I'm suggesting that you don't do anything here, but instead as a 
> by-product
> of restoring the ITS tables. What goes wrong if you use the
> KVM_DEV_ARM_ITS_RESTORE_TABLE backend instead?

There is an issue if we do it while restoring the ITS tables: transferring
the pending state requires the irq to be marked as hw beforehand, which is
done by the pass-through device, but configuring the forwarding path of the
VLPI depends on the vgic having been restored first... It is a circular
dependency.

> 
>> Another possible solution is to add a new dedicated interface to QEMU
>> to transfer
>> these pending states to HW in GIC VM state change handler corresponding to
>> save_pending_tables?
> 
> Userspace has no way to know we use GICv4, and I intend to keep it
> completely out of the loop. The API is already pretty tortuous, and
> I really don't want to add any extra complexity to it.
> 
> Thanks,
> 
>     M.


Re: [RFC PATCH v1 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2020-11-24 Thread Shenming Lu
On 2020/11/24 16:26, Marc Zyngier wrote:
> On 2020-11-24 07:40, Shenming Lu wrote:
>> On 2020/11/23 17:18, Marc Zyngier wrote:
>>> On 2020-11-23 06:54, Shenming Lu wrote:
>>>> After pausing all vCPUs and devices capable of interrupting, in order
>>>     ^
>>> See my comment below about this.
>>>
>>>> to save the information of all interrupts, besides flushing the pending
>>>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>>>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>>>> safely unmap the vPEs first.
>>>>
>>>> Signed-off-by: Shenming Lu 
>>>> ---
>>>>  arch/arm64/kvm/vgic/vgic-v3.c | 62 +++
>>>>  1 file changed, 56 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>>>> index 9cdf39a94a63..e1b3aa4b2b12 100644
>>>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>>>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>>>> @@ -1,6 +1,8 @@
>>>>  // SPDX-License-Identifier: GPL-2.0-only
>>>>
>>>>  #include 
>>>> +#include 
>>>> +#include 
>>>>  #include 
>>>>  #include 
>>>>  #include 
>>>> @@ -356,6 +358,39 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>>>> *kvm, struct vgic_irq *irq)
>>>>  return 0;
>>>>  }
>>>>
>>>> +/*
>>>> + * With GICv4.1, we can get the VLPI's pending state after unmapping
>>>> + * the vPE. The deactivation of the doorbell interrupt will trigger
>>>> + * the unmapping of the associated vPE.
>>>> + */
>>>> +static void get_vlpi_state_pre(struct vgic_dist *dist)
>>>> +{
>>>> +    struct irq_desc *desc;
>>>> +    int i;
>>>> +
>>>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>>>> +    return;
>>>> +
>>>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>>>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>>>> +    irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>>>> +    }
>>>> +}
>>>> +
>>>> +static void get_vlpi_state_post(struct vgic_dist *dist)
>>>
>>> nit: the naming feels a bit... odd. Pre/post what?
>>
>> My understanding is that the unmapping is a preparation for get_vlpi_state...
>> Maybe just call it unmap/map_all_vpes?
> 
> Yes, much better.
> 
> [...]
> 
>>>> +    if (irq->hw) {
>>>> +    WARN_RATELIMIT(irq_get_irqchip_state(irq->host_irq,
>>>> +    IRQCHIP_STATE_PENDING, _pending),
>>>> +   "IRQ %d", irq->host_irq);
>>>
>>> Isn't this going to warn like mad on a GICv4.0 system where this, by 
>>> definition,
>>> will generate an error?
>>
>> As we have returned an error in save_its_tables if hw && !has_gicv4_1, we 
>> don't
>> have to warn this here?
> 
> Are you referring to the check in vgic_its_save_itt() that occurs in patch 4?
> Fair enough, though I think the use of irq_get_irqchip_state() isn't quite
> what we want, as per my comments on patch #1.
> 
>>>
>>>> +    }
>>>> +
>>>> +    if (stored == is_pending)
>>>>  continue;
>>>>
>>>> -    if (irq->pending_latch)
>>>> +    if (is_pending)
>>>>  val |= 1 << bit_nr;
>>>>  else
>>>>  val &= ~(1 << bit_nr);
>>>>
>>>>  ret = kvm_write_guest_lock(kvm, ptr, , 1);
>>>>  if (ret)
>>>> -    return ret;
>>>> +    goto out;
>>>>  }
>>>> -    return 0;
>>>> +
>>>> +out:
>>>> +    get_vlpi_state_post(dist);
>>>
>>> This bit worries me: you have unmapped the VPEs, so any interrupt that has 
>>> been
>>> generated during that phase is now forever lost (the GIC doesn't have 
>>> ownership
>>> of the pending tables).
>>
>> In my opinion, during this phase, the devices capable of interrupting
>> should have  already been paused (prevent from sending interrupts),
>> such as VFIO migration protocol has already realized it.
> 
> Is that a hard guarantee? Pausing devices *may* be possible for a limited
> set of endpoints, but I'm not sure that is universally possible to restart
> them and expect a consistent state (you have just dropped a bunch of network
> packets on the floor...).

No, as far as I know, if the VFIO device does not support pausing, the
migration would fail early... And the specific action is decided by the
vendor driver.
In fact, VFIO migration is still in an experimental phase... I will keep an
eye on the follow-up development.

> 
>>> Do you really expect the VM to be restartable from that point? I don't see 
>>> how
>>> this is possible.
>>>
>>
>> If the migration has encountered an error, the src VM might be
>> restarted, so we have to map the vPEs back.
> 
> As I said above, I doubt it is universally possible to do so, but
> after all, this probably isn't worse that restarting on the target...
> 
>     M.


Re: [RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-11-24 Thread Shenming Lu
On 2020/11/23 17:27, Marc Zyngier wrote:
> On 2020-11-23 06:54, Shenming Lu wrote:
>> From: Zenghui Yu 
>>
>> When setting the forwarding path of a VLPI, it is more consistent to
> 
> I'm not sure it is more consistent. It is a *new* behaviour, because it only
> matters for migration, which has been so far unsupported.

Alright, "consistent" may not be accurate...
But I doubt whether there is really no need to transfer the pending states
from kvm's vgic to the VPT in set_forwarding regardless of migration, and
similarly for unset_forwarding.

> 
>> also transfer the pending state from irq->pending_latch to VPT (especially
>> in migration, the pending states of VLPIs are restored into kvm’s vgic
>> first). And we currently send "INT+VSYNC" to trigger a VLPI to pending.
>>
>> Signed-off-by: Zenghui Yu 
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v4.c | 12 
>>  1 file changed, 12 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>> index b5fa73c9fd35..cc3ab9cea182 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>> @@ -418,6 +418,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int 
>> virq,
>>  irq->host_irq    = virq;
>>  atomic_inc(>vlpi_count);
>>
>> +    /* Transfer pending state */
>> +    ret = irq_set_irqchip_state(irq->host_irq,
>> +    IRQCHIP_STATE_PENDING,
>> +    irq->pending_latch);
>> +    WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
>> +
>> +    /*
>> + * Let it be pruned from ap_list later and don't bother
>> + * the List Register.
>> + */
>> +    irq->pending_latch = false;
> 
> It occurs to me that calling into irq_set_irqchip_state() for a large
> number of interrupts can take a significant amount of time. It is also
> odd that you dump the VPT with the VPE unmapped, but rely on the VPE
> being mapped for the opposite operation.
> 
> Shouldn't these be symmetric, all performed while the VPE is unmapped?
> It would also save a lot of ITS traffic.
> 

My thought was to use the existing interface directly without unmapping...

If you want to unmap the vPE and poke the VPT here, as I said in the cover
letter, set/unset_forwarding might also be called when all devices are running
at normal run time, in which case the unmapping of the vPE is not allowed...

Another possible solution is to add a new dedicated interface to QEMU to
transfer these pending states to HW in the GIC VM state change handler
corresponding to save_pending_tables?

>> +
>>  out:
>>  mutex_unlock(>its_lock);
>>  return ret;
> 
> Thanks,
> 
>     M.


Re: [RFC PATCH v1 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2020-11-23 Thread Shenming Lu
On 2020/11/23 17:18, Marc Zyngier wrote:
> On 2020-11-23 06:54, Shenming Lu wrote:
>> After pausing all vCPUs and devices capable of interrupting, in order
>     ^
> See my comment below about this.
> 
>> to save the information of all interrupts, besides flushing the pending
>> states in kvm’s vgic, we also try to flush the states of VLPIs in the
>> virtual pending tables into guest RAM, but we need to have GICv4.1 and
>> safely unmap the vPEs first.
>>
>> Signed-off-by: Shenming Lu 
>> ---
>>  arch/arm64/kvm/vgic/vgic-v3.c | 62 +++
>>  1 file changed, 56 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
>> index 9cdf39a94a63..e1b3aa4b2b12 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v3.c
>> @@ -1,6 +1,8 @@
>>  // SPDX-License-Identifier: GPL-2.0-only
>>
>>  #include 
>> +#include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -356,6 +358,39 @@ int vgic_v3_lpi_sync_pending_status(struct kvm
>> *kvm, struct vgic_irq *irq)
>>  return 0;
>>  }
>>
>> +/*
>> + * With GICv4.1, we can get the VLPI's pending state after unmapping
>> + * the vPE. The deactivation of the doorbell interrupt will trigger
>> + * the unmapping of the associated vPE.
>> + */
>> +static void get_vlpi_state_pre(struct vgic_dist *dist)
>> +{
>> +    struct irq_desc *desc;
>> +    int i;
>> +
>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>> +    return;
>> +
>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +    irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
>> +    }
>> +}
>> +
>> +static void get_vlpi_state_post(struct vgic_dist *dist)
> 
> nit: the naming feels a bit... odd. Pre/post what?

My understanding is that the unmapping is a preparation for get_vlpi_state...
Maybe just call it unmap/map_all_vpes?

> 
>> +{
>> +    struct irq_desc *desc;
>> +    int i;
>> +
>> +    if (!kvm_vgic_global_state.has_gicv4_1)
>> +    return;
>> +
>> +    for (i = 0; i < dist->its_vm.nr_vpes; i++) {
>> +    desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
>> +    irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
>> +    }
>> +}
>> +
>>  /**
>>   * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
>>   * kvm lock and all vcpu lock must be held
>> @@ -365,14 +400,17 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>  struct vgic_dist *dist = >arch.vgic;
>>  struct vgic_irq *irq;
>>  gpa_t last_ptr = ~(gpa_t)0;
>> -    int ret;
>> +    int ret = 0;
>>  u8 val;
>>
>> +    get_vlpi_state_pre(dist);
>> +
>>  list_for_each_entry(irq, >lpi_list_head, lpi_list) {
>>  int byte_offset, bit_nr;
>>  struct kvm_vcpu *vcpu;
>>  gpa_t pendbase, ptr;
>>  bool stored;
>> +    bool is_pending = irq->pending_latch;
>>
>>  vcpu = irq->target_vcpu;
>>  if (!vcpu)
>> @@ -387,24 +425,36 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
>>  if (ptr != last_ptr) {
>>  ret = kvm_read_guest_lock(kvm, ptr, , 1);
>>  if (ret)
>> -    return ret;
>> +    goto out;
>>  last_ptr = ptr;
>>  }
>>
>>  stored = val & (1U << bit_nr);
>> -    if (stored == irq->pending_latch)
>> +
>> +    /* also flush hw pending state */
> 
> This comment looks out of place, as we aren't flushing anything.

Ok, I will correct it.

> 
>> +    if (irq->hw) {
>> +    WARN_RATELIMIT(irq_get_irqchip_state(irq->host_irq,
>> +    IRQCHIP_STATE_PENDING, _pending),
>> +   "IRQ %d", irq->host_irq);
> 
> Isn't this going to warn like mad on a GICv4.0 system where this, by 
> definition,
> will generate an error?

As we have already returned an error in save_its_tables if hw && !has_gicv4_1,
we don't have to warn about this here?

> 
>> +    }
>> +
>> +    if (stored == is_pending)
>>  continue;
>>
>> -    if (irq->pending_latch)
>> +    if (is_pending)
>>  val |= 1 << bit_nr;
>>  else
>> 

Re: [RFC PATCH v1 1/4] irqchip/gic-v4.1: Plumb get_irqchip_state VLPI callback

2020-11-23 Thread Shenming Lu
On 2020/11/23 17:01, Marc Zyngier wrote:
> On 2020-11-23 06:54, Shenming Lu wrote:
>> From: Zenghui Yu 
>>
>> Up to now, the irq_get_irqchip_state() callback of its_irq_chip
>> leaves unimplemented since there is no architectural way to get
>> the VLPI's pending state before GICv4.1. Yeah, there has one in
>> v4.1 for VLPIs.
>>
>> With GICv4.1, after unmapping the vPE, which cleans and invalidates
>> any caching of the VPT, we can get the VLPI's pending state by
> 
> This is a crucial note: without this unmapping and invalidation,
> the pending bits are not generally accessible (they could be cached
> in a GIC private structure, cache or otherwise).
> 
>> peeking at the VPT. So we implement the irq_get_irqchip_state()
>> callback of its_irq_chip to do it.
>>
>> Signed-off-by: Zenghui Yu 
>> Signed-off-by: Shenming Lu 
>> ---
>>  drivers/irqchip/irq-gic-v3-its.c | 38 
>>  1 file changed, 38 insertions(+)
>>
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c 
>> b/drivers/irqchip/irq-gic-v3-its.c
>> index 0fec31931e11..287003cacac7 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -1695,6 +1695,43 @@ static void its_irq_compose_msi_msg(struct
>> irq_data *d, struct msi_msg *msg)
>>  iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg);
>>  }
>>
>> +static bool its_peek_vpt(struct its_vpe *vpe, irq_hw_number_t hwirq)
>> +{
>> +    int mask = hwirq % BITS_PER_BYTE;
> 
> nit: this isn't a mask, but a shift instead. BIT(hwirq % BPB) would give
> you a mask.

Ok, I will correct it.
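
For what it's worth, the corrected helper could look roughly like the
standalone sketch below (peek_vpt_bit and the test harness are made-up names,
just to show the byte/mask indexing):

#include <stdio.h>
#include <stdint.h>

#define BITS_PER_BYTE	8
#define BIT(nr)		(1U << (nr))

/*
 * User-space model of the corrected lookup: the VPT is a plain bitmap with
 * one bit per vINTID, so index the byte with hwirq / BITS_PER_BYTE and test
 * it with a real mask, BIT(hwirq % BITS_PER_BYTE).
 */
static int peek_vpt_bit(const uint8_t *vpt, unsigned long hwirq)
{
	uint8_t mask = BIT(hwirq % BITS_PER_BYTE);

	return !!(vpt[hwirq / BITS_PER_BYTE] & mask);
}

int main(void)
{
	uint8_t vpt[32] = { 0 };
	unsigned long hwirq = 21;

	vpt[hwirq / BITS_PER_BYTE] |= BIT(hwirq % BITS_PER_BYTE);
	printf("hwirq %lu pending: %d\n", hwirq, peek_vpt_bit(vpt, hwirq));
	return 0;
}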

> 
>> +    void *va;
>> +    u8 *pt;
>> +
>> +    va = page_address(vpe->vpt_page);
>> +    pt = va + hwirq / BITS_PER_BYTE;
>> +
>> +    return !!(*pt & (1U << mask));
>> +}
>> +
>> +static int its_irq_get_irqchip_state(struct irq_data *d,
>> + enum irqchip_irq_state which, bool *val)
>> +{
>> +    struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>> +    struct its_vlpi_map *map = get_vlpi_map(d);
>> +
>> +    if (which != IRQCHIP_STATE_PENDING)
>> +    return -EINVAL;
>> +
>> +    /* not intended for physical LPI's pending state */
>> +    if (!map)
>> +    return -EINVAL;
>> +
>> +    /*
>> + * In GICv4.1, a VMAPP with {V,Alloc}=={0,1} cleans and invalidates
>> + * any caching of the VPT associated with the vPEID held in the GIC.
>> + */
>> +    if (!is_v4_1(its_dev->its) || atomic_read(>vpe->vmapp_count))
> 
> It isn't clear to me what prevents this from racing against a mapping of
> the VPE. Actually, since we only hold the LPI irqdesc lock, I'm pretty sure
> nothing prevents it.

Yes, should the vmovp_lock be held?
And is it necessary to also hold this lock in
its_vpe_irq_domain_activate/deactivate?

> 
>> +    return -EACCES;
> 
> I can sort of buy EACCESS for a VPE that is currently mapped, but a non-4.1
> ITS should definitely return EINVAL.

Alright, EINVAL looks better.
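
Something along these lines for the split (just an illustrative user-space
model of the error-code choice, not the actual patch):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the agreed error-code split: a pre-4.1 ITS can never expose the
 * VLPI pending state (-EINVAL), while on a 4.1 ITS the state is only
 * unreadable while the vPE is still mapped (-EACCES). Names are made up.
 */
static int check_vpt_readable(bool its_is_v4_1, int vmapp_count)
{
	if (!its_is_v4_1)
		return -EINVAL;		/* no architectural way at all */
	if (vmapp_count)
		return -EACCES;		/* vPE mapped, VPT may be cached in the GIC */
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       check_vpt_readable(false, 0),	/* -EINVAL */
	       check_vpt_readable(true, 1),	/* -EACCES */
	       check_vpt_readable(true, 0));	/* 0 */
	return 0;
}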

> 
>> +
>> +    *val = its_peek_vpt(map->vpe, map->vintid);
>> +
>> +    return 0;
>> +}
>> +
>>  static int its_irq_set_irqchip_state(struct irq_data *d,
>>   enum irqchip_irq_state which,
>>   bool state)
>> @@ -1975,6 +2012,7 @@ static struct irq_chip its_irq_chip = {
>>  .irq_eoi    = irq_chip_eoi_parent,
>>  .irq_set_affinity    = its_set_affinity,
>>  .irq_compose_msi_msg    = its_irq_compose_msi_msg,
>> +    .irq_get_irqchip_state    = its_irq_get_irqchip_state,
> 
> My biggest issue with this is that it isn't a reliable interface.
> It happens to work in the context of KVM, because you make sure it
> is called at the right time, but that doesn't make it safe in general
> (anyone with the interrupt number is allowed to call this at any time).

We check the vmapp_count in it to ensure the vPE has been unmapped, and
leave the unmapping itself to the caller (who should know whether it is the
right time). If the unmapping has not been done, we just return a failure.

> 
> Is there a problem with poking at the VPT page from the KVM side?
> The code should be exactly the same (maybe simpler even), and at least
> you'd be guaranteed to be in the correct context.

Yeah, that also seems like a good choice.
If you prefer it, we can try to implement it in v2.

> 
>>  .irq_set_irqchip_state    = its_irq_set_irqchip_state,
>>  .irq_retrigger    = its_irq_retrigger,
>>  .irq_set_vcpu_affinity    = its_irq_set_vcpu_affinity,
> 
> Thanks,
> 
>     M.


[RFC PATCH v1 3/4] KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

2020-11-22 Thread Shenming Lu
From: Zenghui Yu 

When setting the forwarding path of a VLPI, it is more consistent to
also transfer the pending state from irq->pending_latch to the VPT
(especially in migration, where the pending states of VLPIs are restored
into kvm's vgic first). We currently send "INT+VSYNC" to make a VLPI pending.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index b5fa73c9fd35..cc3ab9cea182 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -418,6 +418,18 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
irq->host_irq   = virq;
atomic_inc(>vlpi_count);
 
+   /* Transfer pending state */
+   ret = irq_set_irqchip_state(irq->host_irq,
+   IRQCHIP_STATE_PENDING,
+   irq->pending_latch);
+   WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+   /*
+* Let it be pruned from ap_list later and don't bother
+* the List Register.
+*/
+   irq->pending_latch = false;
+
 out:
mutex_unlock(>its_lock);
return ret;
-- 
2.23.0



[RFC PATCH v1 0/4] KVM: arm64: Add VLPI migration support on GICv4.1

2020-11-22 Thread Shenming Lu
With GICv4.1, migration is supported for everything except (directly-injected)
VLPIs. The GICv4.1 spec explicitly gives a way to get a VLPI's pending
state (which was crucially missing in GICv4.0). So this patch set makes VLPI
migration possible on GICv4.1.

In order to support VLPI migration, we need to save and restore all
required configuration information and pending states of VLPIs. In fact,
the configuration information of VLPIs is already saved (or will be
reallocated on the dst host...) as part of vgic (kvm) migration.
So we only have to migrate the pending states of VLPIs specially.

Below is the related workflow in migration.

On the save path:
    In migration completion:
        pause all vCPUs
            |
        call each VM state change handler:
            pause other devices (just keep them from sending interrupts;
            the VFIO migration protocol, for example, has already
            realized this [1])
                |
            flush ITS tables into guest RAM
                |
            flush RDIST pending tables (also flush VLPI state here)
                |
            ...
On the resume path:
    load each device's state:
        restore ITS tables (including pending tables) from guest RAM
            |
        for other (PCI) devices (paused), if configured to have VLPIs,
        establish the forwarding paths of their VLPIs (and transfer
        the pending states from kvm's vgic to the VPT here)

Yet TODO:
 - For some reason, such as for VFIO PCI devices, there may be repeated
   resets of the HW VLPI configuration in load_state, resulting in the
   loss of pending state. A very intuitive solution is to retrieve the
   pending state in unset_forwarding (and this should be so regardless
   of migration). But at normal run time this function may be called
   when all devices are running, in which case unmapping the vPE is
   not allowed. It seems to be an almost insoluble bug...
   There are other possible solutions as follows:
   1) avoid unset_forwarding being called from QEMU in resuming (simply
   allocate all needed vectors first), which is more reasonable and
   efficient.
   2) add a new dedicated interface to transfer these pending states to
   HW in GIC VM state change handler corresponding to save_pending_tables.
   ...

Any comments and suggestions are very welcome.

Besides, we have tested this series in VFIO migration, and nothing else
goes wrong (with two issues committed [2][3]).

Links:
[1] vfio: UAPI for migration interface for device state:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/
commit/?id=a8a24f3f6e38103b77cf399c38eb54e1219d00d6
[2] vfio: Move the saving of the config space to the right place in VFIO 
migration:
https://patchwork.ozlabs.org/patch/1400246/
[3] vfio: Set the priority of VFIO VM state change handler explicitly:
https://patchwork.ozlabs.org/patch/1401280/

Shenming Lu (2):
  KVM: arm64: GICv4.1: Try to save hw pending state in
save_pending_tables
  KVM: arm64: GICv4.1: Give a chance to save VLPI's pending state

Zenghui Yu (2):
  irqchip/gic-v4.1: Plumb get_irqchip_state VLPI callback
  KVM: arm64: GICv4.1: Restore VLPI's pending state to physical side

 .../virt/kvm/devices/arm-vgic-its.rst |  2 +-
 arch/arm64/kvm/vgic/vgic-its.c|  6 +-
 arch/arm64/kvm/vgic/vgic-v3.c | 62 +--
 arch/arm64/kvm/vgic/vgic-v4.c | 12 
 drivers/irqchip/irq-gic-v3-its.c  | 38 
 5 files changed, 110 insertions(+), 10 deletions(-)

-- 
2.23.0



[RFC PATCH v1 2/4] KVM: arm64: GICv4.1: Try to save hw pending state in save_pending_tables

2020-11-22 Thread Shenming Lu
After pausing all vCPUs and all devices capable of interrupting, in order
to save the state of all interrupts, besides flushing the pending states
in kvm's vgic we also try to flush the states of VLPIs from the virtual
pending tables into guest RAM. This requires GICv4.1, and we need to
safely unmap the vPEs first.

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v3.c | 62 +++
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 9cdf39a94a63..e1b3aa4b2b12 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -356,6 +358,39 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, 
struct vgic_irq *irq)
return 0;
 }
 
+/*
+ * With GICv4.1, we can get the VLPI's pending state after unmapping
+ * the vPE. The deactivation of the doorbell interrupt will trigger
+ * the unmapping of the associated vPE.
+ */
+static void get_vlpi_state_pre(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   if (!kvm_vgic_global_state.has_gicv4_1)
+   return;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+   }
+}
+
+static void get_vlpi_state_post(struct vgic_dist *dist)
+{
+   struct irq_desc *desc;
+   int i;
+
+   if (!kvm_vgic_global_state.has_gicv4_1)
+   return;
+
+   for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+   desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+   irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+   }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,14 +400,17 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
struct vgic_dist *dist = >arch.vgic;
struct vgic_irq *irq;
gpa_t last_ptr = ~(gpa_t)0;
-   int ret;
+   int ret = 0;
u8 val;
 
+   get_vlpi_state_pre(dist);
+
list_for_each_entry(irq, >lpi_list_head, lpi_list) {
int byte_offset, bit_nr;
struct kvm_vcpu *vcpu;
gpa_t pendbase, ptr;
bool stored;
+   bool is_pending = irq->pending_latch;
 
vcpu = irq->target_vcpu;
if (!vcpu)
@@ -387,24 +425,36 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
if (ptr != last_ptr) {
ret = kvm_read_guest_lock(kvm, ptr, , 1);
if (ret)
-   return ret;
+   goto out;
last_ptr = ptr;
}
 
stored = val & (1U << bit_nr);
-   if (stored == irq->pending_latch)
+
+   /* also flush hw pending state */
+   if (irq->hw) {
+   WARN_RATELIMIT(irq_get_irqchip_state(irq->host_irq,
+   IRQCHIP_STATE_PENDING, 
_pending),
+  "IRQ %d", irq->host_irq);
+   }
+
+   if (stored == is_pending)
continue;
 
-   if (irq->pending_latch)
+   if (is_pending)
val |= 1 << bit_nr;
else
val &= ~(1 << bit_nr);
 
ret = kvm_write_guest_lock(kvm, ptr, , 1);
if (ret)
-   return ret;
+   goto out;
}
-   return 0;
+
+out:
+   get_vlpi_state_post(dist);
+
+   return ret;
 }
 
 /**
-- 
2.23.0



[RFC PATCH v1 4/4] KVM: arm64: GICv4.1: Give a chance to save VLPI's pending state

2020-11-22 Thread Shenming Lu
Before GICv4.1, we had no direct access to a VLPI's pending state,
so we simply failed the save early when encountering any VLPI.

On GICv4.1 we no longer have to return -EACCES unconditionally. So
let's relax the hard-coded check and give the VLPI's pending state a
chance to be saved (while preserving the interfaces).

Signed-off-by: Shenming Lu 
---
 Documentation/virt/kvm/devices/arm-vgic-its.rst | 2 +-
 arch/arm64/kvm/vgic/vgic-its.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst 
b/Documentation/virt/kvm/devices/arm-vgic-its.rst
index 6c304fd2b1b4..d257eddbae29 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-its.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
 -EFAULT  Invalid guest ram access
 -EBUSY   One or more VCPUS are running
 -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-state is not available
+state is not available without GICv4.1
 ===  ==
 
 KVM_DEV_ARM_VGIC_GRP_ITS_REGS
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 40cbaca81333..ec7543a9617c 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, 
struct its_device *device)
/*
 * If an LPI carries the HW bit, this means that this
 * interrupt is controlled by GICv4, and we do not
-* have direct access to that state. Let's simply fail
-* the save operation...
+* have direct access to that state without GICv4.1.
+* Let's simply fail the save operation...
 */
-   if (ite->irq->hw)
+   if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
return -EACCES;
 
ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
-- 
2.23.0



[RFC PATCH v1 1/4] irqchip/gic-v4.1: Plumb get_irqchip_state VLPI callback

2020-11-22 Thread Shenming Lu
From: Zenghui Yu 

Up to now, the irq_get_irqchip_state() callback of its_irq_chip
has been left unimplemented since there was no architectural way to
get a VLPI's pending state before GICv4.1. GICv4.1, however, does
provide one for VLPIs.

With GICv4.1, after unmapping the vPE, which cleans and invalidates
any caching of the VPT, we can get the VLPI's pending state by
peeking at the VPT. So we implement the irq_get_irqchip_state()
callback of its_irq_chip to do it.

Signed-off-by: Zenghui Yu 
Signed-off-by: Shenming Lu 
---
 drivers/irqchip/irq-gic-v3-its.c | 38 
 1 file changed, 38 insertions(+)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 0fec31931e11..287003cacac7 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1695,6 +1695,43 @@ static void its_irq_compose_msi_msg(struct irq_data *d, 
struct msi_msg *msg)
iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg);
 }
 
+static bool its_peek_vpt(struct its_vpe *vpe, irq_hw_number_t hwirq)
+{
+   int mask = hwirq % BITS_PER_BYTE;
+   void *va;
+   u8 *pt;
+
+   va = page_address(vpe->vpt_page);
+   pt = va + hwirq / BITS_PER_BYTE;
+
+   return !!(*pt & (1U << mask));
+}
+
+static int its_irq_get_irqchip_state(struct irq_data *d,
+enum irqchip_irq_state which, bool *val)
+{
+   struct its_device *its_dev = irq_data_get_irq_chip_data(d);
+   struct its_vlpi_map *map = get_vlpi_map(d);
+
+   if (which != IRQCHIP_STATE_PENDING)
+   return -EINVAL;
+
+   /* not intended for physical LPI's pending state */
+   if (!map)
+   return -EINVAL;
+
+   /*
+* In GICv4.1, a VMAPP with {V,Alloc}=={0,1} cleans and invalidates
+* any caching of the VPT associated with the vPEID held in the GIC.
+*/
+   if (!is_v4_1(its_dev->its) || atomic_read(>vpe->vmapp_count))
+   return -EACCES;
+
+   *val = its_peek_vpt(map->vpe, map->vintid);
+
+   return 0;
+}
+
 static int its_irq_set_irqchip_state(struct irq_data *d,
 enum irqchip_irq_state which,
 bool state)
@@ -1975,6 +2012,7 @@ static struct irq_chip its_irq_chip = {
.irq_eoi= irq_chip_eoi_parent,
.irq_set_affinity   = its_set_affinity,
.irq_compose_msi_msg= its_irq_compose_msi_msg,
+   .irq_get_irqchip_state  = its_irq_get_irqchip_state,
.irq_set_irqchip_state  = its_irq_set_irqchip_state,
.irq_retrigger  = its_irq_retrigger,
.irq_set_vcpu_affinity  = its_irq_set_vcpu_affinity,
-- 
2.23.0



Re: [PATCH] irqchip/gic-v4.1: Optimize the wait for the completion of the analysis of the VPT

2020-11-15 Thread Shenming Lu
Hi Marc,

Friendly ping: it has been some time since I sent this patch, updated according
to your last advice...

Besides, we recently found that the MMIO delay on a GICv4.1 system is about 10
times higher than on a GICv4.0 system in kvm-unit-tests (the specific data is
below). By the way, the HiSilicon GICv4.1 has already been implemented and will
be released with our next-generation server, which is almost the only
implementation of GICv4.1 at present.

                    | GICv4.1 emulator | GICv4.0 emulator
mmio_read_user (ns) |            12811 |             1598

After analysis, this is mainly caused by the 10 us delay in
its_wait_vpt_parse_complete() (the above difference is just about 10 us)...

What's your opinion about this?

Thanks,
Shenming

On 2020/9/23 14:35, Shenming Lu wrote:
> Right after a vPE is made resident, the code starts polling the
> GICR_VPENDBASER.Dirty bit until it becomes 0, with delay_us set to 10.
> But in our measurements, parsing the VPT takes only hundreds of
> nanoseconds, or 1~2 microseconds, in most cases. We also measured the
> time from vcpu_load() (inclusive) to __guest_enter() on Kunpeng 920: on
> average it takes 2.55 microseconds (not the first run, and with the VPT
> empty). So a 10-microsecond delay might really hurt performance.
> 
> To avoid this, we can set delay_us to 1, which is more appropriate in
> this situation and still generally applicable. Besides, we can defer the
> execution of its_wait_vpt_parse_complete() (calling it from
> kvm_vgic_flush_hwstate(), which corresponds to the vPE being resident),
> giving the GIC a chance to work in parallel with the CPU on the entry
> path.
> 
> Signed-off-by: Shenming Lu 
> ---
>  arch/arm64/kvm/vgic/vgic-v4.c  | 18 ++
>  arch/arm64/kvm/vgic/vgic.c |  2 ++
>  drivers/irqchip/irq-gic-v3-its.c   | 14 +++---
>  drivers/irqchip/irq-gic-v4.c   | 11 +++
>  include/kvm/arm_vgic.h |  3 +++
>  include/linux/irqchip/arm-gic-v4.h |  4 
>  6 files changed, 49 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
> index b5fa73c9fd35..1d5d2d6894d3 100644
> --- a/arch/arm64/kvm/vgic/vgic-v4.c
> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
> @@ -353,6 +353,24 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
>   return err;
>  }
>  
> +void vgic_v4_wait_vpt(struct kvm_vcpu *vcpu)
> +{
> + struct its_vpe *vpe;
> +
> + if (kvm_vgic_global_state.type == VGIC_V2 || !vgic_supports_direct_msis(vcpu->kvm))
> + return;
> +
> + vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
> +
> + if (vpe->vpt_ready)
> + return;
> +
> + if (its_wait_vpt(vpe))
> + return;
> +
> + vpe->vpt_ready = true;
> +}
> +
>  static struct vgic_its *vgic_get_its(struct kvm *kvm,
>   struct kvm_kernel_irq_routing_entry *irq_entry)
>  {
> diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
> index c3643b7f101b..ed810a80cda2 100644
> --- a/arch/arm64/kvm/vgic/vgic.c
> +++ b/arch/arm64/kvm/vgic/vgic.c
> @@ -915,6 +915,8 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
>  
>   if (can_access_vgic_from_kernel())
>   vgic_restore_state(vcpu);
> +
> + vgic_v4_wait_vpt(vcpu);
>  }
>  
>  void kvm_vgic_load(struct kvm_vcpu *vcpu)
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 548de7538632..b7cbc9bcab9d 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -3803,7 +3803,7 @@ static void its_wait_vpt_parse_complete(void)
>   WARN_ON_ONCE(readq_relaxed_poll_timeout_atomic(vlpi_base + GICR_VPENDBASER,
>  val,
>  !(val & GICR_VPENDBASER_Dirty),
> -10, 500));
> +1, 500));
>  }
>  
>  static void its_vpe_schedule(struct its_vpe *vpe)
> @@ -3837,7 +3837,7 @@ static void its_vpe_schedule(struct its_vpe *vpe)
>   val |= GICR_VPENDBASER_Valid;
>   gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
>  
> - its_wait_vpt_parse_complete();
> + vpe->vpt_ready = false;
>  }
>  
>  static void its_vpe_deschedule(struct its_vpe *vpe)
> @@ -3881,6 +3881,10 @@ static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
>   its_vpe_schedule(vpe);
>   return 0;
>  
> + case WAIT_VPT:
> + its_wait_vpt_parse_complete();
> + return 0;
> +
> 

[PATCH] irqchip/gic-v4.1: Optimize the wait for the completion of the analysis of the VPT

2020-09-23 Thread Shenming Lu
Right after a vPE is made resident, the code starts polling the
GICR_VPENDBASER.Dirty bit until it becomes 0, with delay_us set to 10.
But in our measurements, parsing the VPT takes only hundreds of
nanoseconds, or 1~2 microseconds, in most cases. We also measured the
time from vcpu_load() (inclusive) to __guest_enter() on Kunpeng 920: on
average it takes 2.55 microseconds (not the first run, and with the VPT
empty). So a 10-microsecond delay might really hurt performance.

To avoid this, we can set delay_us to 1, which is more appropriate in
this situation and still generally applicable. Besides, we can defer the
execution of its_wait_vpt_parse_complete() (calling it from
kvm_vgic_flush_hwstate(), which corresponds to the vPE being resident),
giving the GIC a chance to work in parallel with the CPU on the entry
path.
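
Conceptually, the entry path then looks like the sketch below (a rough
illustration assuming the helpers added by this patch, not the literal call
chain):

	/* Rough sketch of the intended ordering (illustrative wrapper only). */
	static void sketch_vcpu_entry(struct kvm_vcpu *vcpu)
	{
		/*
		 * vcpu_load() already ran: vgic_v4_load() made the vPE resident
		 * and the GIC started parsing the VPT; nobody waits here anymore.
		 */

		/* ... other entry work runs in parallel with the VPT parse ... */

		/*
		 * Just before __guest_enter(): kvm_vgic_flush_hwstate() now ends
		 * with vgic_v4_wait_vpt(), so GICR_VPENDBASER.Dirty is only
		 * polled at the last moment.
		 */
		kvm_vgic_flush_hwstate(vcpu);
	}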

Signed-off-by: Shenming Lu 
---
 arch/arm64/kvm/vgic/vgic-v4.c  | 18 ++
 arch/arm64/kvm/vgic/vgic.c |  2 ++
 drivers/irqchip/irq-gic-v3-its.c   | 14 +++---
 drivers/irqchip/irq-gic-v4.c   | 11 +++
 include/kvm/arm_vgic.h |  3 +++
 include/linux/irqchip/arm-gic-v4.h |  4 
 6 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index b5fa73c9fd35..1d5d2d6894d3 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -353,6 +353,24 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
return err;
 }
 
+void vgic_v4_wait_vpt(struct kvm_vcpu *vcpu)
+{
+   struct its_vpe *vpe;
+
+   if (kvm_vgic_global_state.type == VGIC_V2 || !vgic_supports_direct_msis(vcpu->kvm))
+   return;
+
+   vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+   if (vpe->vpt_ready)
+   return;
+
+   if (its_wait_vpt(vpe))
+   return;
+
+   vpe->vpt_ready = true;
+}
+
 static struct vgic_its *vgic_get_its(struct kvm *kvm,
				     struct kvm_kernel_irq_routing_entry *irq_entry)
 {
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index c3643b7f101b..ed810a80cda2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -915,6 +915,8 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
+
+   vgic_v4_wait_vpt(vcpu);
 }
 
 void kvm_vgic_load(struct kvm_vcpu *vcpu)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 548de7538632..b7cbc9bcab9d 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3803,7 +3803,7 @@ static void its_wait_vpt_parse_complete(void)
	WARN_ON_ONCE(readq_relaxed_poll_timeout_atomic(vlpi_base + GICR_VPENDBASER,
	   val,
	   !(val & GICR_VPENDBASER_Dirty),
-  10, 500));
+  1, 500));
 }
 
 static void its_vpe_schedule(struct its_vpe *vpe)
@@ -3837,7 +3837,7 @@ static void its_vpe_schedule(struct its_vpe *vpe)
val |= GICR_VPENDBASER_Valid;
gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
 
-   its_wait_vpt_parse_complete();
+   vpe->vpt_ready = false;
 }
 
 static void its_vpe_deschedule(struct its_vpe *vpe)
@@ -3881,6 +3881,10 @@ static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
its_vpe_schedule(vpe);
return 0;
 
+   case WAIT_VPT:
+   its_wait_vpt_parse_complete();
+   return 0;
+
case DESCHEDULE_VPE:
its_vpe_deschedule(vpe);
return 0;
@@ -4047,7 +4051,7 @@ static void its_vpe_4_1_schedule(struct its_vpe *vpe,
 
gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
 
-   its_wait_vpt_parse_complete();
+   vpe->vpt_ready = false;
 }
 
 static void its_vpe_4_1_deschedule(struct its_vpe *vpe,
@@ -4118,6 +4122,10 @@ static int its_vpe_4_1_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
its_vpe_4_1_schedule(vpe, info);
return 0;
 
+   case WAIT_VPT:
+   its_wait_vpt_parse_complete();
+   return 0;
+
case DESCHEDULE_VPE:
its_vpe_4_1_deschedule(vpe, info);
return 0;
diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c
index 0c18714ae13e..36be42569872 100644
--- a/drivers/irqchip/irq-gic-v4.c
+++ b/drivers/irqchip/irq-gic-v4.c
@@ -258,6 +258,17 @@ int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en)
return ret;
 }
 
+int its_wait_vpt(struct its_vpe *vpe)
+{
+   struct its_cmd_info info = { };
+
+   WARN_ON(preemptible());
+
+   info.cmd_type = WAIT_VPT;
+
+   return its_send_vpe_cmd(vpe, &info);
+}
+
 int its_
