Re: [Qemu-devel] [PATCH v3] vfio : add aer process

2016-08-18 Thread Zhou Jie

ping

On 2016/8/15 10:53, Zhou Jie wrote:

ping

On 2016/8/2 11:57, Zhou Jie wrote:

Between the occurrence of an AER error and the resume, do the following to
protect the device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
v2-v3:
   1. Call init_completion() in vfio_pci_probe.
   2. Call reinit_completion() in vfio_pci_aer_err_detected.
   3. Remove unnecessary brackets.

v1-v2:
   1. Add aer process to vfio driver.

 drivers/vfio/pci/vfio_pci.c | 48
+
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 52 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index d624a52..4c246a1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data,
 struct vfio_pci_device *vdev = device_data;
 unsigned long minsz;

+if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+int ret;
+ret = wait_for_completion_interruptible(
+&vdev->aer_error_completion);
+if (ret)
+return ret;
+}
+
 if (cmd == VFIO_DEVICE_GET_INFO) {
 struct vfio_device_info info;

@@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data,
 if (vdev->reset_works)
 info.flags |= VFIO_DEVICE_FLAGS_RESET;

+info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+if (vdev->aer_error_in_progress)
+info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
 info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
 info.num_irqs = VFIO_PCI_NUM_IRQS;

@@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data,
char __user *buf,

 switch (index) {
 case VFIO_PCI_CONFIG_REGION_INDEX:
+if (vdev->aer_error_in_progress && iswrite) {
+int ret;
+ret = wait_for_completion_interruptible(
+&vdev->aer_error_completion);
+if (ret)
+return ret;
+}
 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

 case VFIO_PCI_ROM_REGION_INDEX:
@@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
 vdev->irq_type = VFIO_PCI_NUM_IRQS;
 mutex_init(&vdev->igate);
 spin_lock_init(&vdev->irqlock);
+init_completion(&vdev->aer_error_completion);

 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
 if (ret) {
@@ -1300,6 +1321,11 @@ static pci_ers_result_t
vfio_pci_aer_err_detected(struct pci_dev *pdev,

 mutex_lock(&vdev->igate);

+vdev->aer_error_in_progress = true;
+reinit_completion(&vdev->aer_error_completion);
+vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+VFIO_IRQ_SET_ACTION_TRIGGER,
+vdev->irq_type, 0, 0, NULL);
 if (vdev->err_trigger)
 eventfd_signal(vdev->err_trigger, 1);

@@ -1310,8 +1336,30 @@ static pci_ers_result_t
vfio_pci_aer_err_detected(struct pci_dev *pdev,
 return PCI_ERS_RESULT_CAN_RECOVER;
 }

+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+struct vfio_pci_device *vdev;
+struct vfio_device *device;
+
+device = vfio_device_get_from_dev(&pdev->dev);
+if (device == NULL)
+return;
+
+vdev = vfio_device_data(device);
+if (vdev == NULL) {
+vfio_device_put(device);
+return;
+}
+
+vdev->aer_error_in_progress = false;
+complete_all(&vdev->aer_error_completion);
+
+vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
 .error_detected = vfio_pci_aer_err_detected,
+.resume = vfio_pci_aer_resume,
 };

 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_private.h
b/drivers/vfio/pci/vfio_pci_private.h
index 2128de8..7430d92 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -91,6 +91,8 @@ struct vfio_pci_device {
 boolhas_vga;
 boolneeds_reset;
 boolnointx;
+boolaer_error_in_progress;
+struct completionaer_error_completion;
 struct pci_saved_state*pci_saved_state;
 intrefcnt;
 struct eventfd_ctx*err_trigger;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 255a211..59b9cf6 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,8 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI(1 << 1)/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform
device */
 #define VFIO_DEVICE_FLAG

Re: [PATCH v3] vfio : add aer process

2016-08-14 Thread Zhou Jie

ping

On 2016/8/2 11:57, Zhou Jie wrote:

Between the occurrence of an AER error and the resume, do the following to
protect the device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
v2-v3:
   1. Call init_completion() in vfio_pci_probe.
   2. Call reinit_completion() in vfio_pci_aer_err_detected.
   3. Remove unnecessary brackets.

v1-v2:
   1. Add aer process to vfio driver.

 drivers/vfio/pci/vfio_pci.c | 48 +
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 52 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index d624a52..4c246a1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_pci_device *vdev = device_data;
unsigned long minsz;

+   if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+   cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret)
+   return ret;
+   }
+
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;

@@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;

+   info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+   if (vdev->aer_error_in_progress)
+   info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;

@@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char 
__user *buf,

switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
+   if (vdev->aer_error_in_progress && iswrite) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret)
+   return ret;
+   }
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

case VFIO_PCI_ROM_REGION_INDEX:
@@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
vdev->irq_type = VFIO_PCI_NUM_IRQS;
mutex_init(&vdev->igate);
spin_lock_init(&vdev->irqlock);
+   init_completion(&vdev->aer_error_completion);

ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
if (ret) {
@@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,

mutex_lock(&vdev->igate);

+   vdev->aer_error_in_progress = true;
+   reinit_completion(&vdev->aer_error_completion);
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   vdev->irq_type, 0, 0, NULL);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger, 1);

@@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
return PCI_ERS_RESULT_CAN_RECOVER;
 }

+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return;
+   }
+
+   vdev->aer_error_in_progress = false;
+   complete_all(&vdev->aer_error_completion);
+
+   vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
+   .resume = vfio_pci_aer_resume,
 };

 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 2128de8..7430d92 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -91,6 +91,8 @@ struct vfio_pci_device {
boolhas_vga;
boolneeds_reset;
boolnointx;
+   boolaer_error_in_progress;
+   struct completion   aer_error_completion;
struct pci_saved_state  *pci_saved_state;
int refcnt;
struct eventfd_ctx  *err_trigger;
diff --git 

[PATCH v3] vfio : add aer process

2016-08-01 Thread Zhou Jie
Between the occurrence of an AER error and the resume, do the following to
protect the device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
v2-v3:
   1. Call init_completion() in vfio_pci_probe.
   2. Call reinit_completion() in vfio_pci_aer_err_detected.
   3. Remove unnecessary brackets.

v1-v2:
   1. Add aer process to vfio driver.

 drivers/vfio/pci/vfio_pci.c | 48 +
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 52 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index d624a52..4c246a1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_pci_device *vdev = device_data;
unsigned long minsz;
 
+   if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+   cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret)
+   return ret;
+   }
+
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;
 
@@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;
 
+   info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+   if (vdev->aer_error_in_progress)
+   info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;
 
@@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char 
__user *buf,
 
switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
+   if (vdev->aer_error_in_progress && iswrite) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret)
+   return ret;
+   }
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
 
case VFIO_PCI_ROM_REGION_INDEX:
@@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
vdev->irq_type = VFIO_PCI_NUM_IRQS;
mutex_init(&vdev->igate);
spin_lock_init(&vdev->irqlock);
+   init_completion(&vdev->aer_error_completion);
 
ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
if (ret) {
@@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
 
mutex_lock(&vdev->igate);
 
+   vdev->aer_error_in_progress = true;
+   reinit_completion(&vdev->aer_error_completion);
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   vdev->irq_type, 0, 0, NULL);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger, 1);
 
@@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return;
+   }
+
+   vdev->aer_error_in_progress = false;
+   complete_all(&vdev->aer_error_completion);
+
+   vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
+   .resume = vfio_pci_aer_resume,
 };
 
 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 2128de8..7430d92 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -91,6 +91,8 @@ struct vfio_pci_device {
boolhas_vga;
boolneeds_reset;
boolnointx;
+   boolaer_error_in_progress;
+   struct completion   aer_error_completion;
struct pci_saved_state  *pci_saved_state;
int refcnt;
struct eventfd_ctx  *err_trigger;
diff --git a/include/uapi/linux/vfio.h b/includ

Re: [Qemu-devel] [PATCH v2 2/2] vfio : add aer process

2016-08-01 Thread Zhou Jie

Hi, Alex


Clearly this has only been tested for a single instance of an AER error
event and resume per device.  Are the things you're intending to block
actually blocked for subsequent events?  Note how complete_all() fills
the done field to let all current and future waiters go through and
nowhere is there a call to reinit_completion() to drain that path.
Thanks,

Alex


Do you mean this condition?

For device 1:
error1 occurs  error1 resumes
 error2 occurs  error2 resumes
 error3 occurs  error3 resumes

In the current code, I call complete_all() when error1 resumes.
This will unblock the device
while error2 and error3 are still being processed.


So walk me through how this works.  On vfio_pci_open() we call
init_completion(), which sets aer_error_completion.done equal to zero
(BTW, a user can open the device file descriptor multiple times, so
there's already a bug here).

I will call init_completion() in vfio_pci_probe.


Let's assume that an error occurs and the
user stalls a single access on wait_for_completion_interruptible().
The bulk of this function happens here:

static inline long __sched
do_wait_for_common(struct completion *x,
   long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);

__add_wait_queue_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
if (!x->done)
return timeout;
}
x->done--;
return timeout ?: 1;
}

So it waits within that do{}while loop for a completion, interruption,
or timeout.  Then we have:

void complete_all(struct completion *x)
{
unsigned long flags;

spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
__wake_up_locked(&x->wait, TASK_NORMAL, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}

So aer_error_completion.done gets incremented to let a couple billion
completion waiters through...  Show me how another call to
wait_for_completion_interruptible() will ever block again within our
lifetime when the actual wait of do_wait_for_common() is only entered
when 'done' count is equal to zero.  This seems to be why
reinit_completion() exists, but it's not used here.  Thanks,

Alex


I will call reinit_completion() in vfio_pci_aer_err_detected when
an aer error is detected.
Thank you very much.

Sincerely
ZhouJie





Re: [Qemu-devel] [PATCH v2 2/2] vfio : add aer process

2016-07-31 Thread Zhou Jie

Hi, Alex

On 2016/7/30 1:12, Alex Williamson wrote:

On Tue, 19 Jul 2016 15:32:43 +0800
Zhou Jie  wrote:


From: Chen Fan 

Between the occurrence of an AER error and the resume, do the following to
protect the device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
 drivers/vfio/pci/vfio_pci.c | 30 ++
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 34 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 2d12b03..dd96b60 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data)
return -ENODEV;

mutex_lock(&driver_lock);
+   init_completion(&vdev->aer_error_completion);

if (!vdev->refcnt) {
ret = vfio_pci_enable(vdev);
@@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_pci_device *vdev = device_data;
unsigned long minsz;

+   if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+   cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret) {
+   return ret;
+   }


No brackets necessary.


+   }
+
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;

@@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;

+   info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+   if (vdev->aer_error_in_progress)
+   info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;

@@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user 
*buf,

switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
+   if (vdev->aer_error_in_progress && iswrite) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret) {
+   return ret;
+   }
+   }
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

case VFIO_PCI_ROM_REGION_INDEX:
@@ -1226,6 +1249,10 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,

mutex_lock(&vdev->igate);

+   vdev->aer_error_in_progress = true;
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   vdev->irq_type, 0, 0, NULL);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger, 1);

@@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev)
}

mutex_lock(&vdev->igate);
+
+   vdev->aer_error_in_progress = false;
+   complete_all(&vdev->aer_error_completion);
if (vdev->resume_trigger)
eventfd_signal(vdev->resume_trigger, 1);

diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 80d4ddd..2f151f5 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -84,6 +84,8 @@ struct vfio_pci_device {
boolhas_vga;
boolneeds_reset;
boolnointx;
+   boolaer_error_in_progress;
+   struct completion   aer_error_completion;
struct pci_saved_state  *pci_saved_state;
int refcnt;
struct eventfd_ctx  *err_trigger;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 34ab138..276ce50 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,8 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)  /* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)  /* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3) /* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_AERPROCESS  (1 << 4)   /* support aer error progress 
*/
+#define VFIO_DEVICE_FLAGS_INAERPROCESS  (1 << 5)/* status in aer error 
progress */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };


Clearly this has only been tested for a single instance of an AER error
event and 

Re: [PATCH v2 1/2] vfio : resume notifier

2016-07-31 Thread Zhou Jie

Hi, Alex

On 2016/7/30 1:11, Alex Williamson wrote:

On Tue, 19 Jul 2016 15:52:45 +0800
Zhou Jie  wrote:


From: Chen Fan 


An empty commit log is unacceptable for all but the most trivial
patches.

There's also no sign-off on this patch.

Sorry. I should note it.


I also don't know why we need this since you previously found that for
QEMU, ordering of error versus resume notifications is not guaranteed,
which is why I thought we went with a status flag within the struct
vfio_device_info.  I'm not adding an interrupt to the user that has
no users.  This was not part of the most recent discussion we had about
this, so I'm lost why this patch exists. Thanks,

Alex

I will remove the resume interrupt.

Sincerely
ZhouJie




Re: [Qemu-devel] [PATCH v2 0/2] vfio: add aer process

2016-07-25 Thread Zhou Jie

ping

On 2016/7/19 16:13, Zhou Jie wrote:

From: Chen Fan 

v1-v2:
   1. Add aer process to vfio driver.

Chen Fan (2):
  vfio : add aer process
  vfio : resume notifier

 drivers/vfio/pci/vfio_pci.c | 58 -
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 
 drivers/vfio/pci/vfio_pci_private.h |  3 ++
 include/uapi/linux/vfio.h   |  3 ++
 4 files changed, 81 insertions(+), 1 deletion(-)






[PATCH v2 0/2] vfio: add aer process

2016-07-19 Thread Zhou Jie
From: Chen Fan 

v1-v2:
   1. Add aer process to vfio driver.

Chen Fan (2):
  vfio : add aer process
  vfio : resume notifier

 drivers/vfio/pci/vfio_pci.c | 58 -
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 
 drivers/vfio/pci/vfio_pci_private.h |  3 ++
 include/uapi/linux/vfio.h   |  3 ++
 4 files changed, 81 insertions(+), 1 deletion(-)

-- 
1.8.3.1





[PATCH v2 1/2] vfio : resume notifier

2016-07-19 Thread Zhou Jie
From: Chen Fan 

---
 drivers/vfio/pci/vfio_pci.c | 28 +++-
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 ++
 drivers/vfio/pci/vfio_pci_private.h |  1 +
 include/uapi/linux/vfio.h   |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 188b1ff..2d12b03 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -363,7 +363,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device 
*vdev, int irq_type)
 
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
}
-   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
+   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX ||
+  irq_type == VFIO_PCI_RESUME_IRQ_INDEX) {
if (pci_is_pcie(vdev->pdev))
return 1;
} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
@@ -731,6 +732,7 @@ static long vfio_pci_ioctl(void *device_data,
case VFIO_PCI_REQ_IRQ_INDEX:
break;
case VFIO_PCI_ERR_IRQ_INDEX:
+   case VFIO_PCI_RESUME_IRQ_INDEX:
if (pci_is_pcie(vdev->pdev))
break;
/* pass thru to return error */
@@ -1234,8 +1236,32 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return;
+   }
+
+   mutex_lock(&vdev->igate);
+   if (vdev->resume_trigger)
+   eventfd_signal(vdev->resume_trigger, 1);
+
+   mutex_unlock(&vdev->igate);
+   vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
+   .resume = vfio_pci_aer_resume,
 };
 
 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 15ecfc9..3a01a62 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -617,6 +617,16 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_device 
*vdev,
return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
 }
 
+static int vfio_pci_set_resume_trigger(struct vfio_pci_device *vdev,
+   unsigned index, unsigned start,
+   unsigned count, uint32_t flags, void *data)
+{
+   if (index != VFIO_PCI_RESUME_IRQ_INDEX)
+   return -EINVAL;
+
+   return vfio_pci_set_ctx_trigger_single(&vdev->resume_trigger, flags, 
data);
+}
+
 static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
@@ -676,6 +686,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, 
uint32_t flags,
break;
}
break;
+   case VFIO_PCI_RESUME_IRQ_INDEX:
+   switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+   case VFIO_IRQ_SET_ACTION_TRIGGER:
+   if (pci_is_pcie(vdev->pdev))
+   func = vfio_pci_set_resume_trigger;
+   break;
+   }
+   break;
}
 
if (!func)
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 016c14a..80d4ddd 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -88,6 +88,7 @@ struct vfio_pci_device {
int refcnt;
struct eventfd_ctx  *err_trigger;
struct eventfd_ctx  *req_trigger;
+   struct eventfd_ctx  *resume_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 255a211..34ab138 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -433,6 +433,7 @@ enum {
VFIO_PCI_MSIX_IRQ_INDEX,
VFIO_PCI_ERR_IRQ_INDEX,
VFIO_PCI_REQ_IRQ_INDEX,
+   VFIO_PCI_RESUME_IRQ_INDEX,
VFIO_PCI_NUM_IRQS
 };
 
-- 
1.8.3.1





[PATCH v2 0/2] vfio: add aer process

2016-07-19 Thread Zhou Jie
From: Chen Fan 

v1-v2:
   1. Add aer process to vfio driver.

Chen Fan (1):
  vfio : add aer process

root (1):
  vfio : resume notifier

 drivers/vfio/pci/vfio_pci.c | 58 -
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 
 drivers/vfio/pci/vfio_pci_private.h |  3 ++
 include/uapi/linux/vfio.h   |  3 ++
 4 files changed, 81 insertions(+), 1 deletion(-)

-- 
1.8.3.1





[PATCH v2 2/2] vfio : add aer process

2016-07-19 Thread Zhou Jie
From: Chen Fan 

Between the occurrence of an AER error and the resume, do the following to
protect the device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
 drivers/vfio/pci/vfio_pci.c | 30 ++
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 34 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 2d12b03..dd96b60 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data)
return -ENODEV;
 
mutex_lock(&driver_lock);
+   init_completion(&vdev->aer_error_completion);
 
if (!vdev->refcnt) {
ret = vfio_pci_enable(vdev);
@@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_pci_device *vdev = device_data;
unsigned long minsz;
 
+   if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+   cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret) {
+   return ret;
+   }
+   }
+
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;
 
@@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;
 
+   info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+   if (vdev->aer_error_in_progress)
+   info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;
 
@@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user 
*buf,
 
switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
+   if (vdev->aer_error_in_progress && iswrite) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret) {
+   return ret;
+   }
+   }
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
 
case VFIO_PCI_ROM_REGION_INDEX:
@@ -1226,6 +1249,10 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
 
mutex_lock(&vdev->igate);
 
+   vdev->aer_error_in_progress = true;
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   vdev->irq_type, 0, 0, NULL);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger, 1);
 
@@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev)
}
 
mutex_lock(&vdev->igate);
+
+   vdev->aer_error_in_progress = false;
+   complete_all(&vdev->aer_error_completion);
if (vdev->resume_trigger)
eventfd_signal(vdev->resume_trigger, 1);
 
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 80d4ddd..2f151f5 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -84,6 +84,8 @@ struct vfio_pci_device {
boolhas_vga;
boolneeds_reset;
boolnointx;
+   boolaer_error_in_progress;
+   struct completion   aer_error_completion;
struct pci_saved_state  *pci_saved_state;
int refcnt;
struct eventfd_ctx  *err_trigger;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 34ab138..276ce50 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,8 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)   /* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_AERPROCESS  (1 << 4) /* support aer error progress */
+#define VFIO_DEVICE_FLAGS_INAERPROCESS  (1 << 5)/* status in aer error 
progress */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };
-- 
1.8.3.1





[PATCH v2 1/2] vfio : resume notifier

2016-07-19 Thread Zhou Jie
From: root 

---
 drivers/vfio/pci/vfio_pci.c | 28 +++-
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 ++
 drivers/vfio/pci/vfio_pci_private.h |  1 +
 include/uapi/linux/vfio.h   |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 188b1ff..2d12b03 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -363,7 +363,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device 
*vdev, int irq_type)
 
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
}
-   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
+   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX ||
+  irq_type == VFIO_PCI_RESUME_IRQ_INDEX) {
if (pci_is_pcie(vdev->pdev))
return 1;
} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
@@ -731,6 +732,7 @@ static long vfio_pci_ioctl(void *device_data,
case VFIO_PCI_REQ_IRQ_INDEX:
break;
case VFIO_PCI_ERR_IRQ_INDEX:
+   case VFIO_PCI_RESUME_IRQ_INDEX:
if (pci_is_pcie(vdev->pdev))
break;
/* pass thru to return error */
@@ -1234,8 +1236,32 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return;
+   }
+
+   mutex_lock(&vdev->igate);
+   if (vdev->resume_trigger)
+   eventfd_signal(vdev->resume_trigger, 1);
+
+   mutex_unlock(&vdev->igate);
+   vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
+   .resume = vfio_pci_aer_resume,
 };
 
 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 15ecfc9..3a01a62 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -617,6 +617,16 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_device 
*vdev,
return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
 }
 
+static int vfio_pci_set_resume_trigger(struct vfio_pci_device *vdev,
+   unsigned index, unsigned start,
+   unsigned count, uint32_t flags, void *data)
+{
+   if (index != VFIO_PCI_RESUME_IRQ_INDEX)
+   return -EINVAL;
+
+   return vfio_pci_set_ctx_trigger_single(&vdev->resume_trigger, flags, 
data);
+}
+
 static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
@@ -676,6 +686,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, 
uint32_t flags,
break;
}
break;
+   case VFIO_PCI_RESUME_IRQ_INDEX:
+   switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+   case VFIO_IRQ_SET_ACTION_TRIGGER:
+   if (pci_is_pcie(vdev->pdev))
+   func = vfio_pci_set_resume_trigger;
+   break;
+   }
+   break;
}
 
if (!func)
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 016c14a..80d4ddd 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -88,6 +88,7 @@ struct vfio_pci_device {
int refcnt;
struct eventfd_ctx  *err_trigger;
struct eventfd_ctx  *req_trigger;
+   struct eventfd_ctx  *resume_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 255a211..34ab138 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -433,6 +433,7 @@ enum {
VFIO_PCI_MSIX_IRQ_INDEX,
VFIO_PCI_ERR_IRQ_INDEX,
VFIO_PCI_REQ_IRQ_INDEX,
+   VFIO_PCI_RESUME_IRQ_INDEX,
VFIO_PCI_NUM_IRQS
 };
 
-- 
1.8.3.1





Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-06-11 Thread Zhou Jie

Hi, Alex

On 2016/6/9 23:39, Alexander Duyck wrote:

On Thu, Jun 9, 2016 at 3:14 AM, Zhou Jie  wrote:

TO Alex
TO Michael

   In your solution you add an emulated PCI bridge to act as
   a bridge between directly assigned devices and the host bridge.
   Do you mean to put all directly assigned devices behind
   one emulated PCI bridge?
   If yes, this may bring some problems.

   We are writing a patchset to support the AER feature in QEMU.
   When assigning a vfio device with AER enabled, we must check whether
   the device supports a host bus reset (i.e. hot reset) as this may be
   used by the guest OS in order to recover the device from an AER
   error.
   QEMU must therefore have the ability to perform a physical
   host bus reset using the existing vfio APIs in response to a virtual
   bus reset in the VM.
   A physical bus reset affects all of the devices on the host bus.
   Therefore all physical devices affected by a bus reset must be
   configured on the same virtual bus in the VM.
   And no devices unaffected by the bus reset may
   be configured on the same virtual bus.

   http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg02989.html

Sincerely,
Zhou Jie


That makes sense, but I don't think you have to worry much about this
at this point at least on my side as this was mostly just theory and I
haven't had a chance to put any of it into practice as of yet.

My idea has been evolving on this for a while.  One thought I had is
that we may want to have something like an emulated IOMMU and if
possible we would want to split it up over multiple domains just so we
can be certain that the virtual interfaces and the physical ones
existed in separate domains.  In regards to your concerns perhaps what
we could do is put each assigned device into its own domain to prevent
them from affecting each other.  To that end we could probably break
things up so that each device effectively lives in its own PCIe slot
in the emulated system.  Then when we start a migration of the guest
the assigned device domains would then have to be tracked for unmap
and sync calls when the direction is from the device.

I will keep your concerns in mind in the future when I get some time
to look at exploring this solution further.

- Alex


I am thinking about how migration of a passthrough device would work in practice.

In your solution, you use a vendor-specific configuration space to
negotiate with the guest.
If you put each assigned device into its own domain,
how can QEMU negotiate with the guest?
Would you add the vendor-specific configuration space to every PCI bus
to which a passthrough device is assigned?

Sincerely
Zhou Jie




Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-06-09 Thread Zhou Jie

TO Alex
TO Michael

   In your solution you add an emulated PCI bridge to act as
   a bridge between directly assigned devices and the host bridge.
   Do you mean to put all directly assigned devices behind
   one emulated PCI bridge?
   If yes, this may bring some problems.

   We are writing a patchset to support the AER feature in QEMU.
   When assigning a vfio device with AER enabled, we must check whether
   the device supports a host bus reset (i.e. hot reset) as this may be
   used by the guest OS in order to recover the device from an AER
   error.
   QEMU must therefore have the ability to perform a physical
   host bus reset using the existing vfio APIs in response to a virtual
   bus reset in the VM.
   A physical bus reset affects all of the devices on the host bus.
   Therefore all physical devices affected by a bus reset must be
   configured on the same virtual bus in the VM.
   And no devices unaffected by the bus reset may
   be configured on the same virtual bus.

   http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg02989.html

Sincerely,
Zhou Jie

On 2016/6/7 0:04, Alex Duyck wrote:

On Mon, Jun 6, 2016 at 2:18 AM, Zhou Jie  wrote:

Hi Alex,


On 2016/1/6 0:18, Alexander Duyck wrote:


On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin  wrote:


On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:


The two mechanisms referenced above would likely require coordination with
QEMU and as such are open to discussion.  I haven't attempted to address
them as I am not sure there is a consensus as of yet.  My personal
preference would be to add a vendor-specific configuration block to the
emulated pci-bridge interfaces created by QEMU that would allow us to
essentially extend shpc to support guest live migration with pass-through
devices.



shpc?



That is kind of what I was thinking.  We basically need some mechanism
to allow for the host to ask the device to quiesce.  It has been
proposed to possibly even look at something like an ACPI interface
since I know ACPI is used by QEMU to manage hot-plug in the standard
case.

- Alex




Start by using hot-unplug for this!

Really use your patch guest side, and write host side
to allow starting migration with the device, but
defer completing it.



Yeah, I'm fully on board with this idea, though I'm not really working
on this right now since last I knew the folks on this thread from
Intel were working on it.  My patches were mostly meant to be a nudge
in this direction so that we could get away from the driver specific
code.



I have seen your email about live migration.

I summarize the idea you proposed as follows.
1. Extend swiotlb to allow for page-dirtying functionality.
2. Use a PCI Express capability to implement a PCI bridge that acts
   as a bridge between directly assigned devices and the host bridge.
3. Use an ACPI event or extend the shpc driver to support device pause.
Is that right?

Will you implement the patches for live migration?


That is pretty much the heart of the proposal I had.  I submitted an
RFC as a proof-of-concept for item 1 in the hopes that someone else
might try tackling items 2 and 3 but I haven't seen any updates since
then.  The trick is to find a way to make it so that item 1 doesn't
slow down standard SWIOTLB when you are not migrating a VM. If nothing
else we would probably just need to add a static key that we could
default to false unless there is a PCI bridge indicating we are
starting a migration.

I haven't had time to really work on this though. In addition I am not
that familiar with QEMU and the internals of live migration so pieces
2 and 3 would take me some additional time to work on.

- Alex


.






Re: Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-06-06 Thread Zhou Jie

Hi Alex,

On 2016/1/6 0:18, Alexander Duyck wrote:

On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin  wrote:

On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:

The two mechanisms referenced above would likely require coordination with
QEMU and as such are open to discussion.  I haven't attempted to address
them as I am not sure there is a consensus as of yet.  My personal
preference would be to add a vendor-specific configuration block to the
emulated pci-bridge interfaces created by QEMU that would allow us to
essentially extend shpc to support guest live migration with pass-through
devices.


shpc?


That is kind of what I was thinking.  We basically need some mechanism
to allow for the host to ask the device to quiesce.  It has been
proposed to possibly even look at something like an ACPI interface
since I know ACPI is used by QEMU to manage hot-plug in the standard
case.

- Alex



Start by using hot-unplug for this!

Really use your patch guest side, and write host side
to allow starting migration with the device, but
defer completing it.


Yeah, I'm fully on board with this idea, though I'm not really working
on this right now since last I knew the folks on this thread from
Intel were working on it.  My patches were mostly meant to be a nudge
in this direction so that we could get away from the driver specific
code.


I have seen your email about live migration.

I summarize the idea you proposed as follows.
1. Extend swiotlb to allow for page-dirtying functionality.
2. Use a PCI Express capability to implement a PCI bridge that acts
   as a bridge between directly assigned devices and the host bridge.
3. Use an ACPI event or extend the shpc driver to support device pause.
Is that right?

Will you implement the patches for live migration?

Sincerely,
Zhou Jie





So

1.- host tells guest to start tracking memory writes
2.- guest acks
3.- migration starts
4.- most memory is migrated
5.- host tells guest to eject device
6.- guest acks
7.- stop vm and migrate rest of state



Sounds about right.  The only way this differs from what I see as the
final solution for this is that instead of fully ejecting the device
in step 5 the driver would instead pause the device and give the host
something like 10 seconds to stop the VM and resume with the same
device connected if it is available.  We would probably also need to
look at a solution that would force the device to be ejected or abort
prior to starting the migration if it doesn't give us the ack in step
2.


It will already be a win since hot unplug after migration starts and
most memory has been migrated is better than hot unplug before migration
starts.


Right.  Generally the longer the VF can be maintained as a part of the
guest the longer the network performance is improved versus using a
purely virtual interface.


Then measure downtime and profile. Then we can look at ways
to quiesce device faster which really means step 5 is replaced
with "host tells guest to quiesce device and dirty (or just unmap!)
all memory mapped for write by device".


Step 5 will be the spot where we really need to start modifying
drivers.  Specifically we probably need to go through and clean-up
things so that we can reduce as many of the delays in the driver
suspend/resume path as possible.  I suspect there is quite a bit that
can be done there that would probably also improve boot and shutdown
times since those are also impacted by the devices.

- Alex



.