Re: [Qemu-devel] [PATCH v3] vfio : add aer process

2016-08-18 Thread Zhou Jie

ping

On 2016/8/15 10:53, Zhou Jie wrote:

ping

On 2016/8/2 11:57, Zhou Jie wrote:

During aer err occurs and resume do following to
protect device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
v2-v3:
   1. Call init_completion() in vfio_pci_probe.
   2. Call reinit_completion() in vfio_pci_aer_err_detected.
   3. Remove unnecessary brackets.

v1-v2:
   1. Add aer process to vfio driver.

 drivers/vfio/pci/vfio_pci.c | 48
+
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 52 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index d624a52..4c246a1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data,
 struct vfio_pci_device *vdev = device_data;
 unsigned long minsz;

+if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+int ret;
+ret = wait_for_completion_interruptible(
+&vdev->aer_error_completion);
+if (ret)
+return ret;
+}
+
 if (cmd == VFIO_DEVICE_GET_INFO) {
 struct vfio_device_info info;

@@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data,
 if (vdev->reset_works)
 info.flags |= VFIO_DEVICE_FLAGS_RESET;

+info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+if (vdev->aer_error_in_progress)
+info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
 info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
 info.num_irqs = VFIO_PCI_NUM_IRQS;

@@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data,
char __user *buf,

 switch (index) {
 case VFIO_PCI_CONFIG_REGION_INDEX:
+if (vdev->aer_error_in_progress && iswrite) {
+int ret;
+ret = wait_for_completion_interruptible(
+&vdev->aer_error_completion);
+if (ret)
+return ret;
+}
 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

 case VFIO_PCI_ROM_REGION_INDEX:
@@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
 vdev->irq_type = VFIO_PCI_NUM_IRQS;
 mutex_init(&vdev->igate);
 spin_lock_init(&vdev->irqlock);
+init_completion(&vdev->aer_error_completion);

 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
 if (ret) {
@@ -1300,6 +1321,11 @@ static pci_ers_result_t
vfio_pci_aer_err_detected(struct pci_dev *pdev,

 mutex_lock(&vdev->igate);

+vdev->aer_error_in_progress = true;
+reinit_completion(&vdev->aer_error_completion);
+vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+VFIO_IRQ_SET_ACTION_TRIGGER,
+vdev->irq_type, 0, 0, NULL);
 if (vdev->err_trigger)
 eventfd_signal(vdev->err_trigger, 1);

@@ -1310,8 +1336,30 @@ static pci_ers_result_t
vfio_pci_aer_err_detected(struct pci_dev *pdev,
 return PCI_ERS_RESULT_CAN_RECOVER;
 }

+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+struct vfio_pci_device *vdev;
+struct vfio_device *device;
+
+device = vfio_device_get_from_dev(&pdev->dev);
+if (device == NULL)
+return;
+
+vdev = vfio_device_data(device);
+if (vdev == NULL) {
+vfio_device_put(device);
+return;
+}
+
+vdev->aer_error_in_progress = false;
+complete_all(&vdev->aer_error_completion);
+
+vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
 .error_detected = vfio_pci_aer_err_detected,
+.resume = vfio_pci_aer_resume,
 };

 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_private.h
b/drivers/vfio/pci/vfio_pci_private.h
index 2128de8..7430d92 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -91,6 +91,8 @@ struct vfio_pci_device {
 boolhas_vga;
 boolneeds_reset;
 boolnointx;
+boolaer_error_in_progress;
+struct completionaer_error_completion;
 struct pci_saved_state*pci_saved_state;
 intrefcnt;
 struct eventfd_ctx*err_trigger;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 255a211..59b9cf6 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,8 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI(1 << 1)/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform
device */
 #define VFIO_DEVICE_FLAG

Re: [Qemu-devel] [PATCH v3] vfio : add aer process

2016-08-14 Thread Zhou Jie

ping

On 2016/8/2 11:57, Zhou Jie wrote:

During aer err occurs and resume do following to
protect device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
v2-v3:
   1. Call init_completion() in vfio_pci_probe.
   2. Call reinit_completion() in vfio_pci_aer_err_detected.
   3. Remove unnecessary brackets.

v1-v2:
   1. Add aer process to vfio driver.

 drivers/vfio/pci/vfio_pci.c | 48 +
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 52 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index d624a52..4c246a1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_pci_device *vdev = device_data;
unsigned long minsz;

+   if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+   cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret)
+   return ret;
+   }
+
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;

@@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;

+   info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+   if (vdev->aer_error_in_progress)
+   info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;

@@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char 
__user *buf,

switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
+   if (vdev->aer_error_in_progress && iswrite) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret)
+   return ret;
+   }
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

case VFIO_PCI_ROM_REGION_INDEX:
@@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
vdev->irq_type = VFIO_PCI_NUM_IRQS;
mutex_init(&vdev->igate);
spin_lock_init(&vdev->irqlock);
+   init_completion(&vdev->aer_error_completion);

ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
if (ret) {
@@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,

mutex_lock(&vdev->igate);

+   vdev->aer_error_in_progress = true;
+   reinit_completion(&vdev->aer_error_completion);
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   vdev->irq_type, 0, 0, NULL);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger, 1);

@@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
return PCI_ERS_RESULT_CAN_RECOVER;
 }

+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return;
+   }
+
+   vdev->aer_error_in_progress = false;
+   complete_all(&vdev->aer_error_completion);
+
+   vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
+   .resume = vfio_pci_aer_resume,
 };

 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 2128de8..7430d92 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -91,6 +91,8 @@ struct vfio_pci_device {
boolhas_vga;
boolneeds_reset;
boolnointx;
+   boolaer_error_in_progress;
+   struct completion   aer_error_completion;
struct pci_saved_state  *pci_saved_state;
int refcnt;
struct eventfd_ctx  *err_trigger;
diff --git 

[Qemu-devel] [PATCH v3] vfio : add aer process

2016-08-01 Thread Zhou Jie
During aer err occurs and resume do following to
protect device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
v2-v3:
   1. Call init_completion() in vfio_pci_probe.
   2. Call reinit_completion() in vfio_pci_aer_err_detected.
   3. Remove unnecessary brackets.

v1-v2:
   1. Add aer process to vfio driver.

 drivers/vfio/pci/vfio_pci.c | 48 +
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 52 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index d624a52..4c246a1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_pci_device *vdev = device_data;
unsigned long minsz;
 
+   if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+   cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret)
+   return ret;
+   }
+
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;
 
@@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;
 
+   info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+   if (vdev->aer_error_in_progress)
+   info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;
 
@@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char 
__user *buf,
 
switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
+   if (vdev->aer_error_in_progress && iswrite) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret)
+   return ret;
+   }
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
 
case VFIO_PCI_ROM_REGION_INDEX:
@@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id)
vdev->irq_type = VFIO_PCI_NUM_IRQS;
mutex_init(&vdev->igate);
spin_lock_init(&vdev->irqlock);
+   init_completion(&vdev->aer_error_completion);
 
ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
if (ret) {
@@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
 
mutex_lock(&vdev->igate);
 
+   vdev->aer_error_in_progress = true;
+   reinit_completion(&vdev->aer_error_completion);
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   vdev->irq_type, 0, 0, NULL);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger, 1);
 
@@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return;
+   }
+
+   vdev->aer_error_in_progress = false;
+   complete_all(&vdev->aer_error_completion);
+
+   vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
+   .resume = vfio_pci_aer_resume,
 };
 
 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 2128de8..7430d92 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -91,6 +91,8 @@ struct vfio_pci_device {
boolhas_vga;
boolneeds_reset;
boolnointx;
+   boolaer_error_in_progress;
+   struct completion   aer_error_completion;
struct pci_saved_state  *pci_saved_state;
int refcnt;
struct eventfd_ctx  *err_trigger;
diff --git a/include/uapi/linux/vfio.h b/includ

Re: [Qemu-devel] [PATCH v2 2/2] vfio : add aer process

2016-08-01 Thread Zhou Jie

Hi, Alex


Clearly this has only been tested for a single instance of an AER error
event and resume per device.  Are the things you're intending to block
actually blocked for subsequent events?  Note how complete_all() fills
the done field to let all current and future waiters go through and
nowhere is there a call to reinit_completion() to drain that path.
Thanks,

Alex


Do you mean this condition?

For device 1:
error1 occurs  error1 resumes
 error2 occurs  error2 resumes
 error3 occurs  error3 resumes

In current code, I do complete_all() when error1 resumes.
And this will unblock the device
when error2 and error3 are still be processed.


So walk me through how this works.  On vfio_pci_open() we call
init_completion(), which sets aer_error_completion.done equal to zero
(BTW, a user can open the device file descriptor multiple times, so
there's already a bug here).

I will call init_completion() in vfio_pci_probe.


Let's assume that an error occurs and the
user stalls a single access on wait_for_completion_interruptible().
The bulk of this function happens here:

static inline long __sched
do_wait_for_common(struct completion *x,
   long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);

__add_wait_queue_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
if (!x->done)
return timeout;
}
x->done--;
return timeout ?: 1;
}

So it waits within that do{}while loop for a completion, interruption,
or timeout.  Then we have:

void complete_all(struct completion *x)
{
unsigned long flags;

spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
__wake_up_locked(&x->wait, TASK_NORMAL, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}

So aer_error_completion.done gets incremented to let a couple billion
completion waiters through...  Show me how another call to
wait_for_completion_interruptible() will ever block again within our
lifetime when the actual wait of do_wait_for_common() is only entered
when 'done' count is equal to zero.  This seems to be why
reinit_completion() exists, but it's not used here.  Thanks,

Alex


I will call reinit_completion() in vfio_pci_aer_err_detected when
an aer error is detected.
Thank you very much.

Sincerely
ZhouJie






Re: [Qemu-devel] [PATCH v2 2/2] vfio : add aer process

2016-07-31 Thread Zhou Jie

Hi, Alex

On 2016/7/30 1:12, Alex Williamson wrote:

On Tue, 19 Jul 2016 15:32:43 +0800
Zhou Jie  wrote:


From: Chen Fan 

During aer err occurs and resume do following to
protect device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
 drivers/vfio/pci/vfio_pci.c | 30 ++
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 34 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 2d12b03..dd96b60 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data)
return -ENODEV;

mutex_lock(&driver_lock);
+   init_completion(&vdev->aer_error_completion);

if (!vdev->refcnt) {
ret = vfio_pci_enable(vdev);
@@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_pci_device *vdev = device_data;
unsigned long minsz;

+   if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+   cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret) {
+   return ret;
+   }


No brackets necessary.


+   }
+
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;

@@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;

+   info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+   if (vdev->aer_error_in_progress)
+   info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;

@@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user 
*buf,

switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
+   if (vdev->aer_error_in_progress && iswrite) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret) {
+   return ret;
+   }
+   }
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

case VFIO_PCI_ROM_REGION_INDEX:
@@ -1226,6 +1249,10 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,

mutex_lock(&vdev->igate);

+   vdev->aer_error_in_progress = true;
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   vdev->irq_type, 0, 0, NULL);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger, 1);

@@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev)
}

mutex_lock(&vdev->igate);
+
+   vdev->aer_error_in_progress = false;
+   complete_all(&vdev->aer_error_completion);
if (vdev->resume_trigger)
eventfd_signal(vdev->resume_trigger, 1);

diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 80d4ddd..2f151f5 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -84,6 +84,8 @@ struct vfio_pci_device {
boolhas_vga;
boolneeds_reset;
boolnointx;
+   boolaer_error_in_progress;
+   struct completion   aer_error_completion;
struct pci_saved_state  *pci_saved_state;
int refcnt;
struct eventfd_ctx  *err_trigger;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 34ab138..276ce50 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,8 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)  /* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)  /* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3) /* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_AERPROCESS  (1 << 4)   /* support aer error progress 
*/
+#define VFIO_DEVICE_FLAGS_INAERPROCESS  (1 << 5)/* status in aer error 
progress */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };


Clearly this has only been tested for a single instance of an AER error
event and 

Re: [Qemu-devel] [PATCH v2 1/2] vfio : resume notifier

2016-07-31 Thread Zhou Jie

Hi, Alex

On 2016/7/30 1:11, Alex Williamson wrote:

On Tue, 19 Jul 2016 15:52:45 +0800
Zhou Jie  wrote:


From: Chen Fan 


An empty commit log is unacceptable for all but the most trivial
patches.

There's also no sign-off on this patch.

Sorry. I should note it.


I also don't know why we need this since you previously found that for
QEMU, ordering of error versus resume notifications is not guaranteed,
which is why I thought we went with a status flag within the struct
vfio_device_info.  I'm not adding an interrupt to the user that has
no users.  This was not part of the most recent discussion we had about
this, so I'm lost why this patch exists. Thanks,

Alex

I will remove the resume interrupt.

Sincerely
ZhouJie





Re: [Qemu-devel] [PATCH v2 0/2] vfio: add aer process

2016-07-25 Thread Zhou Jie

ping

On 2016/7/19 16:13, Zhou Jie wrote:

From: Chen Fan 

v1-v2:
   1. Add aer process to vfio driver.

Chen Fan (2):
  vfio : add aer process
  vfio : resume notifier

 drivers/vfio/pci/vfio_pci.c | 58 -
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 
 drivers/vfio/pci/vfio_pci_private.h |  3 ++
 include/uapi/linux/vfio.h   |  3 ++
 4 files changed, 81 insertions(+), 1 deletion(-)







[Qemu-devel] [PATCH v2 0/2] vfio: add aer process

2016-07-19 Thread Zhou Jie
From: Chen Fan 

v1-v2:
   1. Add aer process to vfio driver.

Chen Fan (2):
  vfio : add aer process
  vfio : resume notifier

 drivers/vfio/pci/vfio_pci.c | 58 -
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 
 drivers/vfio/pci/vfio_pci_private.h |  3 ++
 include/uapi/linux/vfio.h   |  3 ++
 4 files changed, 81 insertions(+), 1 deletion(-)

-- 
1.8.3.1






[Qemu-devel] [PATCH v2 1/2] vfio : resume notifier

2016-07-19 Thread Zhou Jie
From: Chen Fan 

---
 drivers/vfio/pci/vfio_pci.c | 28 +++-
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 ++
 drivers/vfio/pci/vfio_pci_private.h |  1 +
 include/uapi/linux/vfio.h   |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 188b1ff..2d12b03 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -363,7 +363,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device 
*vdev, int irq_type)
 
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
}
-   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
+   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX ||
+  irq_type == VFIO_PCI_RESUME_IRQ_INDEX) {
if (pci_is_pcie(vdev->pdev))
return 1;
} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
@@ -731,6 +732,7 @@ static long vfio_pci_ioctl(void *device_data,
case VFIO_PCI_REQ_IRQ_INDEX:
break;
case VFIO_PCI_ERR_IRQ_INDEX:
+   case VFIO_PCI_RESUME_IRQ_INDEX:
if (pci_is_pcie(vdev->pdev))
break;
/* pass thru to return error */
@@ -1234,8 +1236,32 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return;
+   }
+
+   mutex_lock(&vdev->igate);
+   if (vdev->resume_trigger)
+   eventfd_signal(vdev->resume_trigger, 1);
+
+   mutex_unlock(&vdev->igate);
+   vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
+   .resume = vfio_pci_aer_resume,
 };
 
 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 15ecfc9..3a01a62 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -617,6 +617,16 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_device 
*vdev,
return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
 }
 
+static int vfio_pci_set_resume_trigger(struct vfio_pci_device *vdev,
+   unsigned index, unsigned start,
+   unsigned count, uint32_t flags, void *data)
+{
+   if (index != VFIO_PCI_RESUME_IRQ_INDEX)
+   return -EINVAL;
+
+   return vfio_pci_set_ctx_trigger_single(&vdev->resume_trigger, flags, 
data);
+}
+
 static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
@@ -676,6 +686,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, 
uint32_t flags,
break;
}
break;
+   case VFIO_PCI_RESUME_IRQ_INDEX:
+   switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+   case VFIO_IRQ_SET_ACTION_TRIGGER:
+   if (pci_is_pcie(vdev->pdev))
+   func = vfio_pci_set_resume_trigger;
+   break;
+   }
+   break;
}
 
if (!func)
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 016c14a..80d4ddd 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -88,6 +88,7 @@ struct vfio_pci_device {
int refcnt;
struct eventfd_ctx  *err_trigger;
struct eventfd_ctx  *req_trigger;
+   struct eventfd_ctx  *resume_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 255a211..34ab138 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -433,6 +433,7 @@ enum {
VFIO_PCI_MSIX_IRQ_INDEX,
VFIO_PCI_ERR_IRQ_INDEX,
VFIO_PCI_REQ_IRQ_INDEX,
+   VFIO_PCI_RESUME_IRQ_INDEX,
VFIO_PCI_NUM_IRQS
 };
 
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 10/11] vfio: Add waiting for host aer error progress

2016-07-19 Thread Zhou Jie
From: Chen Fan 

For supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

It indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. But in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we poll the vfio_device_info to assure host reset
completely.
In qemu, the aer recovery process:
  1. Detect support for aer error progress
 If host vfio driver does not support for aer error progress,
 directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Wait for host aer error progress
 Poll the vfio_device_info, If it is still in aer error progress after
 some timeout, we would abort the guest directed bus reset
 altogether and unplug of the device to prevent it from further
 interacting with the VM.
  4. Reset bus.

Signed-off-by: Chen Fan 
Signed-off-by: Zhou Jie 
---
 hw/vfio/pci.c  | 51 +-
 hw/vfio/pci.h  |  1 +
 linux-headers/linux/vfio.h |  4 
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0e42786..777245c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -35,6 +35,12 @@
 
 #define MSIX_CAP_LENGTH 12
 
+/*
+ * Timeout for waiting host aer error process, it is 3 seconds.
+ * For hardware bus reset 3 seconds will be enough.
+ */
+#define PCI_AER_PROCESS_TIMEOUT 300
+
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 
@@ -1913,6 +1919,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice 
*vdev, Error **errp)
 VFIOGroup *group;
 int ret, i, devfn, range_limit;
 
+if (!(vdev->vbasedev.flags & VFIO_DEVICE_FLAGS_AERPROCESS)) {
+error_setg(errp, "vfio: Cannot enable AER for device %s,"
+   " host vfio driver does not support for"
+   " aer error progress",
+   vdev->vbasedev.name);
+return;
+}
+
 ret = vfio_get_hot_reset_info(vdev, &info);
 if (ret) {
 error_setg(errp, "vfio: Cannot enable AER for device %s,"
@@ -2679,6 +2693,11 @@ static void vfio_err_notifier_handler(void *opaque)
 msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
  PCI_ERR_ROOT_CMD_NONFATAL_EN;
 
+if (isfatal) {
+PCIDevice *dev_0 = pci_get_function_0(dev);
+VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+vdev_0->pci_aer_error_signaled = true;
+}
 pcie_aer_msg(dev, &msg);
 return;
 }
@@ -3163,6 +3182,19 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_bars_exit(vdev);
 }
 
+static int vfio_aer_error_is_in_process(VFIOPCIDevice *vdev)
+{
+struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+int ret;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_INFO, &dev_info);
+if (ret) {
+error_report("vfio: error getting device info: %m");
+return ret;
+}
+return dev_info.flags & VFIO_DEVICE_FLAGS_INAERPROCESS ? 1 : 0;
+}
+
 static void vfio_pci_reset(DeviceState *dev)
 {
 PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
@@ -3176,7 +3208,24 @@ static void vfio_pci_reset(DeviceState *dev)
 if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
  PCI_BRIDGE_CTL_BUS_RESET)) {
 if (pci_get_function_0(pdev) == pdev) {
-vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+if (!vdev->pci_aer_error_signaled) {
+vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+} else {
+int i;
+for (i = 0; i < 1000; i++) {
+if (!vfio_aer_error_is_in_process(vdev)) {
+break;
+}
+g_usleep(PCI_AER_PROCESS_TIMEOUT / 1000);
+}
+
+if (i == 1000) {
+qdev_unplug(&vdev->pdev.qdev, NULL);
+} else {
+vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+}
+vdev->pci_aer_error_signaled = false;
+}
 }
 return;
 }
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index ed14322..c9e0202 100644
--- a/hw/vfio/pci.h
+++ b/hw/vf

[Qemu-devel] [PATCH v9 04/11] vfio: refine function vfio_pci_host_match

2016-07-19 Thread Zhou Jie
From: Chen Fan 

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 11c895c..21fd801 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2060,14 +2060,27 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
 vfio_intx_enable(vdev);
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+if (strlen(name) != 12 ||
+sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+   &addr->bus, &addr->slot, &addr->function) != 4) {
+return -EINVAL;
+}
+
+return 0;
+}
+
 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
 {
-char tmp[13];
+PCIHostDeviceAddress tmp;
 
-sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
-addr->bus, addr->slot, addr->function);
+if (vfio_pci_name_to_addr(name, &tmp)) {
+return false;
+}
 
-return (strcmp(tmp, name) == 0);
+return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+tmp.slot == addr->slot && tmp.function == addr->function);
 }
 
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 09/11] vfio-pci: pass the aer error to guest

2016-07-19 Thread Zhou Jie
From: Chen Fan 

when the vfio device encounters an uncorrectable error in host,
the vfio_pci driver will signal the eventfd registered by this
vfio device, resulting in the qemu eventfd handler getting
invoked.

this patch is to pass the error to guest and let the guest driver
recover from the error.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 60 +--
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0521652..0e42786 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2627,18 +2627,66 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
 static void vfio_err_notifier_handler(void *opaque)
 {
 VFIOPCIDevice *vdev = opaque;
+PCIDevice *dev = &vdev->pdev;
+Error *local_err = NULL;
+PCIEAERMsg msg = {
+.severity = 0,
+.source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+};
 
 if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
 return;
 }
 
+
+if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+goto stop;
+}
+
+/*
+ * in case the real hardware configuration has been changed,
+ * here we should recheck the bus reset capability.
+ */
+vfio_check_hot_bus_reset(vdev, &local_err);
+if (local_err) {
+error_report_err(local_err);
+goto stop;
+}
+
+/*
+ * we should read the error details from the real hardware
+ * configuration spaces, here we only need to do is signaling
+ * to guest an uncorrectable error has occurred.
+ */
+if (dev->exp.aer_cap) {
+uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+uint32_t uncor_status;
+bool isfatal;
+
+uncor_status = vfio_pci_read_config(dev,
+   dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+
+/*
+ * if the error is not emitted by this device, we can
+ * just ignore it.
+ */
+if (!(uncor_status & ~0UL)) {
+return;
+}
+
+isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+
+msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
+ PCI_ERR_ROOT_CMD_NONFATAL_EN;
+
+pcie_aer_msg(dev, &msg);
+return;
+}
+
+stop:
 /*
- * TBD. Retrieve the error details and decide what action
- * needs to be taken. One of the actions could be to pass
- * the error to the guest and have the guest driver recover
- * from the error. This requires that PCIe capabilities be
- * exposed to the guest. For now, we just terminate the
- * guest to contain the error.
+ * If the aer capability is not exposed to the guest. we just
+ * terminate the guest to contain the error.
  */
 
 error_report("%s(%s) Unrecoverable error detected. Please collect any data 
possible and then kill the guest", __func__, vdev->vbasedev.name);
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 11/11] vfio: add 'aer' property to expose aercap

2016-07-19 Thread Zhou Jie
From: Chen Fan 

add 'aer' property to let user able to decide whether expose
the aer capability. by default we should disable aer feature,
because it needs configuration restrictions.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 777245c..fdc27e6 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3305,6 +3305,8 @@ static Property vfio_pci_dev_properties[] = {
 DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
sub_device_id, PCI_ANY_ID),
 DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
+DEFINE_PROP_BIT("aer", VFIOPCIDevice, features,
+VFIO_FEATURE_ENABLE_AER_BIT, false),
 /*
  * TODO - support passed fds... is this necessary?
  * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 03/11] vfio: add aer support for vfio device

2016-07-19 Thread Zhou Jie
From: Chen Fan 

Calling pcie_aer_init to initilize aer related registers for
vfio device, then reload physical related registers to expose
device capability.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 75 ++-
 hw/vfio/pci.h |  3 +++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6a6160b..11c895c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1854,6 +1854,66 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t 
pos)
 return 0;
 }
 
+static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
+  int pos, uint16_t size)
+{
+PCIDevice *pdev = &vdev->pdev;
+PCIDevice *dev_iter;
+uint8_t type;
+uint32_t errcap;
+
+if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR,
+cap_ver, pos, size);
+return 0;
+}
+
+dev_iter = pci_bridge_get_device(pdev->bus);
+if (!dev_iter) {
+goto error;
+}
+
+while (dev_iter) {
+if (!pci_is_express(dev_iter)) {
+goto error;
+}
+
+type = pcie_cap_get_type(dev_iter);
+if ((type != PCI_EXP_TYPE_ROOT_PORT &&
+ type != PCI_EXP_TYPE_UPSTREAM &&
+ type != PCI_EXP_TYPE_DOWNSTREAM)) {
+goto error;
+}
+
+if (!dev_iter->exp.aer_cap) {
+goto error;
+}
+
+dev_iter = pci_bridge_get_device(dev_iter->bus);
+}
+
+errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4);
+/*
+ * The ability to record multiple headers is depending on
+ * the state of the Multiple Header Recording Capable bit and
+ * enabled by the Multiple Header Recording Enable bit.
+ */
+if ((errcap & PCI_ERR_CAP_MHRC) &&
+(errcap & PCI_ERR_CAP_MHRE)) {
+pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
+} else {
+pdev->exp.aer_log.log_max = 0;
+}
+
+pcie_cap_deverr_init(pdev);
+return pcie_aer_init(pdev, pos, size);
+
+error:
+error_report("vfio: Unable to enable AER for device %s, parent bus "
+ "does not support AER signaling", vdev->vbasedev.name);
+return -1;
+}
+
 static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 {
 PCIDevice *pdev = &vdev->pdev;
@@ -1861,6 +1921,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 uint16_t cap_id, next, size;
 uint8_t cap_ver;
 uint8_t *config;
+int ret = 0;
 
 /* Only add extended caps if we have them and the guest can see them */
 if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) ||
@@ -1914,6 +1975,9 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
PCI_EXT_CAP_NEXT_MASK);
 
 switch (cap_id) {
+case PCI_EXT_CAP_ID_ERR:
+ret = vfio_setup_aer(vdev, cap_ver, next, size);
+break;
 case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
 trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
 break;
@@ -1921,6 +1985,9 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 pcie_add_capability(pdev, cap_id, cap_ver, next, size);
 }
 
+if (ret) {
+goto out;
+}
 }
 
 /* Cleanup chain head ID if necessary */
@@ -1928,8 +1995,9 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
 }
 
+out:
 g_free(config);
-return 0;
+return ret;
 }
 
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
@@ -2698,6 +2766,11 @@ static int vfio_initfn(PCIDevice *pdev)
 goto out_teardown;
 }
 
+if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+!pdev->exp.aer_cap) {
+goto out_teardown;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 7d482d9..5483044 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"
 #include "qemu/queue.h"
@@ -132,6 +133,8 @@ typedef struct VFIOPCIDevice {
 #define VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT 2
 #define VFIO_FEATURE_ENABLE_IGD_OPREGION \
 (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT)
+#define VFIO_FEATURE_ENABLE_AER_BIT 3
+#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT)
 int32_t bootindex;
 uint32_t igd_gms;
 uint8_t pm_cap;
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 05/11] vfio: add check host bus reset is support or not

2016-07-19 Thread Zhou Jie
From: Chen Fan 

When assigning a vfio device with AER enabled, we must check whether
the device supports a host bus reset (ie. hot reset) as this may be
used by the guest OS in order to recover the device from an AER
error.  QEMU must therefore have the ability to perform a physical
host bus reset using the existing vfio APIs in response to a virtual
bus reset in the VM.  A physical bus reset affects all of the devices
on the host bus, therefore we place a few simplifying configuration
restriction on the VM:

 - All physical devices affected by a bus reset must be assigned to
   the VM with AER enabled on each and be configured on the same
   virtual bus in the VM.

 - No devices unaffected by the bus reset, be they physical, emulated,
   or paravirtual may be configured on the same virtual bus as a
   device supporting AER signaling through vfio.

In other words users wishing to enable AER on a multifunction device
need to assign all functions of the device to the same virtual bus
and enable AER support for each device.  The easiest way to
accomplish this is to identity map the physical functions to virtual
functions with multifunction enabled on the virtual device.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 278 +-
 hw/vfio/pci.h |   1 +
 2 files changed, 256 insertions(+), 23 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 21fd801..242c1e4 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1693,6 +1693,42 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, 
uint8_t pos)
 }
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+if (strlen(name) != 12 ||
+sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+   &addr->bus, &addr->slot, &addr->function) != 4) {
+return -EINVAL;
+}
+
+return 0;
+}
+
+static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
+{
+PCIHostDeviceAddress tmp;
+
+if (vfio_pci_name_to_addr(name, &tmp)) {
+return false;
+}
+
+return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+tmp.slot == addr->slot && tmp.function == addr->function);
+}
+
+static bool vfio_pci_host_match_slot(PCIHostDeviceAddress *addr,
+ const char *name)
+{
+PCIHostDeviceAddress tmp;
+
+if (vfio_pci_name_to_addr(name, &tmp)) {
+return false;
+}
+
+return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+tmp.slot == addr->slot);
+}
+
 /*
  * return negative with errno, return 0 on success.
  * if success, the point of ret_info fill with the affected device reset info.
@@ -1854,6 +1890,200 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, 
uint8_t pos)
 return 0;
 }
 
+static int vfio_device_range_limit(PCIBus *bus)
+{
+PCIDevice *br;
+
+br = pci_bridge_get_device(bus);
+if (!br ||
+!pci_is_express(br) ||
+!(br->exp.exp_cap) ||
+pcie_cap_is_arifwd_enabled(br)) {
+return 255;
+}
+
+return 8;
+}
+
+static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
+{
+PCIBus *bus = vdev->pdev.bus;
+struct vfio_pci_hot_reset_info *info = NULL;
+struct vfio_pci_dependent_device *devices;
+VFIOGroup *group;
+int ret, i, devfn, range_limit;
+
+ret = vfio_get_hot_reset_info(vdev, &info);
+if (ret) {
+error_setg(errp, "vfio: Cannot enable AER for device %s,"
+   " device does not support hot reset.",
+   vdev->vbasedev.name);
+return;
+}
+
+/* List all affected devices by bus reset */
+devices = &info->devices[0];
+
+/* Verify that we have all the groups required */
+for (i = 0; i < info->count; i++) {
+PCIHostDeviceAddress host;
+VFIOPCIDevice *tmp;
+VFIODevice *vbasedev_iter;
+bool found = false;
+
+host.domain = devices[i].segment;
+host.bus = devices[i].bus;
+host.slot = PCI_SLOT(devices[i].devfn);
+host.function = PCI_FUNC(devices[i].devfn);
+
+/* Skip the current device */
+if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
+continue;
+}
+
+/* Ensure we own the group of the affected device */
+QLIST_FOREACH(group, &vfio_group_list, next) {
+if (group->groupid == devices[i].group_id) {
+break;
+}
+}
+
+if (!group) {
+error_setg(errp, "vfio: Cannot enable AER for device %s, "
+   "depends on group %d which is not owned.",
+   vdev->vbasedev.name, devices[i].group_id);
+goto out;
+}
+
+/* Ensure affected devices for reset on the same bus */
+QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+continue;
+}
+  

[Qemu-devel] [PATCH v9 08/11] vfio: vote the function 0 to do host bus reset when aer occurred

2016-07-19 Thread Zhou Jie
From: Chen Fan 

Due to all devices assigned to VM on the same way as host if enable
aer, so we can easily do the hot reset by selecting the function #0
to do the hot reset.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 14 ++
 hw/vfio/pci.h |  1 +
 2 files changed, 15 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 8bcb26b..0521652 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1924,6 +1924,8 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, 
Error **errp)
 /* List all affected devices by bus reset */
 devices = &info->devices[0];
 
+vdev->single_depend_dev = (info->count == 1);
+
 /* Verify that we have all the groups required */
 for (i = 0; i < info->count; i++) {
 PCIHostDeviceAddress host;
@@ -3120,6 +3122,18 @@ static void vfio_pci_reset(DeviceState *dev)
 
 trace_vfio_pci_reset(vdev->vbasedev.name);
 
+if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+PCIDevice *br = pci_bridge_get_device(pdev->bus);
+
+if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
+ PCI_BRIDGE_CTL_BUS_RESET)) {
+if (pci_get_function_0(pdev) == pdev) {
+vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+}
+return;
+}
+}
+
 vfio_pci_pre_reset(vdev);
 
 if (vdev->resetfn && !vdev->resetfn(vdev)) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index e2faffb..ed14322 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -147,6 +147,7 @@ typedef struct VFIOPCIDevice {
 bool no_kvm_intx;
 bool no_kvm_msi;
 bool no_kvm_msix;
+bool single_depend_dev;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 07/11] vfio: add check aer functionality for hotplug device

2016-07-19 Thread Zhou Jie
From: Chen Fan 

when function 0 is hot-added, we can check the vfio device
whether support hot bus reset.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 242c1e4..8bcb26b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3149,6 +3149,19 @@ post_reset:
 vfio_pci_post_reset(vdev);
 }
 
+static void vfio_pci_is_valid(PCIDevice *dev, Error **errp)
+{
+VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, dev);
+Error *local_err = NULL;
+
+if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+vfio_check_hot_bus_reset(vdev, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+}
+}
+}
+
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = PCI_DEVICE(obj);
@@ -3206,6 +3219,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, 
void *data)
 set_bit(DEVICE_CATEGORY_MISC, dc->categories);
 pdc->init = vfio_initfn;
 pdc->exit = vfio_exitfn;
+pdc->is_valid_func = vfio_pci_is_valid;
 pdc->config_read = vfio_pci_read_config;
 pdc->config_write = vfio_pci_write_config;
 pdc->is_express = 1; /* We might be */
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 00/11] vfio-pci: pass the aer error to guest

2016-07-19 Thread Zhou Jie
From: Chen Fan 

v8-v9:
   1. Don't use resume notification.
  Host vfio driver will process aer.
  Poll the vfio_device_info to assure host reset completely.

v7-v8:
   1. Use bitmap to record error and resume notification.

v6-v7:
   1. Stall any access to the device until resume is signaled.
  Do guest directed bus reset after receive resume notification.
  If don't get the resume notification after some timeout unplug the device.
   2. fix the patches 11/12 as Alex sugguestion.

v5-v6:
   1. register resume handler both qemu and kernel to ensure the
  reset in order.
   2. fix the patches 6/12, 7/12 as Alex and MST sugguestion.

v4-v5:
   1. add back the common function 0 hotplug code in pci core.
   2. fix a sporadic device stuck on D3 problem when doing aer recovery.
   3. fix patches 5/12 ~ 9/12 as Alex sugguestion.

v3-v4:
   1. rebase patchset to fit latest master branch.
   2. modifying patches 5/10 ~ 8/10 as Alex sugguestion(Thanks).

v2-v3:
   1. fix patch 4/9, 5/9 as Alex sugguestion.
   2. patches 5/9 ~ 8/9 are made to force limiting that all vfio functions
  are combined in the same way as on the host.

v1-v2:
   1. limit all devices on same bus in guest are on same bus in host in patch 

For now, for vfio pci passthough devices, when qemu receives
an error from host aer report, it just terminate the guest.
But usually user want to know what error occurred. So these patches add
aer capability support for vfio devices, and pass the error to guest.
Then let guest driver to recover from the error.

notes: this series patches enable aer support single/multi-function,
for multi-function, require all the function of the slot assigned to
VM and on the same slot.


Chen Fan (11):
  vfio: extract vfio_get_hot_reset_info as a single function
  vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  vfio: add aer support for vfio device
  vfio: refine function vfio_pci_host_match
  vfio: add check host bus reset is support or not
  pci: add a pci_function_is_valid callback to check function if valid
  vfio: add check aer functionality for hotplug device
  vfio: vote the function 0 to do host bus reset when aer occurred
  vfio-pci: pass the aer error to guest
  vfio: Add waiting for host aer error progress
  vfio: add 'aer' property to expose aercap

 hw/pci/pci.c   |  32 +++
 hw/vfio/pci.c  | 620 -
 hw/vfio/pci.h  |   6 +
 include/hw/pci/pci.h   |   1 +
 linux-headers/linux/vfio.h |   4 +
 5 files changed, 597 insertions(+), 66 deletions(-)

-- 
1.8.3.1






[Qemu-devel] [PATCH v9 02/11] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset

2016-07-19 Thread Zhou Jie
From: Chen Fan 

Squeeze out vfio_pci_do_hot_reset to do host bus reset when AER recovery.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 75 +++
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c1c7d31..6a6160b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1738,6 +1738,48 @@ error:
 return ret;
 }
 
+static int vfio_pci_do_hot_reset(VFIOPCIDevice *vdev,
+ struct vfio_pci_hot_reset_info *info)
+{
+VFIOGroup *group;
+struct vfio_pci_hot_reset *reset;
+int32_t *fds;
+int ret, i, count;
+struct vfio_pci_dependent_device *devices;
+
+/* Determine how many group fds need to be passed */
+count = 0;
+devices = &info->devices[0];
+QLIST_FOREACH(group, &vfio_group_list, next) {
+for (i = 0; i < info->count; i++) {
+if (group->groupid == devices[i].group_id) {
+count++;
+break;
+}
+}
+}
+
+reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
+reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
+fds = &reset->group_fds[0];
+
+/* Fill in group fds */
+QLIST_FOREACH(group, &vfio_group_list, next) {
+for (i = 0; i < info->count; i++) {
+if (group->groupid == devices[i].group_id) {
+fds[reset->count++] = group->fd;
+break;
+}
+}
+}
+
+/* Bus reset! */
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+g_free(reset);
+
+return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
 PCIDevice *pdev = &vdev->pdev;
@@ -1965,9 +2007,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
 VFIOGroup *group;
 struct vfio_pci_hot_reset_info *info = NULL;
 struct vfio_pci_dependent_device *devices;
-struct vfio_pci_hot_reset *reset;
-int32_t *fds;
-int ret, i, count;
+int ret, i;
 bool multi = false;
 
 trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
@@ -2045,34 +2085,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
 goto out_single;
 }
 
-/* Determine how many group fds need to be passed */
-count = 0;
-QLIST_FOREACH(group, &vfio_group_list, next) {
-for (i = 0; i < info->count; i++) {
-if (group->groupid == devices[i].group_id) {
-count++;
-break;
-}
-}
-}
-
-reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
-reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
-fds = &reset->group_fds[0];
-
-/* Fill in group fds */
-QLIST_FOREACH(group, &vfio_group_list, next) {
-for (i = 0; i < info->count; i++) {
-if (group->groupid == devices[i].group_id) {
-fds[reset->count++] = group->fd;
-break;
-}
-}
-}
-
-/* Bus reset! */
-ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
-g_free(reset);
+ret = vfio_pci_do_hot_reset(vdev, info);
 
 trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
 ret ? "%m" : "Success");
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 06/11] pci: add a pci_function_is_valid callback to check function if valid

2016-07-19 Thread Zhou Jie
From: Chen Fan 

PCI hotplug requires that function 0 is added last to close the
slot.  Since vfio supporting AER, we require that the VM bus
contains the same set of devices as the host bus to support AER,
we can perform an AER validation test whenever a function 0 in
the VM is hot-added.

Signed-off-by: Chen Fan 
---
 hw/pci/pci.c | 32 
 include/hw/pci/pci.h |  1 +
 2 files changed, 33 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 149994b..b292b42 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1940,6 +1940,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, 
uint8_t devfn)
 return bus->devices[devfn];
 }
 
+static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque)
+{
+PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d);
+Error **errp = opaque;
+
+if (*errp) {
+return;
+}
+
+if (!pc->is_valid_func) {
+return;
+}
+
+pc->is_valid_func(d, errp);
+}
+
 static void pci_qdev_realize(DeviceState *qdev, Error **errp)
 {
 PCIDevice *pci_dev = (PCIDevice *)qdev;
@@ -1982,6 +1998,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error 
**errp)
 pci_qdev_unrealize(DEVICE(pci_dev), NULL);
 return;
 }
+
+/*
+ *  If the function number is 0, indicate the closure of the slot.
+ *  then we get the chance to check all functions on same device
+ *  if valid.
+ */
+if (DEVICE(pci_dev)->hotplugged &&
+pci_get_function_0(pci_dev) == pci_dev) {
+pci_for_each_device(bus, pci_bus_num(bus),
+pci_function_is_valid, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+pci_qdev_unrealize(DEVICE(pci_dev), NULL);
+return;
+}
+}
 }
 
 static void pci_default_realize(PCIDevice *dev, Error **errp)
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 9ed1624..68b6848 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -198,6 +198,7 @@ typedef struct PCIDeviceClass {
 
 void (*realize)(PCIDevice *dev, Error **errp);
 int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
+void (*is_valid_func)(PCIDevice *dev, Error **errp);
 PCIUnregisterFunc *exit;
 PCIConfigReadFunc *config_read;
 PCIConfigWriteFunc *config_write;
-- 
1.8.3.1






[Qemu-devel] [PATCH v9 01/11] vfio: extract vfio_get_hot_reset_info as a single function

2016-07-19 Thread Zhou Jie
From: Chen Fan 

The function is used to get affected devices by bus reset.
So here extract it, and can used for aer soon.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 66 +++
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 44783c5..c1c7d31 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1693,6 +1693,51 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, 
uint8_t pos)
 }
 }
 
+/*
+ * return negative with errno, return 0 on success.
+ * if success, the point of ret_info fill with the affected device reset info.
+ *
+ */
+static int vfio_get_hot_reset_info(VFIOPCIDevice *vdev,
+   struct vfio_pci_hot_reset_info **ret_info)
+{
+struct vfio_pci_hot_reset_info *info;
+int ret, count;
+
+*ret_info = NULL;
+
+info = g_malloc0(sizeof(*info));
+info->argsz = sizeof(*info);
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+if (ret && errno != ENOSPC) {
+ret = -errno;
+goto error;
+}
+
+count = info->count;
+
+info = g_realloc(info, sizeof(*info) +
+ (count * sizeof(struct vfio_pci_dependent_device)));
+info->argsz = sizeof(*info) +
+  (count * sizeof(struct vfio_pci_dependent_device));
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+if (ret) {
+ret = -errno;
+error_report("vfio: hot reset info failed: %m");
+goto error;
+}
+
+*ret_info = info;
+info = NULL;
+
+return 0;
+error:
+g_free(info);
+return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
 PCIDevice *pdev = &vdev->pdev;
@@ -1918,7 +1963,7 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress 
*addr, const char *name)
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
 VFIOGroup *group;
-struct vfio_pci_hot_reset_info *info;
+struct vfio_pci_hot_reset_info *info = NULL;
 struct vfio_pci_dependent_device *devices;
 struct vfio_pci_hot_reset *reset;
 int32_t *fds;
@@ -1930,12 +1975,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
 vfio_pci_pre_reset(vdev);
 vdev->vbasedev.needs_reset = false;
 
-info = g_malloc0(sizeof(*info));
-info->argsz = sizeof(*info);
-
-ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-if (ret && errno != ENOSPC) {
-ret = -errno;
+ret = vfio_get_hot_reset_info(vdev, &info);
+if (ret) {
 if (!vdev->has_pm_reset) {
 error_report("vfio: Cannot reset device %s, "
  "no available reset mechanism.", vdev->vbasedev.name);
@@ -1943,18 +1984,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
 goto out_single;
 }
 
-count = info->count;
-info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
-info->argsz = sizeof(*info) + (count * sizeof(*devices));
 devices = &info->devices[0];
-
-ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-if (ret) {
-ret = -errno;
-error_report("vfio: hot reset info failed: %m");
-goto out_single;
-}
-
 trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
 
 /* Verify that we have all the groups required */
-- 
1.8.3.1






[Qemu-devel] [PATCH v2 2/2] vfio : add aer process

2016-07-19 Thread Zhou Jie
From: Chen Fan 

During aer err occurs and resume do following to
protect device from being accessed.
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

Signed-off-by: Zhou Jie 
---
 drivers/vfio/pci/vfio_pci.c | 30 ++
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 include/uapi/linux/vfio.h   |  2 ++
 3 files changed, 34 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 2d12b03..dd96b60 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data)
return -ENODEV;
 
mutex_lock(&driver_lock);
+   init_completion(&vdev->aer_error_completion);
 
if (!vdev->refcnt) {
ret = vfio_pci_enable(vdev);
@@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data,
struct vfio_pci_device *vdev = device_data;
unsigned long minsz;
 
+   if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
+   cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret) {
+   return ret;
+   }
+   }
+
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;
 
@@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;
 
+   info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
+   if (vdev->aer_error_in_progress)
+   info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;
 
@@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user 
*buf,
 
switch (index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
+   if (vdev->aer_error_in_progress && iswrite) {
+   int ret;
+   ret = wait_for_completion_interruptible(
+   &vdev->aer_error_completion);
+   if (ret) {
+   return ret;
+   }
+   }
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
 
case VFIO_PCI_ROM_REGION_INDEX:
@@ -1226,6 +1249,10 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
 
mutex_lock(&vdev->igate);
 
+   vdev->aer_error_in_progress = true;
+   vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+   VFIO_IRQ_SET_ACTION_TRIGGER,
+   vdev->irq_type, 0, 0, NULL);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger, 1);
 
@@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev)
}
 
mutex_lock(&vdev->igate);
+
+   vdev->aer_error_in_progress = false;
+   complete_all(&vdev->aer_error_completion);
if (vdev->resume_trigger)
eventfd_signal(vdev->resume_trigger, 1);
 
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 80d4ddd..2f151f5 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -84,6 +84,8 @@ struct vfio_pci_device {
boolhas_vga;
boolneeds_reset;
boolnointx;
+   boolaer_error_in_progress;
+   struct completion   aer_error_completion;
struct pci_saved_state  *pci_saved_state;
int refcnt;
struct eventfd_ctx  *err_trigger;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 34ab138..276ce50 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,8 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)   /* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_AERPROCESS  (1 << 4) /* support aer error progress */
+#define VFIO_DEVICE_FLAGS_INAERPROCESS  (1 << 5)/* status in aer error 
progress */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };
-- 
1.8.3.1






[Qemu-devel] [PATCH v2 1/2] vfio : resume notifier

2016-07-19 Thread Zhou Jie
From: root 

---
 drivers/vfio/pci/vfio_pci.c | 28 +++-
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 ++
 drivers/vfio/pci/vfio_pci_private.h |  1 +
 include/uapi/linux/vfio.h   |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 188b1ff..2d12b03 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -363,7 +363,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device 
*vdev, int irq_type)
 
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
}
-   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
+   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX ||
+  irq_type == VFIO_PCI_RESUME_IRQ_INDEX) {
if (pci_is_pcie(vdev->pdev))
return 1;
} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
@@ -731,6 +732,7 @@ static long vfio_pci_ioctl(void *device_data,
case VFIO_PCI_REQ_IRQ_INDEX:
break;
case VFIO_PCI_ERR_IRQ_INDEX:
+   case VFIO_PCI_RESUME_IRQ_INDEX:
if (pci_is_pcie(vdev->pdev))
break;
/* pass thru to return error */
@@ -1234,8 +1236,32 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct 
pci_dev *pdev,
return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
+static void vfio_pci_aer_resume(struct pci_dev *pdev)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return;
+   }
+
+   mutex_lock(&vdev->igate);
+   if (vdev->resume_trigger)
+   eventfd_signal(vdev->resume_trigger, 1);
+
+   mutex_unlock(&vdev->igate);
+   vfio_device_put(device);
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
+   .resume = vfio_pci_aer_resume,
 };
 
 static struct pci_driver vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 15ecfc9..3a01a62 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -617,6 +617,16 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_device 
*vdev,
return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
 }
 
+static int vfio_pci_set_resume_trigger(struct vfio_pci_device *vdev,
+   unsigned index, unsigned start,
+   unsigned count, uint32_t flags, void *data)
+{
+   if (index != VFIO_PCI_RESUME_IRQ_INDEX)
+   return -EINVAL;
+
+   return vfio_pci_set_ctx_trigger_single(&vdev->resume_trigger, flags, 
data);
+}
+
 static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
@@ -676,6 +686,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, 
uint32_t flags,
break;
}
break;
+   case VFIO_PCI_RESUME_IRQ_INDEX:
+   switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+   case VFIO_IRQ_SET_ACTION_TRIGGER:
+   if (pci_is_pcie(vdev->pdev))
+   func = vfio_pci_set_resume_trigger;
+   break;
+   }
+   break;
}
 
if (!func)
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 016c14a..80d4ddd 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -88,6 +88,7 @@ struct vfio_pci_device {
int refcnt;
struct eventfd_ctx  *err_trigger;
struct eventfd_ctx  *req_trigger;
+   struct eventfd_ctx  *resume_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 255a211..34ab138 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -433,6 +433,7 @@ enum {
VFIO_PCI_MSIX_IRQ_INDEX,
VFIO_PCI_ERR_IRQ_INDEX,
VFIO_PCI_REQ_IRQ_INDEX,
+   VFIO_PCI_RESUME_IRQ_INDEX,
VFIO_PCI_NUM_IRQS
 };
 
-- 
1.8.3.1






[Qemu-devel] [PATCH v2 0/2] vfio: add aer process

2016-07-19 Thread Zhou Jie
From: Chen Fan 

v1-v2:
   1. Add aer process to vfio driver.

Chen Fan (1):
  vfio : add aer process

root (1):
  vfio : resume notifier

 drivers/vfio/pci/vfio_pci.c | 58 -
 drivers/vfio/pci/vfio_pci_intrs.c   | 18 
 drivers/vfio/pci/vfio_pci_private.h |  3 ++
 include/uapi/linux/vfio.h   |  3 ++
 4 files changed, 81 insertions(+), 1 deletion(-)

-- 
1.8.3.1






Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-07-12 Thread Zhou Jie

Hi Alex,


I will use workable state support flag
to let user know whether the kenerl support block feature.
And make configure space writing and ioctl function blocked.


And what of my suggestion that a user may desire to poll the state of
the device?

I will also add a poll function to vfio_fops.


Can you explain how this will work?  I was only suggesting that one of
the flag bits in vfio_device_info be allocated to report the current
state of blocking and the user could poll by repeatedly calling the
DEVICE_INFO ioctl.  Are you thinking of using POLLOUT/POLLIN?  I'm not
sure if those are a perfect match since it's really only the PCI config
region and a few ioctls where access is blocked, other operations may
proceed normally.

Sorry, I had misunderstood your meaning.
I will use one of the flag bits in vfio_device_info to
 report the current state of blocking.


The following code will be modified.
1. vfio_pci_ioctl
add a flag in vfio_device_info for workable_state support
2. vfio_pci_ioctl
During err occurs and resume:
if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
|| VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
block for workable_state clearing
3. vfio_pci_write
During err occurs and resume:
block write to configure space
4. vfio_pci_aer_err_detected
Set workable_state to false in "struct vfio_pci_device"
teardown the interrupt
5. vfio_pci_aer_resume
Set workable_state to true in "struct vfio_pci_device"
6. vfio_fops
Add poll function


I would still suggest that the name "workable_state" is quite vague.
Something like aer_error_in_progress is much more specific.  Thanks,

OK, I will alter the name.

Sincerely
Zhou Jie






Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-07-12 Thread Zhou Jie

Hi Alex,


I will use workable state support flag
to let user know whether the kenerl support block feature.
And make configure space writing and ioctl function blocked.


And what of my suggestion that a user may desire to poll the state of
the device?

I will also add a poll function to vfio_fops.


A user does know what the vfio driver has done if you define the
behavior that on an AER error reported event, as signaled to the user
via the error notification interrupt, vfio-pci will teardown device
interrupts to an uninitialized state.  The difference between the
command register approach you suggest and the teardown I suggest is
that the command register is simply masking interrupt deliver while the
teardown approach returns the device to an uninitialized interrupt
state.  Take a look at the device state when a bus reset occurs, what
state is saved and restored and what is left at a default PCI value.
The command register is saved and restored, so any manipulation we do
of it is racing the host kernel AER handling and bus reset.  What about
MSI and MSI-X?  Looks to me like those are left at the PCI default
initialization state, so now after an AER error we have irq handlers
and eventfds configured, while in fact the device has been
de-programmed.  To handle that we're expecting users to teardown the
interrupt state and re-establish it?  Again, why not just teardown the
interrupt state ourselves?  I dont' see the value in simply masking the
command register, especially when it doesn't handle the no-DisINTx case.

I understand.
Thank you very much to explain this to me.
I will teardown the interrupt state.


We cannot depend on the behavior of any given driver and the fact that
the guest driver may teardown interrupts anyway is not a justification
that vfio shouldn't be doing this to make the device state presented to
the user consistent.  Thanks,

I understand.

The following code will be modified.
1. vfio_pci_ioctl
   add a flag in vfio_device_info for workable_state support
2. vfio_pci_ioctl
   During err occurs and resume:
   if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
   || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
   block for workable_state clearing
3. vfio_pci_write
   During err occurs and resume:
   block write to configure space
4. vfio_pci_aer_err_detected
   Set workable_state to false in "struct vfio_pci_device"
   teardown the interrupt
5. vfio_pci_aer_resume
   Set workable_state to true in "struct vfio_pci_device"
6. vfio_fops
   Add poll function

Sincerely
Zhou Jie






Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-07-11 Thread Zhou Jie

Hi Alex,


The variable clearly isn't visible to the user, so the user can know
whether the kernel supports this feature, but not whether the feature
is currently active.  Perhaps there's no way to avoid races completely,
but don't you expect that if we define that certain operations are
blocked after an error notification that a user may want some way to
poll for whether the block is active after a reset rather than simply
calling a blocked interface to probe for it?

Yes, I will use access blocked function, not the variable.


I don't understand what this means.

I will use workable state support flag
to let user know whether the kenerl support block feature.
And make configure space writing and ioctl function blocked.


Is that acceptable?  This is part of the problem I have with silently
disabling interrupt delivery via the command register across reset.  It
seems more non-deterministic than properly disabling interrupts and
requiring the user to reinitialize them after error.

What is "properly disabling interrupts"?
User don't know what vfio driver has done.
What's the difference of "disabling interrupt delivery via
the command register" and "doing a teardown of interrupts"?
The interrupts will still lost silently.


Or does it make more sense to
simply disable the interrupts as done in vfio_pci_disable() and define
that the user needs to re-establish interrupts before continuing after
an error event?  Thanks,

If user invoked the vfio_pci_disable by ioctl function.


I'm in no way suggesting that a user invoke vfio_pci_disable(), I'm just
trying to point out that vfio_pci_disable() already does a teardown of
interrupts, similar to what seems to be required here.


Yes, user should re-establish interrupts before
continuing after an error event.


So if we define that users should re-establish interrupts after an
error event, then what's the point of only doing command register
masking of the interrupts and requiring the user to both tear-down the
interrupts and re-establish them?  Thanks,

Take "Intel(R) 10GbE PCI Express Virtual Function" as an example.
In drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
static const struct pci_error_handlers ixgbevf_err_handler = {
.error_detected = ixgbevf_io_error_detected,
.slot_reset = ixgbevf_io_slot_reset,
.resume = ixgbevf_io_resume,
};
User tear-down the interrupts in ixgbevf_io_error_detected function.
And up the interrupts in ixgbevf_io_resume.

Guest OS driver will do both tear-down the interrupts
and re-establish them.
Because it don't know what host vfio driver has done.

I disable the interrupts to pretend them interfere with device reset.

Sincerely
Zhou Jie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-07-09 Thread Zhou Jie

Hi Alex,


The variable clearly isn't visible to the user, so the user can know
whether the kernel supports this feature, but not whether the feature
is currently active.  Perhaps there's no way to avoid races completely,
but don't you expect that if we define that certain operations are
blocked after an error notification that a user may want some way to
poll for whether the block is active after a reset rather than simply
calling a blocked interface to probe for it?

Yes, I will use access blocked function, not the variable.


As we've discussed before, the AER notification needs to be relayed to
the user without delay, otherwise we only increase the gap where the
user might consume bogus data.  It also only seems reasonable to modify
the behavior of the interfaces (ie. blocking) if the user is notified,
which would be through the existing error notifier.  We can never
depend on a specific behavior from the user, we may be dealing with a
malicious user.

We already disable interrupts in vfio_pci_disable() simply by calling
the ioctl function directly.

Sorry, I want to know where is vfio_pci_disable invoked.
I can't find it in ioctl function.


If we simply disable and re-enable interrupts as you propose,
how does the user deal with edge triggered
interrupts that may have occurred during that period?  Are they lost?
Should we instead leave the interrupts enabled but skip
eventfd_signal() in the interrupts handlers, queuing interrupts for
re-delivery after the device is resumed?

Yes, they will lost.


Or does it make more sense to
simply disable the interrupts as done in vfio_pci_disable() and define
that the user needs to re-establish interrupts before continuing after
an error event?  Thanks,

If user invoked the vfio_pci_disable by ioctl function.
Yes, user should re-establish interrupts before
continuing after an error event.

Sincerely
Zhoujie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-07-07 Thread Zhou Jie

Hi Alex,


The following code will be modified.
1. vfio_pci_ioctl
   add a flag in vfio_device_info for workable_state support
   return workable_state in "struct vfio_pci_device" when user get info


Seems like two flags are required, one to indicate the presence of this
feature and another to indicate the state.  I'd prefer something like
access_blocked.

User can get the state by state flag.
And also maybe blocked by ioctl or write functons.
User has choice to invoke which functions.


Let's imagine there's one flag bit, there are two possible polarities,
a) the bit is set when access is available, b) the bit is set when
access is blocked.

Let's examine a), if the bit is not set, does that means that access is
not available or does that mean the kernel doesn't support that
feature?  There's no way to know.  Fail.  So we switch to b), an error
occurs, the bit is not set, does that mean access is blocked or does
that mean that the kernel we're using doesn't support the feature.
Fail.  If there's a way to do this with one bit, please explain it to
me.  Relying on a function to block, which may not be a valid
assumption on the kernel we're using also fails.  Userspace must be
able to know, in a deterministic and backwards compatible way, the
features of the kernel and the behavior to expect.

Sorry, I didn't say clearly.
There is one flag and one variable.
1) workable state support flag
   This flag let user know whether the kenerl support block feature.
2) workable state variable in "struct vfio_pci_device"
   This variable let user know whether the device is blocked.


2. vfio_pci_ioctl
   During err occurs and resume:
   if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
   || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
   block for workable_state clearing
3. vfio_pci_write
   During err occurs and resume:
   ignore and return 0


This is contradictory to your comment "Do nothing for bar regions".
ISTR that returning 0 for read/write calls is an easy way to break
users since we've return neither the desired bytes nor an error code.

No, there is not change for read.
Just return 0 for write.
Return 0 mean that there is no byte has been written.
Consider that the aer has occurred,
it is better to not to write any thing to device.
User can still read/write bar regions by mmap address,
this may generate some date errors,
but it doesn't matter as device is going to be reset.


My statement still stands, you state "Do nothing for bar regions" and
"return 0 for write".  Those are contradictory and as I indicate,
returning 0 is problematic for userspace.  Additionally, making
vfio_pci_write return zero while still allowing writes through the BAR
mmap is inconsistent.

I understand.
I will block writing to configure space.
And don't change for writing bar regions.


4. vfio_pci_aer_err_detected
   Set workable_state to false in "struct vfio_pci_device"
   Disable INTx:
 If Disable INTx is support
   disable by PCI_COMMAND
 else
   disable by disable_irq function
   Disable MSI:
   disable by clearing the "Bus Master Enable" bit of PCI_COMMAND


I've suggested repeatedly to properly teardown these interrupts.  I
disagree with your proposed approach here.  If the device is intended to
be in a state that requires re-initialization then the interrupt setup
should be part of that.

I have traced the INTx functions.
-vfio_pci_intx_unmask_handler
-vfio_pci_intx_mask
-vfio_intx_set_signal

They are invoked by User.
-vfio_pci_write
-vfio_pci_ioctl

During err occurs and resume above functions are blocked.
So, User cann't set the INTx.
I will disable the INTx in vfio_pci_aer_err_detected.
And reset the INTx in vfio_pci_aer_resume
according the original user setting(vdev->ctx[0].masked).


Again, you're missing the point.  If the device is expected to be
reinitialized after resume, why don't we return the device to an
initial state where interrupts are not configured?  I think this
presents inconsistent behavior to the user.

About return the device to an
initial state where interrupts are not configured.
Do you mean free_irq?
User request_irq by vfio_pci_ioctl.
If we free_irq, how can the user know it?
Do you think guest will reinitializ the device after resume,
so it doesn't matter?

Maybe we should not expect what the guest will do.
What I want to do in vfio driver is as following.
1. aer occurs
2. Disable INTx and MSI
3. aer driver reset the device
4. Restore INTx and MSI
5. user process the aer event

Sincerely
Zhou Jie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-07-05 Thread Zhou Jie

Hi Alex,


Due to weekend and holiday in my country, there were zero regular
working hours between your emails.

I wish you had a good time.


The following code will be modified.
1. vfio_pci_ioctl
   add a flag in vfio_device_info for workable_state support
   return workable_state in "struct vfio_pci_device" when user get info


Seems like two flags are required, one to indicate the presence of this
feature and another to indicate the state.  I'd prefer something like
access_blocked.

User can get the state by state flag.
And also maybe blocked by ioctl or write functons.
User has choice to invoke which functions.


2. vfio_pci_ioctl
   During err occurs and resume:
   if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
   || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
   block for workable_state clearing
3. vfio_pci_write
   During err occurs and resume:
   ignore and return 0


This is contradictory to your comment "Do nothing for bar regions".
ISTR that returning 0 for read/write calls is an easy way to break
users since we've return neither the desired bytes nor an error code.

No, there is not change for read.
Just return 0 for write.
Return 0 mean that there is no byte has been written.
Consider that the aer has occurred,
it is better to not to write any thing to device.
User can still read/write bar regions by mmap address,
this may generate some date errors,
but it doesn't matter as device is going to be reset.


4. vfio_pci_aer_err_detected
   Set workable_state to false in "struct vfio_pci_device"
   Disable INTx:
 If Disable INTx is support
   disable by PCI_COMMAND
 else
   disable by disable_irq function
   Disable MSI:
   disable by clearing the "Bus Master Enable" bit of PCI_COMMAND


I've suggested repeatedly to properly teardown these interrupts.  I
disagree with your proposed approach here.  If the device is intended to
be in a state that requires re-initialization then the interrupt setup
should be part of that.

I have traced the INTx functions.
-vfio_pci_intx_unmask_handler
-vfio_pci_intx_mask
-vfio_intx_set_signal

They are invoked by User.
-vfio_pci_write
-vfio_pci_ioctl

During err occurs and resume above functions are blocked.
So, User cann't set the INTx.
I will disable the INTx in vfio_pci_aer_err_detected.
And reset the INTx in vfio_pci_aer_resume
according the original user setting(vdev->ctx[0].masked).


5. vfio_pci_aer_resume
   Set workable_state to true in "struct vfio_pci_device"
   About INTx:
 According to the value of "vdev->ctx[0].masked"
 to decide whether to enable INTx
   About MSI:
 After reset the "Bus Master Enable" bit is default to 0.
 So user should process this after reset.


Again, I think this is error prone, teardown the interrupts and define
that the device state, including interrupts, needs to be reinitialized
after error.  Why are you not incorporating this feedback?  Thanks,

The reinitialization depend on user.
For VFIO driver the process is following.
1. aer occurs
2. disable the following functions of the device
   write(except the bar regions), ioctl and interrupt
3. aer driver reset the device
4. renable the device for user
5. user process the aer event
   Maybe reset the device and reinitialization

What I do is make sure the following points.
1. Host can reset the device between step 2 and 4.
2. The user settings is the same at step 1 and 5.

Sincerely
Zhou Jie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-07-04 Thread Zhou Jie

ping

On 2016/7/3 12:00, Zhou Jie wrote:

Hi Alex,

On 2016/6/30 9:45, Zhou Jie wrote:

Hi Alex,

On 2016/6/30 2:22, Alex Williamson wrote:

On Wed, 29 Jun 2016 16:54:05 +0800
Zhou Jie  wrote:


Hi Alex,


And yet we have struct pci_dev.broken_intx_masking and we test for
working DisINTx via pci_intx_mask_supported() rather than simply
looking for a PCIe device.  Some devices are broken and some simply
don't follow the spec, so you're going to need to deal with that or
exclude those devices.

For those devices I have no way to disable the INTx.


disable_irq()?  Clearly vfio-pci already manages these types of devices
and can disable INTx.  This is why I keep suggesting that maybe tearing
the interrupt setup down completely is a more complete and reliable
approach than masking in the command register.  Unless we're going to
exclude such devices from supporting this mode (which I don't condone),
we must deal with them.

Thank you for tell me that.
Yes, I can use disable_irq to disable the pci device irq.
But should I enable the irq after reset?
I will dig into it.


I will alter the VFIO driver like following.
During err occurs and resume:
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

The following code will be modified.
1. vfio_pci_ioctl
   add a flag in vfio_device_info for workable_state support
   return workable_state in "struct vfio_pci_device" when user get info
2. vfio_pci_ioctl
   During err occurs and resume:
   if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
   || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
   block for workable_state clearing
3. vfio_pci_write
   During err occurs and resume:
   ignore and return 0
4. vfio_pci_aer_err_detected
   Set workable_state to false in "struct vfio_pci_device"
   Disable INTx:
 If Disable INTx is support
   disable by PCI_COMMAND
 else
   disable by disable_irq function
   Disable MSI:
   disable by clearing the "Bus Master Enable" bit of PCI_COMMAND

5. vfio_pci_aer_resume
   Set workable_state to true in "struct vfio_pci_device"
   About INTx:
 According to the value of "vdev->ctx[0].masked"
 to decide whether to enable INTx
   About MSI:
 After reset the "Bus Master Enable" bit is default to 0.
 So user should process this after reset.

Sincerely
Zhou Jie






--

周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2...@cn.fujitsu.com






Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-07-02 Thread Zhou Jie

Hi Alex,

On 2016/6/30 9:45, Zhou Jie wrote:

Hi Alex,

On 2016/6/30 2:22, Alex Williamson wrote:

On Wed, 29 Jun 2016 16:54:05 +0800
Zhou Jie  wrote:


Hi Alex,


And yet we have struct pci_dev.broken_intx_masking and we test for
working DisINTx via pci_intx_mask_supported() rather than simply
looking for a PCIe device.  Some devices are broken and some simply
don't follow the spec, so you're going to need to deal with that or
exclude those devices.

For those devices I have no way to disable the INTx.


disable_irq()?  Clearly vfio-pci already manages these types of devices
and can disable INTx.  This is why I keep suggesting that maybe tearing
the interrupt setup down completely is a more complete and reliable
approach than masking in the command register.  Unless we're going to
exclude such devices from supporting this mode (which I don't condone),
we must deal with them.

Thank you for tell me that.
Yes, I can use disable_irq to disable the pci device irq.
But should I enable the irq after reset?
I will dig into it.


I will alter the VFIO driver like following.
During err occurs and resume:
1. Make config space read only.
2. Disable INTx/MSI Interrupt.
3. Do nothing for bar regions.

The following code will be modified.
1. vfio_pci_ioctl
   add a flag in vfio_device_info for workable_state support
   return workable_state in "struct vfio_pci_device" when user get info
2. vfio_pci_ioctl
   During err occurs and resume:
   if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET
   || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET)
   block for workable_state clearing
3. vfio_pci_write
   During err occurs and resume:
   ignore and return 0
4. vfio_pci_aer_err_detected
   Set workable_state to false in "struct vfio_pci_device"
   Disable INTx:
 If Disable INTx is support
   disable by PCI_COMMAND
 else
   disable by disable_irq function
   Disable MSI:
   disable by clearing the "Bus Master Enable" bit of PCI_COMMAND

5. vfio_pci_aer_resume
   Set workable_state to true in "struct vfio_pci_device"
   About INTx:
 According to the value of "vdev->ctx[0].masked"
 to decide whether to enable INTx
   About MSI:
 After reset the "Bus Master Enable" bit is default to 0.
 So user should process this after reset.

Sincerely
Zhou Jie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-29 Thread Zhou Jie

Hi Alex,

On 2016/6/30 2:22, Alex Williamson wrote:

On Wed, 29 Jun 2016 16:54:05 +0800
Zhou Jie  wrote:


Hi Alex,


And yet we have struct pci_dev.broken_intx_masking and we test for
working DisINTx via pci_intx_mask_supported() rather than simply
looking for a PCIe device.  Some devices are broken and some simply
don't follow the spec, so you're going to need to deal with that or
exclude those devices.

For those devices I have no way to disable the INTx.


disable_irq()?  Clearly vfio-pci already manages these types of devices
and can disable INTx.  This is why I keep suggesting that maybe tearing
the interrupt setup down completely is a more complete and reliable
approach than masking in the command register.  Unless we're going to
exclude such devices from supporting this mode (which I don't condone),
we must deal with them.

Thank you for tell me that.
Yes, I can use disable_irq to disable the pci device irq.
But should I enable the irq after reset?
I will dig into it.

Sincerely
Zhou Jie


How does that happen, aren't we notifying the user at the point the
error occurs, while the device is still in the process or being reset?
My question is how does the user know that the host reset is complete
in order to begin their own re-initialization?

I will add a state in "struct vfio_pci_device".
The state is set when the device can not work such as a aer error
  occured.
And the state is clear when the device can work such as resume
  received.
Return the state when user get info by vfio_pci_ioctl.


The interrupt status will be cleared by hardware.
So the hardware is the same as the state when the
vfio device fd is opened.


The PCI-core in Linux will save and restore the device state around
reset, how do we know that vfio-pci itself is not racing that reset and
whether PCI-core will restore the state including our interrupt masking
or a state without it?  Do we need to restore the state to the one we
saved when we originally opened the device?  Shouldn't that mean we
teardown the interrupt setup the user had prior to the error event?

For above you said.
Maybe disable the interrupt is not a good idea.
Think about what will happend in the interrupt handler.
Maybe read/write configure space and region bar.
I will make the configure space read only.
Do nothing for region bar which used by userd.


I'm thinking that vfio-pci will be attempting to mask the interrupts
via the PCI command register, which is potentially in a state of flux
due to the host reset and yet we're somehow expecting that our write to
the command register sticks.  We certainly have the ability to a)
discard interrupts received between AER error and resume, and b) if we
want to be consistent with requiring the user to reinitialize the
device, then the user interrupt setup should likely be torn down.
Thanks,






Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-29 Thread Zhou Jie

Hi Alex,


And yet we have struct pci_dev.broken_intx_masking and we test for
working DisINTx via pci_intx_mask_supported() rather than simply
looking for a PCIe device.  Some devices are broken and some simply
don't follow the spec, so you're going to need to deal with that or
exclude those devices.

For those devices I have no way to disable the INTx.


How does that happen, aren't we notifying the user at the point the
error occurs, while the device is still in the process or being reset?
My question is how does the user know that the host reset is complete
in order to begin their own re-initialization?

I will add a state in "struct vfio_pci_device".
The state is set when the device can not work such as a aer error
 occured.
And the state is clear when the device can work such as resume
 received.
Return the state when user get info by vfio_pci_ioctl.


The interrupt status will be cleared by hardware.
So the hardware is the same as the state when the
vfio device fd is opened.


The PCI-core in Linux will save and restore the device state around
reset, how do we know that vfio-pci itself is not racing that reset and
whether PCI-core will restore the state including our interrupt masking
or a state without it?  Do we need to restore the state to the one we
saved when we originally opened the device?  Shouldn't that mean we
teardown the interrupt setup the user had prior to the error event?

For above you said.
Maybe disable the interrupt is not a good idea.
Think about what will happend in the interrupt handler.
Maybe read/write configure space and region bar.
I will make the configure space read only.
Do nothing for region bar which used by userd.


How will the user know when the device is
ready to be reset?  Which of the ioctls that you're blocking can they
poll w/o any unwanted side-effects or awkward interactions?  Should
flag bits in the device info ioctl indicate not only support for this
behavior but also the current status?  Thanks,

I can block the reset ioctl and config write.
I will not add flag for the device current status,
because I don't depend on user to prevent awkward interactions.


Ok, so that's a reason to block rather than return -EAGAIN.  Still we
need some way to indicate to the user whether the device supports this
new interaction rather than the existing behavior.  Thanks,

Because write configure space maybe happened in interrupt handler.
I think block is not a good choice.

Sincerely
Zhou Jie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-27 Thread Zhou Jie

Hi Alex,

On 2016/6/28 11:58, Alex Williamson wrote:

On Tue, 28 Jun 2016 11:26:33 +0800
Zhou Jie  wrote:


Hi Alex,


The INTx/MSI part needs further definition for the user.  Are we
actually completely tearing down interrupts with the expectation that
the user will re-enable them or are we just masking them such that the
user needs to unmask?  Also note that not all devices support DisINTx.


After reset, the "Bus Master Enable" bit of "Command Register"
should be cleared, so MSI/MSI- X interrupt Messages is still disabled.
After reset, the "Interrupt Disable" bit of "Command Register"
should be cleared, so INTx interrupts is enabled.
If the device doesn't support INTx, "Interrupt Disable" bit will
hardware to 0, it is OK here.

After fatal-error occurs, the user should reset the device and
reinitialize the device.
So I disable the interrupt before host reset the device,
and let user to do the reinitialization.


I'm dubious here.  When DisINTx is not supported by the device or it's
marked broken in host quirks, then we can't trust the device to stop
sending INTx.  It's hardwired to zero, meaning that it doesn't work or
it's been found to be broken in other ways.  So COMMAND register
masking is not sufficient for all devices.

For Endpoints that generate INTx interrupts, this bit is required.
For Endpoints that do not generate IN Tx interrupts this bit is
optional.  If not implemented, this bit must be hardwired to 0b.
For Root Ports, Switch Ports, and Bridges that generate INTx
interrupts on their own behalf, this bit is required.

The above is from "7.5.1.1." of "PCI Express Base Specification 3.1a".
So I think "Interrupt Disable" bit must be supported by the device
which can generate INTx interrupts.


Also, any time we start
changing the state of the device from what the user expects, we risk
consistency problems.  We need to consider how the user last saw the
device and whether we can legitimately expect them to handle the device
in a new state.  If we expect the user to re-initialize the device then
would it be more correct to teardown all interrupt signaling such that
the device is effectively in the same state as initial handoff when the
vfio device fd is opened?

Before the user re-initialize the device, host has reseted the device.
The interrupt status will be cleared by hardware.
So the hardware is the same as the state when the
vfio device fd is opened.


How will the user know when the device is
ready to be reset?  Which of the ioctls that you're blocking can they
poll w/o any unwanted side-effects or awkward interactions?  Should
flag bits in the device info ioctl indicate not only support for this
behavior but also the current status?  Thanks,

I can block the reset ioctl and config write.
I will not add flag for the device current status,
because I don't depend on user to prevent awkward interactions.

Sincerely
Zhou Jie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-27 Thread Zhou Jie

Hi Alex,


The INTx/MSI part needs further definition for the user.  Are we
actually completely tearing down interrupts with the expectation that
the user will re-enable them or are we just masking them such that the
user needs to unmask?  Also note that not all devices support DisINTx.


After reset, the "Bus Master Enable" bit of "Command Register"
should be cleared, so MSI/MSI- X interrupt Messages is still disabled.
After reset, the "Interrupt Disable" bit of "Command Register"
should be cleared, so INTx interrupts is enabled.
If the device doesn't support INTx, "Interrupt Disable" bit will
hardware to 0, it is OK here.

After fatal-error occurs, the user should reset the device and
reinitialize the device.
So I disable the interrupt before host reset the device,
and let user to do the reinitialization.


Otherwise it seems like a reasonable approach, but I can't guarantee we
won't find new issues along the way.  For instance we'll need to test
how -EAGAIN returns interact with existing QEMU and maybe decided
whether there are cases that are better handled by doing an
interruptible wait.  Thanks,

I will dig into it.

Sincerely
Zhou Jie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-24 Thread Zhou Jie

Hi Alex,


We should never depend on the guest driver to behave in a certain way,
but we need to prioritize what that actually means.  vfio in the kernel
has a responsibility first and foremost to the host kernel.  User owned
devices cannot be allowed to exploit or interfere with the host
regardless of user behavior.  The next priority is correct operation
for the user.  When the host kernel is handling the AER event between
the error and resume notifies, it doesn't have device specific drivers,
it's manipulating the device as a generic PCI device.  That makes me
think that vfio should not allow the user to interact (interfere) with
the device during that process and that such interference can be
limited to standard PCI level interactions.  That means config space,
and things that operate on config space (like interrupt ioctls and
resets).  On the QEMU side, we've sent a notification that an error
occurred, how the user and the guest respond to that is beyond the
concern of vfio in the kernel.  If the user/guest driver continues to
interact with resources on the device, that's fine, but I think vfio in
the kernel does need to prevent the user from interfering with the PCI
state of the device for that brief window when we know the host kernel
is operating on the device.  Otherwise the results are unpredictable
and therefore unsupportable.  Does that make sense?  Thanks,

I understand.

I want to alter the VFIO driver like following.
During err occurs and resume:
1. Make config space read only.
   Ignore config space writing to prevent the user from
   interfering with the PCI state of the device.
   User can get the error infomation by reading the config space.
2. Disable INTx and MSI
   Write "Command Register" to disable INTx and MSI.
3. Do nothing for bar regions.
   Guest driver may access bar regions.
   It doesn't matter as device is going to be reset.

The following code will be modified.
1. vfio_pci_ioctl
   add flag for aer support
2. vfio_pci_ioctl
   During err occurs and resume:
   if (cmd == VFIO_DEVICE_SET_IRQS) return EAGAIN
   if (cmd == VFIO_DEVICE_RESET) return EAGAIN
   if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) return EAGAIN
   if (cmd == VFIO_DEVICE_PCI_HOT_RESET) return EAGAIN
3. vfio_pci_write
   During err occurs and resume:
   block
4. vfio_pci_aer_err_detected
   Set aer state in "struct vfio_pci_device"
   Write "Command Register" to disable INTx and MSI.
5. vfio_pci_aer_resume
   Clear aer state in "struct vfio_pci_device"
   I don't need to enable INTx and MSI.
   The device will be initalized by guest driver.

Sincerely
Zhoujie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-22 Thread Zhou Jie

Hi Alex,

On 2016/6/22 13:45, Zhou Jie wrote:

Hi Alex,



In vfio I have some questions.
1. How can I disable the access by mmap?
We can disable all access to vfio fd by returning a EAGAIN error
if user try to access it during the reset period until the host
reset finished.
But about the bar region which is maped by vfio_pci_mmap.
How can I disable it in vfio driver?
Even there is a way to do it,
how about the complexity to recovery the mmap?


That's exactly the "sticky point" I refer to above, you'd need to
solve that problem.  MST would probably still argue that we don't need
to disable all those interfaces, a userspace driver can already do
things like disable mmio space and then attempt to read from the mmio
space of the device.

You said we should not depend on user to protect the device
 be accessed during the reset period.


So maybe the problem can be simplified to
non-device specific interfaces, like config space access plus ioctls.

I don't understand what's your mean.


When a fatal aer error occurs the process is following.
For host
   aer driver detect aer error
-> vfio driver send aer error
-> aer driver reset bus
-> qemu report aer error
For guest
-> aer driver detect aer error
-> aer driver reset bus
-> device driver maybe disable device

I am not sure if all the device driver disable device
when a fatal aer error occurs.
Should we depend on the guest device driver to protect the device
be accessed during the reset period?

Sincerely
Zhoujie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-21 Thread Zhou Jie

Hi Alex,



In vfio I have some questions.
1. How can I disable the access by mmap?
We can disable all access to vfio fd by returning a EAGAIN error
if user try to access it during the reset period until the host
reset finished.
But about the bar region which is maped by vfio_pci_mmap.
How can I disable it in vfio driver?
Even there is a way to do it,
how about the complexity to recovery the mmap?


That's exactly the "sticky point" I refer to above, you'd need to
solve that problem.  MST would probably still argue that we don't need
to disable all those interfaces, a userspace driver can already do
things like disable mmio space and then attempt to read from the mmio
space of the device.

You said we should not depend on user to protect the device
 be accessed during the reset period.


So maybe the problem can be simplified to
non-device specific interfaces, like config space access plus ioctls.

I don't understand what's your mean.

Sincerely
Zhoujie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-21 Thread Zhou Jie

Hi Alex,


Hi Alex,
  on kernel side, I think if we don't trust the user behaviors, we
should
  disable the access of vfio-pci interface once vfio-pci driver got the
error_detected,
  we should disable all access to vfio fd regardless whether the vfio-pci
  was assigned to a VM, we also can return a EAGAIN error if user try
  to access it during the reset period until the host reset finished.
  on qemu side, when we got a error_detect, we pass through the
aer error to guest directly, ignore all access to vfio-pci during this
time,
when qemu need to do a hot reset, we can retry to get the info from
the get info ioctl until we got the info that vfio-pci has been reset
finished,
then do the hot_reset ioctl if need, the kernel should ensure the ioctl
become
 accessible after host reset completed.



That sounds pretty thorough, the sticky point there is always disabling
the device mmaps w/o a revoke interface.  Do we invalidate the pfn
range and setup a fault handler that blocks on access?  I don't think
we have a whole lot of options, either block or sigbus, but having such
a mechanism might allow us to easily put a device in a "dead" state
where the user can't touch it, which could be useful for other purposes
too.  QEMU would also need to timeout after some number of reset
attempts and assume the device is not coming back.  Plus we'd need a
device flag to indicate this behavior.  Thanks,

Alex


In vfio I have some questions.
1. How can I disable the access by mmap?
   We can disable all access to vfio fd by returning a EAGAIN error
   if user try to access it during the reset period until the host
   reset finished.
   But about the bar region which is maped by vfio_pci_mmap.
   How can I disable it in vfio driver?
   Even there is a way to do it,
   how about the complexity to recovery the mmap?

In qemu I have following proposals.
1. Setup a fault handler that blocks on access of bar region.
   So the data transmission will be blocked.
2. Disable vfio_pci_write_config, but keep vfio_pci_read_config
   enabled.
   The VM can get the error information by reading configure space.
   But operation of writing the configure space will be ignored.
3. Get VFIO device infomation instend of receiving resume notification.
   When I tested the non-fatal error.
   I found that sometimes the qemu receive resume notification earlier
   than error notification.
   The notification receiving time between different eventfd is not
   in the order of sending time.

Sincerely
Zhoujie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-20 Thread Zhou Jie

Hi, Alex


I was really hoping to hear your opinion, or at least some further
discussion of pros and cons rather than simply parroting back my idea.

I understand.


My current thinking is that a resume notifier to userspace is poorly
defined, it's not clear what the user can and cannot do between an
error notification and the resume notification.

Yes, do nothing between that time is better.


One approach to solve
that might be that the kernel internally handles the resume
notifications.  Maybe that means blocking the ioctl (interruptible
timeout) until the internal resume occurs, or maybe that means
returning -EAGAIN.

I don't think it is a good idea.
The kernel give the error and resume notifications, it's enough.
It's up to user to how to use them.


Probably implementations of each need to be worked
through to determine which is better.  We don't want to add complexity
to the kernel simply to make things easier for userspace, but we also
don't want a poorly specified interface that is difficult for
userspace to use correctly.  Thanks,

In qemu, the aer recovery process:
  1. Detect support for resume notification
 If host vfio driver does not support for resume notification,
 directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Disable the device.
 Unmap the config space and bar region.
  4. Delay the guest directed bus reset.
  5. Wait for resume notification.
 If we don't get the resume notification from the host after
 some timeout, we would abort the guest directed bus reset
 altogether and unplug of the device to prevent it from further
 interacting with the VM.
  6. After get the resume notification reset bus and enable the device.

I think we only make sure the disabled device
 will not interact with the VM.

Sincerely
Zhou jie



Alex


.



--

周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2...@cn.fujitsu.com






Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-20 Thread Zhou Jie

ping

On 2016/6/12 10:38, Zhou Jie wrote:

Hi, Alex


It seems like we have a number of questions open in the thread with MST
from the previous version, particularly whether we should actually drop
the resume notifier and block the reset in the kernel.  The concern
being that it's not very well specified what we can and cannot do
between the error interrupt and the resume interrupt.  We'd probably
need some other indicate of whether the host has this capability,
perhaps a flag in vfio_device_info.  Appreciate your opinions there.
Thanks,

Alex


Have you had a conclusion with MST?

How about the process like following?
1. Detect support for VFIO reset blocking.
   Maybe use vfio_device_info.
2. Immediately notify the VM on error detected.
3. Invoke ioctl(VFIO_DEVICE_PCI_HOT_RESET).
   If kernel is doing reset, then block in ioctl.

   Can I ignore the resume notifier?
   else
   Shall ioctl block until after receiving resume notification?

Sincerely
Zhou Jie






--

周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2...@cn.fujitsu.com






Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-06-11 Thread Zhou Jie

Hi, Alex

On 2016/6/9 23:39, Alexander Duyck wrote:

On Thu, Jun 9, 2016 at 3:14 AM, Zhou Jie  wrote:

TO Alex
TO Michael

   In your solution you add a emulate PCI bridge to act as
   a bridge between direct assigned devices and the host bridge.
   Do you mean put all direct assigned devices to
   one emulate PCI bridge?
   If yes, this maybe bring some problems.

   We are writing a patchset to support aer feature in qemu.
   When assigning a vfio device with AER enabled, we must check whether
   the device supports a host bus reset (ie. hot reset) as this may be
   used by the guest OS in order to recover the device from an AER
   error.
   QEMU must therefore have the ability to perform a physical
   host bus reset using the existing vfio APIs in response to a virtual
   bus reset in the VM.
   A physical bus reset affects all of the devices on the host bus.
   Therefore all physical devices affected by a bus reset must be
   configured on the same virtual bus in the VM.
   And no devices unaffected by the bus reset,
   be configured on the same virtual bus.

   http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg02989.html

Sincerely,
Zhou Jie


That makes sense, but I don't think you have to worry much about this
at this point at least on my side as this was mostly just theory and I
haven't had a chance to put any of it into practice as of yet.

My idea has been evolving on this for a while.  One thought I had is
that we may want to have something like an emulated IOMMU and if
possible we would want to split it up over multiple domains just so we
can be certain that the virtual interfaces and the physical ones
existed in separate domains.  In regards to your concerns perhaps what
we could do is put each assigned device into its own domain to prevent
them from affecting each other.  To that end we could probably break
things up so that each device effectively lives in its own PCIe slot
in the emulated system.  Then when we start a migration of the guest
the assigned device domains would then have to be tracked for unmap
and sync calls when the direction is from the device.

I will keep your concerns in mind in the future when I get some time
to look at exploring this solution further.

- Alex


I am thinking about the practice of migration of passthrough device.

In your solution, you use a vendor specific configuration space to
negotiate with guest.
If you put each assigned device into its own domain,
how can qemu negotiate with guest?
Add the vendor specific configuration space to every pci bus which
is assigned a passthrough device?

Sincerely
Zhou Jie





Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-06-11 Thread Zhou Jie

Hi, Alex


It seems like we have a number of questions open in the thread with MST
from the previous version, particularly whether we should actually drop
the resume notifier and block the reset in the kernel.  The concern
being that it's not very well specified what we can and cannot do
between the error interrupt and the resume interrupt.  We'd probably
need some other indicate of whether the host has this capability,
perhaps a flag in vfio_device_info.  Appreciate your opinions there.
Thanks,

Alex


Have you had a conclusion with MST?

How about the process like following?
1. Detect support for VFIO reset blocking.
   Maybe use vfio_device_info.
2. Immediately notify the VM on error detected.
3. Invoke ioctl(VFIO_DEVICE_PCI_HOT_RESET).
   If kernel is doing reset, then block in ioctl.

   Can I ignore the resume notifier?
   else
   Shall ioctl block until after receiving resume notification?

Sincerely
Zhou Jie





Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-06-09 Thread Zhou Jie

TO Alex
TO Michael

   In your solution you add a emulate PCI bridge to act as
   a bridge between direct assigned devices and the host bridge.
   Do you mean put all direct assigned devices to
   one emulate PCI bridge?
   If yes, this maybe bring some problems.

   We are writing a patchset to support aer feature in qemu.
   When assigning a vfio device with AER enabled, we must check whether
   the device supports a host bus reset (ie. hot reset) as this may be
   used by the guest OS in order to recover the device from an AER
   error.
   QEMU must therefore have the ability to perform a physical
   host bus reset using the existing vfio APIs in response to a virtual
   bus reset in the VM.
   A physical bus reset affects all of the devices on the host bus.
   Therefore all physical devices affected by a bus reset must be
   configured on the same virtual bus in the VM.
   And no devices unaffected by the bus reset,
   be configured on the same virtual bus.

   http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg02989.html

Sincerely,
Zhou Jie

On 2016/6/7 0:04, Alex Duyck wrote:

On Mon, Jun 6, 2016 at 2:18 AM, Zhou Jie  wrote:

Hi Alex,


On 2016/1/6 0:18, Alexander Duyck wrote:


On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin  wrote:


On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:


The two mechanisms referenced above would likely require coordination
with
QEMU and as such are open to discussion.  I haven't attempted to
address
them as I am not sure there is a consensus as of yet.  My personal
preference would be to add a vendor-specific configuration block to
the
emulated pci-bridge interfaces created by QEMU that would allow us to
essentially extend shpc to support guest live migration with
pass-through
devices.



shpc?



That is kind of what I was thinking.  We basically need some mechanism
to allow for the host to ask the device to quiesce.  It has been
proposed to possibly even look at something like an ACPI interface
since I know ACPI is used by QEMU to manage hot-plug in the standard
case.

- Alex




Start by using hot-unplug for this!

Really use your patch guest side, and write host side
to allow starting migration with the device, but
defer completing it.



Yeah, I'm fully on board with this idea, though I'm not really working
on this right now since last I knew the folks on this thread from
Intel were working on it.  My patches were mostly meant to be a nudge
in this direction so that we could get away from the driver specific
code.



I have seen your email about live migration.

I conclude the idea you proposed as following.
1. Extend swiotlb to allow for a page dirtying functionality.
2. Use pci express capability to implement of a PCI bridge to act
   as a bridge between direct assigned devices and the host bridge.
3. Using APCI event or extend shpc driver to support device pause.
Is it right?

Will you implement the patchs for live migration?


That is pretty much the heart of the proposal I had.  I submitted an
RFC as a proof-of-concept for item 1 in the hopes that someone else
might try tackling items 2 and 3 but I haven't seen any updates since
then.  The trick is to find a way to make it so that item 1 doesn't
slow down standard SWIOTLB when you are not migrating a VM. If nothing
else we would probably just need to add a static key that we could
default to false unless there is a PCI bridge indicating we are
starting a migration.

I haven't had time to really work on this though. In addition I am not
that familiar with QEMU and the internals of live migration so pieces
2 and 3 would take me some additional time to work on.

- Alex


.







Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-06-06 Thread Zhou Jie

Hi Alex,

On 2016/1/6 0:18, Alexander Duyck wrote:

On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin  wrote:

On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:

The two mechanisms referenced above would likely require coordination with
QEMU and as such are open to discussion.  I haven't attempted to address
them as I am not sure there is a consensus as of yet.  My personal
preference would be to add a vendor-specific configuration block to the
emulated pci-bridge interfaces created by QEMU that would allow us to
essentially extend shpc to support guest live migration with pass-through
devices.


shpc?


That is kind of what I was thinking.  We basically need some mechanism
to allow for the host to ask the device to quiesce.  It has been
proposed to possibly even look at something like an ACPI interface
since I know ACPI is used by QEMU to manage hot-plug in the standard
case.

- Alex



Start by using hot-unplug for this!

Really use your patch guest side, and write host side
to allow starting migration with the device, but
defer completing it.


Yeah, I'm fully on board with this idea, though I'm not really working
on this right now since last I knew the folks on this thread from
Intel were working on it.  My patches were mostly meant to be a nudge
in this direction so that we could get away from the driver specific
code.


I have seen your email about live migration.

I conclude the idea you proposed as following.
1. Extend swiotlb to allow for a page dirtying functionality.
2. Use pci express capability to implement of a PCI bridge to act
   as a bridge between direct assigned devices and the host bridge.
3. Using APCI event or extend shpc driver to support device pause.
Is it right?

Will you implement the patchs for live migration?

Sincerely,
Zhou Jie





So

1.- host tells guest to start tracking memory writes
2.- guest acks
3.- migration starts
4.- most memory is migrated
5.- host tells guest to eject device
6.- guest acks
7.- stop vm and migrate rest of state



Sounds about right.  The only way this differs from what I see as the
final solution for this is that instead of fully ejecting the device
in step 5 the driver would instead pause the device and give the host
something like 10 seconds to stop the VM and resume with the same
device connected if it is available.  We would probably also need to
look at a solution that would force the device to be ejected or abort
prior to starting the migration if it doesn't give us the ack in step
2.


It will already be a win since hot unplug after migration starts and
most memory has been migrated is better than hot unplug before migration
starts.


Right.  Generally the longer the VF can be maintained as a part of the
guest the longer the network performance is improved versus using a
purely virtual interface.


Then measure downtime and profile. Then we can look at ways
to quiesce device faster which really means step 5 is replaced
with "host tells guest to quiesce device and dirty (or just unmap!)
all memory mapped for write by device".


Step 5 will be the spot where we really need to start modifying
drivers.  Specifically we probably need to go through and clean-up
things so that we can reduce as many of the delays in the driver
suspend/resume path as possible.  I suspect there is quite a bit that
can be done there that would probably also improve boot and shutdown
times since those are also impacted by the devices.

- Alex



.







[Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume

2016-05-26 Thread Zhou Jie
From: Chen Fan 

For supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

It indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. But in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we introduce a resmue notifier to assure host reset
completely.
In qemu, the aer recovery process:
  1. Detect support for resume notification
 If host vfio driver does not support for resume notification,
 directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Delay the guest directed bus reset.
  4. Wait for resume notification.
 If we don't get the resume notification from the host after
 some timeout, we would abort the guest directed bus reset
 altogether and unplug of the device to prevent it from further
 interacting with the VM.
  5. After get the resume notification reset bus.

Signed-off-by: Chen Fan 
Signed-off-by: Zhou Jie 
---
v7->v8
 *Add a comment for why VFIO_RESET_TIMEOUT value is 1000.
 *change vfio_resume_cap to pci_aer_has_resume.
 *change vfio_resume_notifier_handler to vfio_aer_resume_notifier_handler.
 *change reset_timer to pci_aer_reset_blocked_timer.
 *Remove error_report for not supporting resume notification in
  vfio_populate_device function.
 *All error and resume tracking is done with atomic bitmap on function 0.
 *Remove stalling any access to the device until resume is signaled.
  Because guest OS need read configure space to know the reason of error.
  And it should up to guest OS to decide to stop access BAR region.
 *Still use timer to delay reset.
  Because vfio_aer_resume_notifier_handler cann't be invoked when
  vfio_pci_reset is blocked.

 hw/vfio/pci.c  | 223 -
 hw/vfio/pci.h  |   5 +
 linux-headers/linux/vfio.h |   1 +
 3 files changed, 228 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6877a3d..77d86d8 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -35,6 +35,12 @@
 #include "trace.h"
 
 #define MSIX_CAP_LENGTH 12
+/*
+ * Timeout for waiting resume notification, it is 1 second.
+ * The resume notificaton will be sent after host aer error recovery.
+ * For hardware bus reset 1 second will be enough.
+ */
+#define VFIO_RESET_TIMEOUT 1000
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
@@ -1937,6 +1943,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice 
*vdev, Error **errp)
 VFIOGroup *group;
 int ret, i, devfn, range_limit;
 
+if (!vdev->pci_aer_has_resume) {
+error_setg(errp, "vfio: Cannot enable AER for device %s,"
+   " host vfio driver does not support for"
+   " resume notification",
+   vdev->vbasedev.name);
+return;
+}
+
 ret = vfio_get_hot_reset_info(vdev, &info);
 if (ret) {
 error_setg(errp, "vfio: Cannot enable AER for device %s,"
@@ -2594,6 +2608,17 @@ static int vfio_populate_device(VFIOPCIDevice *vdev)
  vbasedev->name);
 }
 
+irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+if (ret) {
+/* This can fail for an old kernel or legacy PCI dev */
+trace_vfio_populate_device_get_irq_info_failure();
+ret = 0;
+} else if (irq_info.count == 1) {
+vdev->pci_aer_has_resume = true;
+}
+
 error:
 return ret;
 }
@@ -2606,6 +2631,72 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
 vfio_put_base_device(&vdev->vbasedev);
 }
 
+static void vfio_aer_error_set_signaled(VFIOPCIDevice *vdev)
+{
+PCIDevice *dev = &vdev->pdev;
+PCIDevice *dev_0 = pci_get_function_0(dev);
+VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+unsigned int func = PCI_FUNC(dev->devfn);
+
+bitmap_set_atomic(vdev_0->pci_aer_error_signaled_bitmap, func, 1);
+}
+
+static void vfio_aer_resume_set_signaled(VFIOPCIDevice *vdev)
+{
+PCIDevice *dev = &vdev->pdev;
+PCIDevice *dev_0 = pci_get_function_0(dev);
+VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+unsigned int func = PCI_FUNC(dev->devfn);
+
+bitmap_set_atomic(vdev_0->pci_aer_resume_signaled_bitmap, func, 1);
+}
+
+static void vfio_aer_error_clear_all_signaled(VFIOPCIDevice *vdev)
+{
+PCIDevice *d

Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume

2016-05-25 Thread Zhou Jie

Hi, Alex

>> Do we even need a timer?  What if we simply spin?
>>
>> for (i = 0; i < 1000; i++) {
>> if (vdev->pci_aer_resume_signaled) {
>> break;
>> }
>> g_usleep(1000);
>> }
>>
>> if (i == 1000) {
>> /* We failed */
>> } else {
>> /* Proceed with reset */
>> }
>>
>> Does QEMU have enough concurrency to do this?  Thanks,On 2016/5/25
In my test, it can not work.
vfio_aer_resume_notifier_handler is invoked after vfio_pci_reset.
It is useless to wait in vfio_pci_reset.

Sincerely,
Zhou Jie

14:23, Zhou Jie wrote:

Hi, Alex


  3. Stall any access to the device until resume is signaled.


The code below doesn't actually do this, mmaps are disabled, but that
just means any device access will use the slow path through QEMU.  That
path is still fully enabled and so is config space access.

I will modify the code and find other way to
stall any access to the device.


I find that to stall any access to the device,
I need modify the following function.
1. vfio_region_read and vfio_region_write
   For stalling any access to the device bar region.
2. vfio_vga_read and vfio_vga_write
   For stalling any access to the vga device region.
3. vfio_pci_read_config and vfio_pci_write_config
   For stalling any access to the device config space.

What will happen if I don't stall any access to the device?

Sincerely,
Zhou Jie








Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume

2016-05-24 Thread Zhou Jie

Hi, Alex


  3. Stall any access to the device until resume is signaled.


The code below doesn't actually do this, mmaps are disabled, but that
just means any device access will use the slow path through QEMU.  That
path is still fully enabled and so is config space access.

I will modify the code and find other way to
stall any access to the device.


I find that to stall any access to the device,
I need modify the following function.
1. vfio_region_read and vfio_region_write
   For stalling any access to the device bar region.
2. vfio_vga_read and vfio_vga_write
   For stalling any access to the vga device region.
3. vfio_pci_read_config and vfio_pci_write_config
   For stalling any access to the device config space.

What will happen if I don't stall any access to the device?

Sincerely,
Zhou Jie





Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume

2016-05-24 Thread Zhou Jie
So I needn't disable access to the device between being notified that 
the error occurred and being notified to resume.

This will make code simpler.

Am I right?

On 2016/5/24 18:49, Michael S. Tsirkin wrote:

On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote:

I think that means that if we want to switch from a
simple halt-on-error to a mechanism for the guest to handle recovery,
we need to disable access to the device between being notified that the
error occurred and being notified to resume.


But this isn't what happens on bare metal.
Errors are reported asynchronously and host might access the device
meanwhile.  These accesses might or might not trigger more errors, but
fundamentally this should not matter too much as device is going to be
reset.







Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume

2016-05-18 Thread Zhou Jie



On 2016/5/19 10:18, Alex Williamson wrote:

On Thu, 19 May 2016 09:49:00 +0800
Zhou Jie  wrote:


On 2016/5/19 2:26, Alex Williamson wrote:

On Wed, 18 May 2016 11:31:09 +0800
Zhou Jie  wrote:


From: Chen Fan 

For supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

It indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. But in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we introduce a resmue notifier to assure host reset
completely.
In qemu, the aer recovery process:
  1. Detect support for resume notification
 If host vfio driver does not support for resume notification,
 directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Stall any access to the device until resume is signaled.


The code below doesn't actually do this, mmaps are disabled, but that
just means any device access will use the slow path through QEMU.  That
path is still fully enabled and so is config space access.

I will modify the code and find other way to
stall any access to the device.


  4. Delay the guest directed bus reset.


It's delayed, but not the way I expected.  The guest goes on executing
and then we do the reset at some point later?  More comments below...


  5. Wait for resume notification.
 If we don't get the resume notification from the host after
 some timeout, we would abort the guest directed bus reset
 altogether and unplug of the device to prevent it from further
 interacting with the VM.
  6. After get the resume notification reset bus.

Signed-off-by: Chen Fan 
Signed-off-by: Zhou Jie 
---
 hw/vfio/pci.c  | 182 -
 hw/vfio/pci.h  |   5 ++
 linux-headers/linux/vfio.h |   1 +
 3 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6877a3d..39a9a9f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -35,6 +35,7 @@
 #include "trace.h"

 #define MSIX_CAP_LENGTH 12
+#define VFIO_RESET_TIMEOUT 1000


It deserves at least a comment as to why this value was chosen.

OK. I will add a comment here.


 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
@@ -1937,6 +1938,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice 
*vdev, Error **errp)
 VFIOGroup *group;
 int ret, i, devfn, range_limit;

+if (!vdev->vfio_resume_cap) {
+error_setg(errp, "vfio: Cannot enable AER for device %s,"
+   " host vfio driver does not support for"
+   " resume notification",
+   vdev->vbasedev.name);
+return;
+}
+
 ret = vfio_get_hot_reset_info(vdev, &info);
 if (ret) {
 error_setg(errp, "vfio: Cannot enable AER for device %s,"
@@ -2594,6 +2603,23 @@ static int vfio_populate_device(VFIOPCIDevice *vdev)
  vbasedev->name);
 }

+irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+if (ret) {
+/* This can fail for an old kernel or legacy PCI dev */
+trace_vfio_populate_device_get_irq_info_failure();
+ret = 0;
+} else if (irq_info.count == 1) {
+vdev->vfio_resume_cap = true;


"vfio" in the name is redundant, ERR_IRQ uses pci_aer, so a logical
name might be pci_aer_has_resume.

OK.


+} else {
+error_report("vfio: %s "
+ "Could not enable error recovery for the device,"
+ " because host vfio driver does not support for"
+ " resume notification",
+ vbasedev->name);
+}


This error_report makes sense for ERR_IRQ because halt-on-AER is setup
transparently, but I don't think it makes sense here.  If the user has
specified to enable AER then it should either work or they should get
an error message.  If they have not specified to enable AER, why does
the user care if there's an inconsistency here?

OK. I will delete the error report at here.


+
 error:
 return ret;
 }
@@ -2661,6 +2687,17 @@ static void vfio_err_notifier_handler(void *opaque)
 msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
  PCI_ERR_ROOT_CMD_NONFATAL_EN;

+if (isfatal) {
+PCIDevice *dev_0 = pci_get

Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume

2016-05-18 Thread Zhou Jie



On 2016/5/19 10:18, Alex Williamson wrote:

On Thu, 19 May 2016 09:49:00 +0800
Zhou Jie  wrote:


On 2016/5/19 2:26, Alex Williamson wrote:

On Wed, 18 May 2016 11:31:09 +0800
Zhou Jie  wrote:


From: Chen Fan 

For supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

It indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. But in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we introduce a resmue notifier to assure host reset
completely.
In qemu, the aer recovery process:
  1. Detect support for resume notification
 If host vfio driver does not support for resume notification,
 directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Stall any access to the device until resume is signaled.


The code below doesn't actually do this, mmaps are disabled, but that
just means any device access will use the slow path through QEMU.  That
path is still fully enabled and so is config space access.

I will modify the code and find other way to
stall any access to the device.


  4. Delay the guest directed bus reset.


It's delayed, but not the way I expected.  The guest goes on executing
and then we do the reset at some point later?  More comments below...


  5. Wait for resume notification.
 If we don't get the resume notification from the host after
 some timeout, we would abort the guest directed bus reset
 altogether and unplug of the device to prevent it from further
 interacting with the VM.
  6. After get the resume notification reset bus.

Signed-off-by: Chen Fan 
Signed-off-by: Zhou Jie 
---
 hw/vfio/pci.c  | 182 -
 hw/vfio/pci.h  |   5 ++
 linux-headers/linux/vfio.h |   1 +
 3 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6877a3d..39a9a9f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -35,6 +35,7 @@
 #include "trace.h"

 #define MSIX_CAP_LENGTH 12
+#define VFIO_RESET_TIMEOUT 1000


It deserves at least a comment as to why this value was chosen.

OK. I will add a comment here.


 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
@@ -1937,6 +1938,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice 
*vdev, Error **errp)
 VFIOGroup *group;
 int ret, i, devfn, range_limit;

+if (!vdev->vfio_resume_cap) {
+error_setg(errp, "vfio: Cannot enable AER for device %s,"
+   " host vfio driver does not support for"
+   " resume notification",
+   vdev->vbasedev.name);
+return;
+}
+
 ret = vfio_get_hot_reset_info(vdev, &info);
 if (ret) {
 error_setg(errp, "vfio: Cannot enable AER for device %s,"
@@ -2594,6 +2603,23 @@ static int vfio_populate_device(VFIOPCIDevice *vdev)
  vbasedev->name);
 }

+irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+if (ret) {
+/* This can fail for an old kernel or legacy PCI dev */
+trace_vfio_populate_device_get_irq_info_failure();
+ret = 0;
+} else if (irq_info.count == 1) {
+vdev->vfio_resume_cap = true;


"vfio" in the name is redundant, ERR_IRQ uses pci_aer, so a logical
name might be pci_aer_has_resume.

OK.


+} else {
+error_report("vfio: %s "
+ "Could not enable error recovery for the device,"
+ " because host vfio driver does not support for"
+ " resume notification",
+ vbasedev->name);
+}


This error_report makes sense for ERR_IRQ because halt-on-AER is setup
transparently, but I don't think it makes sense here.  If the user has
specified to enable AER then it should either work or they should get
an error message.  If they have not specified to enable AER, why does
the user care if there's an inconsistency here?

OK. I will delete the error report at here.


+
 error:
 return ret;
 }
@@ -2661,6 +2687,17 @@ static void vfio_err_notifier_handler(void *opaque)
 msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
  PCI_ERR_ROOT_CMD_NONFATAL_EN;

+if (isfatal) {
+PCIDevice *dev_0 = pci_get

Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume

2016-05-18 Thread Zhou Jie



On 2016/5/19 2:26, Alex Williamson wrote:

On Wed, 18 May 2016 11:31:09 +0800
Zhou Jie  wrote:


From: Chen Fan 

For supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

It indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. But in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we introduce a resmue notifier to assure host reset
completely.
In qemu, the aer recovery process:
  1. Detect support for resume notification
 If host vfio driver does not support for resume notification,
 directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Stall any access to the device until resume is signaled.


The code below doesn't actually do this, mmaps are disabled, but that
just means any device access will use the slow path through QEMU.  That
path is still fully enabled and so is config space access.

I will modify the code and find other way to
stall any access to the device.


  4. Delay the guest directed bus reset.


It's delayed, but not the way I expected.  The guest goes on executing
and then we do the reset at some point later?  More comments below...


  5. Wait for resume notification.
 If we don't get the resume notification from the host after
 some timeout, we would abort the guest directed bus reset
 altogether and unplug of the device to prevent it from further
 interacting with the VM.
  6. After get the resume notification reset bus.

Signed-off-by: Chen Fan 
Signed-off-by: Zhou Jie 
---
 hw/vfio/pci.c  | 182 -
 hw/vfio/pci.h  |   5 ++
 linux-headers/linux/vfio.h |   1 +
 3 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6877a3d..39a9a9f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -35,6 +35,7 @@
 #include "trace.h"

 #define MSIX_CAP_LENGTH 12
+#define VFIO_RESET_TIMEOUT 1000


It deserves at least a comment as to why this value was chosen.

OK. I will add a comment here.


 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
@@ -1937,6 +1938,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice 
*vdev, Error **errp)
 VFIOGroup *group;
 int ret, i, devfn, range_limit;

+if (!vdev->vfio_resume_cap) {
+error_setg(errp, "vfio: Cannot enable AER for device %s,"
+   " host vfio driver does not support for"
+   " resume notification",
+   vdev->vbasedev.name);
+return;
+}
+
 ret = vfio_get_hot_reset_info(vdev, &info);
 if (ret) {
 error_setg(errp, "vfio: Cannot enable AER for device %s,"
@@ -2594,6 +2603,23 @@ static int vfio_populate_device(VFIOPCIDevice *vdev)
  vbasedev->name);
 }

+irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+if (ret) {
+/* This can fail for an old kernel or legacy PCI dev */
+trace_vfio_populate_device_get_irq_info_failure();
+ret = 0;
+} else if (irq_info.count == 1) {
+vdev->vfio_resume_cap = true;


"vfio" in the name is redundant, ERR_IRQ uses pci_aer, so a logical
name might be pci_aer_has_resume.

OK.


+} else {
+error_report("vfio: %s "
+ "Could not enable error recovery for the device,"
+ " because host vfio driver does not support for"
+ " resume notification",
+ vbasedev->name);
+}


This error_report makes sense for ERR_IRQ because halt-on-AER is setup
transparently, but I don't think it makes sense here.  If the user has
specified to enable AER then it should either work or they should get
an error message.  If they have not specified to enable AER, why does
the user care if there's an inconsistency here?

OK. I will delete the error report at here.


+
 error:
 return ret;
 }
@@ -2661,6 +2687,17 @@ static void vfio_err_notifier_handler(void *opaque)
 msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
  PCI_ERR_ROOT_CMD_NONFATAL_EN;

+if (isfatal) {
+PCIDevice *dev_0 = pci_get_function_0(dev);
+vdev->vfio_resume_wait = true;
+vdev->vfio_resume_arri

[Qemu-devel] [PATCH 10/12] vfio-pci: pass the aer error to guest

2016-05-17 Thread Zhou Jie
From: Chen Fan 

when the vfio device encounters an uncorrectable error in host,
the vfio_pci driver will signal the eventfd registered by this
vfio device, resulting in the qemu eventfd handler getting
invoked.

this patch is to pass the error to guest and let the guest driver
recover from the error.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 60 +--
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 47fd407..6877a3d 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2609,18 +2609,66 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
 static void vfio_err_notifier_handler(void *opaque)
 {
 VFIOPCIDevice *vdev = opaque;
+PCIDevice *dev = &vdev->pdev;
+Error *local_err = NULL;
+PCIEAERMsg msg = {
+.severity = 0,
+.source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn,
+};
 
 if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
 return;
 }
 
+
+if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+goto stop;
+}
+
+/*
+ * in case the real hardware configuration has been changed,
+ * here we should recheck the bus reset capability.
+ */
+vfio_check_hot_bus_reset(vdev, &local_err);
+if (local_err) {
+error_report_err(local_err);
+goto stop;
+}
+
+/*
+ * we should read the error details from the real hardware
+ * configuration spaces, here we only need to do is signaling
+ * to guest an uncorrectable error has occurred.
+ */
+if (dev->exp.aer_cap) {
+uint8_t *aer_cap = dev->config + dev->exp.aer_cap;
+uint32_t uncor_status;
+bool isfatal;
+
+uncor_status = vfio_pci_read_config(dev,
+   dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4);
+
+/*
+ * if the error is not emitted by this device, we can
+ * just ignore it.
+ */
+if (!(uncor_status & ~0UL)) {
+return;
+}
+
+isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER);
+
+msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
+ PCI_ERR_ROOT_CMD_NONFATAL_EN;
+
+pcie_aer_msg(dev, &msg);
+return;
+}
+
+stop:
 /*
- * TBD. Retrieve the error details and decide what action
- * needs to be taken. One of the actions could be to pass
- * the error to the guest and have the guest driver recover
- * from the error. This requires that PCIe capabilities be
- * exposed to the guest. For now, we just terminate the
- * guest to contain the error.
+ * If the aer capability is not exposed to the guest. we just
+ * terminate the guest to contain the error.
  */
 
 error_report("%s(%s) Unrecoverable error detected. Please collect any data 
possible and then kill the guest", __func__, vdev->vbasedev.name);
-- 
1.8.3.1






[Qemu-devel] [PATCH 09/12] vfio: vote the function 0 to do host bus reset when aer occurred

2016-05-17 Thread Zhou Jie
From: Chen Fan 

Due to all devices assigned to VM on the same way as host if enable
aer, so we can easily do the hot reset by selecting the function #0
to do the hot reset.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 14 ++
 hw/vfio/pci.h |  1 +
 2 files changed, 15 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 109b11b..47fd407 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1948,6 +1948,8 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, 
Error **errp)
 /* List all affected devices by bus reset */
 devices = &info->devices[0];
 
+vdev->single_depend_dev = (info->count == 1);
+
 /* Verify that we have all the groups required */
 for (i = 0; i < info->count; i++) {
 PCIHostDeviceAddress host;
@@ -3058,6 +3060,18 @@ static void vfio_pci_reset(DeviceState *dev)
 
 trace_vfio_pci_reset(vdev->vbasedev.name);
 
+if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+PCIDevice *br = pci_bridge_get_device(pdev->bus);
+
+if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) &
+ PCI_BRIDGE_CTL_BUS_RESET)) {
+if (pci_get_function_0(pdev) == pdev) {
+vfio_pci_hot_reset(vdev, vdev->single_depend_dev);
+}
+return;
+}
+}
+
 vfio_pci_pre_reset(vdev);
 
 if (vdev->resetfn && !vdev->resetfn(vdev)) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index db7c6d5..9fb0206 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -143,6 +143,7 @@ typedef struct VFIOPCIDevice {
 bool no_kvm_intx;
 bool no_kvm_msi;
 bool no_kvm_msix;
+bool single_depend_dev;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
-- 
1.8.3.1






[Qemu-devel] [PATCH 04/12] vfio: add aer support for vfio device

2016-05-17 Thread Zhou Jie
From: Chen Fan 

Calling pcie_aer_init to initilize aer related registers for
vfio device, then reload physical related registers to expose
device capability.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 83 +--
 hw/vfio/pci.h |  3 +++
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index f697853..0516d94 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1877,6 +1877,66 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t 
pos)
 return 0;
 }
 
+static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver,
+  int pos, uint16_t size)
+{
+PCIDevice *pdev = &vdev->pdev;
+PCIDevice *dev_iter;
+uint8_t type;
+uint32_t errcap;
+
+if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) {
+pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR,
+cap_ver, pos, size);
+return 0;
+}
+
+dev_iter = pci_bridge_get_device(pdev->bus);
+if (!dev_iter) {
+goto error;
+}
+
+while (dev_iter) {
+if (!pci_is_express(dev_iter)) {
+goto error;
+}
+
+type = pcie_cap_get_type(dev_iter);
+if ((type != PCI_EXP_TYPE_ROOT_PORT &&
+ type != PCI_EXP_TYPE_UPSTREAM &&
+ type != PCI_EXP_TYPE_DOWNSTREAM)) {
+goto error;
+}
+
+if (!dev_iter->exp.aer_cap) {
+goto error;
+}
+
+dev_iter = pci_bridge_get_device(dev_iter->bus);
+}
+
+errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4);
+/*
+ * The ability to record multiple headers is depending on
+ * the state of the Multiple Header Recording Capable bit and
+ * enabled by the Multiple Header Recording Enable bit.
+ */
+if ((errcap & PCI_ERR_CAP_MHRC) &&
+(errcap & PCI_ERR_CAP_MHRE)) {
+pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT;
+} else {
+pdev->exp.aer_log.log_max = 0;
+}
+
+pcie_cap_deverr_init(pdev);
+return pcie_aer_init(pdev, pos, size);
+
+error:
+error_report("vfio: Unable to enable AER for device %s, parent bus "
+ "does not support AER signaling", vdev->vbasedev.name);
+return -1;
+}
+
 static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 {
 PCIDevice *pdev = &vdev->pdev;
@@ -1884,6 +1944,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 uint16_t cap_id, next, size;
 uint8_t cap_ver;
 uint8_t *config;
+int ret = 0;
 
 /*
  * pcie_add_capability always inserts the new capability at the tail
@@ -1907,7 +1968,19 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
  */
 size = vfio_ext_cap_max_size(config, next);
 
-pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+switch (cap_id) {
+case PCI_EXT_CAP_ID_ERR:
+ret = vfio_setup_aer(vdev, cap_ver, next, size);
+break;
+default:
+pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+break;
+}
+
+if (ret) {
+goto out;
+}
+
 pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
 
 /* Use emulated next pointer to allow dropping extended caps */
@@ -1915,8 +1988,9 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
PCI_EXT_CAP_NEXT_MASK);
 }
 
+out:
 g_free(config);
-return 0;
+return ret;
 }
 
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
@@ -2673,6 +2747,11 @@ static int vfio_initfn(PCIDevice *pdev)
 goto out_teardown;
 }
 
+if ((vdev->features & VFIO_FEATURE_ENABLE_AER) &&
+!pdev->exp.aer_cap) {
+goto out_teardown;
+}
+
 /* QEMU emulates all of MSI & MSIX */
 if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
 memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 3976f68..7b3924e 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -15,6 +15,7 @@
 #include "qemu-common.h"
 #include "exec/memory.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bridge.h"
 #include "hw/vfio/vfio-common.h"
 #include "qemu/event_notifier.h"
 #include "qemu/queue.h"
@@ -128,6 +129,8 @@ typedef struct VFIOPCIDevice {
 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT)
 #define VFIO_FEATURE_ENABLE_REQ_BIT 1
 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT)
+#define VFIO_FEATURE_ENABLE_AER_BIT 2
+#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT)
 int32_t bootindex;
 uint8_t pm_cap;
 bool has_vga;
-- 
1.8.3.1






[Qemu-devel] [PATCH 12/12] vfio: add 'aer' property to expose aercap

2016-05-17 Thread Zhou Jie
From: Chen Fan 

add 'aer' property to let user able to decide whether expose
the aer capability. by default we should disable aer feature,
because it needs configuration restrictions.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 39a9a9f..bf8fc95 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3369,6 +3369,8 @@ static Property vfio_pci_dev_properties[] = {
sub_vendor_id, PCI_ANY_ID),
 DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
sub_device_id, PCI_ANY_ID),
+DEFINE_PROP_BIT("aer", VFIOPCIDevice, features,
+VFIO_FEATURE_ENABLE_AER_BIT, false),
 /*
  * TODO - support passed fds... is this necessary?
  * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
-- 
1.8.3.1






[Qemu-devel] [PATCH 05/12] vfio: refine function vfio_pci_host_match

2016-05-17 Thread Zhou Jie
From: Chen Fan 

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0516d94..5b23a86 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2060,14 +2060,27 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
 vfio_intx_enable(vdev);
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+if (strlen(name) != 12 ||
+sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+   &addr->bus, &addr->slot, &addr->function) != 4) {
+return -EINVAL;
+}
+
+return 0;
+}
+
 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
 {
-char tmp[13];
+PCIHostDeviceAddress tmp;
 
-sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
-addr->bus, addr->slot, addr->function);
+if (vfio_pci_name_to_addr(name, &tmp)) {
+return false;
+}
 
-return (strcmp(tmp, name) == 0);
+return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+tmp.slot == addr->slot && tmp.function == addr->function);
 }
 
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
-- 
1.8.3.1






[Qemu-devel] [PATCH 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset

2016-05-17 Thread Zhou Jie
From: Chen Fan 

Squeeze out vfio_pci_do_hot_reset to do host bus reset when AER recovery.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 75 +++
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index cf40f9e..1ad47ef 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1746,6 +1746,48 @@ error:
 return ret;
 }
 
+static int vfio_pci_do_hot_reset(VFIOPCIDevice *vdev,
+ struct vfio_pci_hot_reset_info *info)
+{
+VFIOGroup *group;
+struct vfio_pci_hot_reset *reset;
+int32_t *fds;
+int ret, i, count;
+struct vfio_pci_dependent_device *devices;
+
+/* Determine how many group fds need to be passed */
+count = 0;
+devices = &info->devices[0];
+QLIST_FOREACH(group, &vfio_group_list, next) {
+for (i = 0; i < info->count; i++) {
+if (group->groupid == devices[i].group_id) {
+count++;
+break;
+}
+}
+}
+
+reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
+reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
+fds = &reset->group_fds[0];
+
+/* Fill in group fds */
+QLIST_FOREACH(group, &vfio_group_list, next) {
+for (i = 0; i < info->count; i++) {
+if (group->groupid == devices[i].group_id) {
+fds[reset->count++] = group->fd;
+break;
+}
+}
+}
+
+/* Bus reset! */
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+g_free(reset);
+
+return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
 PCIDevice *pdev = &vdev->pdev;
@@ -1889,9 +1931,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
 VFIOGroup *group;
 struct vfio_pci_hot_reset_info *info = NULL;
 struct vfio_pci_dependent_device *devices;
-struct vfio_pci_hot_reset *reset;
-int32_t *fds;
-int ret, i, count;
+int ret, i;
 bool multi = false;
 
 trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
@@ -1969,34 +2009,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
 goto out_single;
 }
 
-/* Determine how many group fds need to be passed */
-count = 0;
-QLIST_FOREACH(group, &vfio_group_list, next) {
-for (i = 0; i < info->count; i++) {
-if (group->groupid == devices[i].group_id) {
-count++;
-break;
-}
-}
-}
-
-reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
-reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
-fds = &reset->group_fds[0];
-
-/* Fill in group fds */
-QLIST_FOREACH(group, &vfio_group_list, next) {
-for (i = 0; i < info->count; i++) {
-if (group->groupid == devices[i].group_id) {
-fds[reset->count++] = group->fd;
-break;
-}
-}
-}
-
-/* Bus reset! */
-ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
-g_free(reset);
+ret = vfio_pci_do_hot_reset(vdev, info);
 
 trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
 ret ? "%m" : "Success");
-- 
1.8.3.1






[Qemu-devel] [PATCH 03/12] vfio: add pcie extended capability support

2016-05-17 Thread Zhou Jie
From: Chen Fan 

For vfio pcie device, we could expose the extended capability on
PCIE bus. due to add a new pcie capability at the tail of the chain,
in order to avoid config space overwritten, we introduce a copy config
for parsing extended caps. and rebuild the pcie extended config space.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 72 ++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 1ad47ef..f697853 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1528,6 +1528,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, 
uint8_t pos)
 return next - pos;
 }
 
+
+static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
+{
+uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
+
+for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
+tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
+if (tmp > pos && tmp < next) {
+next = tmp;
+}
+}
+
+return next - pos;
+}
+
 static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
 {
 pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
@@ -1862,16 +1877,71 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, 
uint8_t pos)
 return 0;
 }
 
+static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
+{
+PCIDevice *pdev = &vdev->pdev;
+uint32_t header;
+uint16_t cap_id, next, size;
+uint8_t cap_ver;
+uint8_t *config;
+
+/*
+ * pcie_add_capability always inserts the new capability at the tail
+ * of the chain.  Therefore to end up with a chain that matches the
+ * physical device, we cache the config space to avoid overwriting
+ * the original config space when we parse the extended capabilities.
+ */
+config = g_memdup(pdev->config, vdev->config_size);
+
+for (next = PCI_CONFIG_SPACE_SIZE; next;
+ next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
+header = pci_get_long(config + next);
+cap_id = PCI_EXT_CAP_ID(header);
+cap_ver = PCI_EXT_CAP_VER(header);
+
+/*
+ * If it becomes important to configure extended capabilities to their
+ * actual size, use this as the default when it's something we don't
+ * recognize. Since QEMU doesn't actually handle many of the config
+ * accesses, exact size doesn't seem worthwhile.
+ */
+size = vfio_ext_cap_max_size(config, next);
+
+pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0));
+
+/* Use emulated next pointer to allow dropping extended caps */
+pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
+   PCI_EXT_CAP_NEXT_MASK);
+}
+
+g_free(config);
+return 0;
+}
+
 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
 {
 PCIDevice *pdev = &vdev->pdev;
+int ret;
 
 if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
 !pdev->config[PCI_CAPABILITY_LIST]) {
 return 0; /* Nothing to add */
 }
 
-return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+if (ret) {
+return ret;
+}
+
+/* on PCI bus, it doesn't make sense to expose extended capabilities. */
+if (!pci_is_express(pdev) ||
+!pci_bus_is_express(pdev->bus) ||
+!pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
+return 0;
+}
+
+return vfio_add_ext_cap(vdev);
 }
 
 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
-- 
1.8.3.1






[Qemu-devel] [PATCH 06/12] vfio: add check host bus reset is support or not

2016-05-17 Thread Zhou Jie
From: Chen Fan 

When assigning a vfio device with AER enabled, we must check whether
the device supports a host bus reset (ie. hot reset) as this may be
used by the guest OS in order to recover the device from an AER
error.  QEMU must therefore have the ability to perform a physical
host bus reset using the existing vfio APIs in response to a virtual
bus reset in the VM.  A physical bus reset affects all of the devices
on the host bus, therefore we place a few simplifying configuration
restriction on the VM:

 - All physical devices affected by a bus reset must be assigned to
   the VM with AER enabled on each and be configured on the same
   virtual bus in the VM.

 - No devices unaffected by the bus reset, be they physical, emulated,
   or paravirtual may be configured on the same virtual bus as a
   device supporting AER signaling through vfio.

In other words users wishing to enable AER on a multifunction device
need to assign all functions of the device to the same virtual bus
and enable AER support for each device.  The easiest way to
accomplish this is to identity map the physical functions to virtual
functions with multifunction enabled on the virtual device.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 279 +-
 hw/vfio/pci.h |   1 +
 2 files changed, 257 insertions(+), 23 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5b23a86..832f25e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -19,6 +19,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qapi/error.h"
 #include 
 #include 
 #include 
@@ -1716,6 +1717,42 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, 
uint8_t pos)
 }
 }
 
+static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr)
+{
+if (strlen(name) != 12 ||
+sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain,
+   &addr->bus, &addr->slot, &addr->function) != 4) {
+return -EINVAL;
+}
+
+return 0;
+}
+
+static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
+{
+PCIHostDeviceAddress tmp;
+
+if (vfio_pci_name_to_addr(name, &tmp)) {
+return false;
+}
+
+return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+tmp.slot == addr->slot && tmp.function == addr->function);
+}
+
+static bool vfio_pci_host_match_slot(PCIHostDeviceAddress *addr,
+ const char *name)
+{
+PCIHostDeviceAddress tmp;
+
+if (vfio_pci_name_to_addr(name, &tmp)) {
+return false;
+}
+
+return (tmp.domain == addr->domain && tmp.bus == addr->bus &&
+tmp.slot == addr->slot);
+}
+
 /*
  * return negative with errno, return 0 on success.
  * if success, the point of ret_info fill with the affected device reset info.
@@ -1877,6 +1914,200 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, 
uint8_t pos)
 return 0;
 }
 
+static int vfio_device_range_limit(PCIBus *bus)
+{
+PCIDevice *br;
+
+br = pci_bridge_get_device(bus);
+if (!br ||
+!pci_is_express(br) ||
+!(br->exp.exp_cap) ||
+pcie_cap_is_arifwd_enabled(br)) {
+return 255;
+}
+
+return 8;
+}
+
+static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp)
+{
+PCIBus *bus = vdev->pdev.bus;
+struct vfio_pci_hot_reset_info *info = NULL;
+struct vfio_pci_dependent_device *devices;
+VFIOGroup *group;
+int ret, i, devfn, range_limit;
+
+ret = vfio_get_hot_reset_info(vdev, &info);
+if (ret) {
+error_setg(errp, "vfio: Cannot enable AER for device %s,"
+   " device does not support hot reset.",
+   vdev->vbasedev.name);
+return;
+}
+
+/* List all affected devices by bus reset */
+devices = &info->devices[0];
+
+/* Verify that we have all the groups required */
+for (i = 0; i < info->count; i++) {
+PCIHostDeviceAddress host;
+VFIOPCIDevice *tmp;
+VFIODevice *vbasedev_iter;
+bool found = false;
+
+host.domain = devices[i].segment;
+host.bus = devices[i].bus;
+host.slot = PCI_SLOT(devices[i].devfn);
+host.function = PCI_FUNC(devices[i].devfn);
+
+/* Skip the current device */
+if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
+continue;
+}
+
+/* Ensure we own the group of the affected device */
+QLIST_FOREACH(group, &vfio_group_list, next) {
+if (group->groupid == devices[i].group_id) {
+break;
+}
+}
+
+if (!group) {
+error_setg(errp, "vfio: Cannot enable AER for device %s, "
+   "depends on group %d which is not owned.",
+   vdev->vbasedev.name, devices[i].group_id);
+goto out;
+}
+
+/* Ensure affected devices for reset on the same bus */
+QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+

[Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume

2016-05-17 Thread Zhou Jie
From: Chen Fan 

For supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
  1. error_detected
  2. reset_link (if fatal)
  3. slot_reset/mmio_enabled
  4. resume

It indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. But in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we introduce a resmue notifier to assure host reset
completely.
In qemu, the aer recovery process:
  1. Detect support for resume notification
 If host vfio driver does not support for resume notification,
 directly fail to boot up VM as with aer enabled.
  2. Immediately notify the VM on error detected.
  3. Stall any access to the device until resume is signaled.
  4. Delay the guest directed bus reset.
  5. Wait for resume notification.
 If we don't get the resume notification from the host after
 some timeout, we would abort the guest directed bus reset
 altogether and unplug of the device to prevent it from further
 interacting with the VM.
  6. After get the resume notification reset bus.

Signed-off-by: Chen Fan 
Signed-off-by: Zhou Jie 
---
 hw/vfio/pci.c  | 182 -
 hw/vfio/pci.h  |   5 ++
 linux-headers/linux/vfio.h |   1 +
 3 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6877a3d..39a9a9f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -35,6 +35,7 @@
 #include "trace.h"
 
 #define MSIX_CAP_LENGTH 12
+#define VFIO_RESET_TIMEOUT 1000
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
@@ -1937,6 +1938,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice 
*vdev, Error **errp)
 VFIOGroup *group;
 int ret, i, devfn, range_limit;
 
+if (!vdev->vfio_resume_cap) {
+error_setg(errp, "vfio: Cannot enable AER for device %s,"
+   " host vfio driver does not support for"
+   " resume notification",
+   vdev->vbasedev.name);
+return;
+}
+
 ret = vfio_get_hot_reset_info(vdev, &info);
 if (ret) {
 error_setg(errp, "vfio: Cannot enable AER for device %s,"
@@ -2594,6 +2603,23 @@ static int vfio_populate_device(VFIOPCIDevice *vdev)
  vbasedev->name);
 }
 
+irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+if (ret) {
+/* This can fail for an old kernel or legacy PCI dev */
+trace_vfio_populate_device_get_irq_info_failure();
+ret = 0;
+} else if (irq_info.count == 1) {
+vdev->vfio_resume_cap = true;
+} else {
+error_report("vfio: %s "
+ "Could not enable error recovery for the device,"
+ " because host vfio driver does not support for"
+ " resume notification",
+ vbasedev->name);
+}
+
 error:
 return ret;
 }
@@ -2661,6 +2687,17 @@ static void vfio_err_notifier_handler(void *opaque)
 msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN :
  PCI_ERR_ROOT_CMD_NONFATAL_EN;
 
+if (isfatal) {
+PCIDevice *dev_0 = pci_get_function_0(dev);
+vdev->vfio_resume_wait = true;
+vdev->vfio_resume_arrived = false;
+vfio_mmap_set_enabled(vdev, false);
+if (dev_0 != dev) {
+VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0);
+vdev_0->vfio_resume_wait = true;
+vdev_0->vfio_resume_arrived = false;
+}
+}
 pcie_aer_msg(dev, &msg);
 return;
 }
@@ -2756,6 +2793,96 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice 
*vdev)
 event_notifier_cleanup(&vdev->err_notifier);
 }
 
+static void vfio_resume_notifier_handler(void *opaque)
+{
+VFIOPCIDevice *vdev = opaque;
+
+if (!event_notifier_test_and_clear(&vdev->resume_notifier)) {
+return;
+}
+
+vdev->vfio_resume_arrived = true;
+if (vdev->reset_timer != NULL) {
+timer_mod(vdev->reset_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL));
+}
+}
+
+static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev)
+{
+int ret;
+int argsz;
+struct vfio_irq_set *irq_set;
+int32_t *pfd;
+
+if (!(vdev->features & VFIO_FEATURE_ENABLE_AER) ||
+!vdev-

[Qemu-devel] [PATCH 01/12] vfio: extract vfio_get_hot_reset_info as a single function

2016-05-17 Thread Zhou Jie
From: Chen Fan 

The function is used to get affected devices by bus reset.
So here extract it, and can used for aer soon.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 66 +++
 1 file changed, 48 insertions(+), 18 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d091d8c..cf40f9e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1701,6 +1701,51 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, 
uint8_t pos)
 }
 }
 
+/*
+ * return negative with errno, return 0 on success.
+ * if success, the point of ret_info fill with the affected device reset info.
+ *
+ */
+static int vfio_get_hot_reset_info(VFIOPCIDevice *vdev,
+   struct vfio_pci_hot_reset_info **ret_info)
+{
+struct vfio_pci_hot_reset_info *info;
+int ret, count;
+
+*ret_info = NULL;
+
+info = g_malloc0(sizeof(*info));
+info->argsz = sizeof(*info);
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+if (ret && errno != ENOSPC) {
+ret = -errno;
+goto error;
+}
+
+count = info->count;
+
+info = g_realloc(info, sizeof(*info) +
+ (count * sizeof(struct vfio_pci_dependent_device)));
+info->argsz = sizeof(*info) +
+  (count * sizeof(struct vfio_pci_dependent_device));
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+if (ret) {
+ret = -errno;
+error_report("vfio: hot reset info failed: %m");
+goto error;
+}
+
+*ret_info = info;
+info = NULL;
+
+return 0;
+error:
+g_free(info);
+return ret;
+}
+
 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
 {
 PCIDevice *pdev = &vdev->pdev;
@@ -1842,7 +1887,7 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress 
*addr, const char *name)
 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
 {
 VFIOGroup *group;
-struct vfio_pci_hot_reset_info *info;
+struct vfio_pci_hot_reset_info *info = NULL;
 struct vfio_pci_dependent_device *devices;
 struct vfio_pci_hot_reset *reset;
 int32_t *fds;
@@ -1854,12 +1899,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
 vfio_pci_pre_reset(vdev);
 vdev->vbasedev.needs_reset = false;
 
-info = g_malloc0(sizeof(*info));
-info->argsz = sizeof(*info);
-
-ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-if (ret && errno != ENOSPC) {
-ret = -errno;
+ret = vfio_get_hot_reset_info(vdev, &info);
+if (ret) {
 if (!vdev->has_pm_reset) {
 error_report("vfio: Cannot reset device %s, "
  "no available reset mechanism.", vdev->vbasedev.name);
@@ -1867,18 +1908,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
 goto out_single;
 }
 
-count = info->count;
-info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
-info->argsz = sizeof(*info) + (count * sizeof(*devices));
 devices = &info->devices[0];
-
-ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
-if (ret) {
-ret = -errno;
-error_report("vfio: hot reset info failed: %m");
-goto out_single;
-}
-
 trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
 
 /* Verify that we have all the groups required */
-- 
1.8.3.1






[Qemu-devel] [PATCH 08/12] vfio: add check aer functionality for hotplug device

2016-05-17 Thread Zhou Jie
From: Chen Fan 

when function 0 is hot-added, we can check the vfio device
whether support hot bus reset.

Signed-off-by: Chen Fan 
---
 hw/vfio/pci.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 832f25e..109b11b 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3087,6 +3087,19 @@ post_reset:
 vfio_pci_post_reset(vdev);
 }
 
+static void vfio_pci_is_valid(PCIDevice *dev, Error **errp)
+{
+VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, dev);
+Error *local_err = NULL;
+
+if (vdev->features & VFIO_FEATURE_ENABLE_AER) {
+vfio_check_hot_bus_reset(vdev, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+}
+}
+}
+
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = PCI_DEVICE(obj);
@@ -3141,6 +3154,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, 
void *data)
 set_bit(DEVICE_CATEGORY_MISC, dc->categories);
 pdc->init = vfio_initfn;
 pdc->exit = vfio_exitfn;
+pdc->is_valid_func = vfio_pci_is_valid;
 pdc->config_read = vfio_pci_read_config;
 pdc->config_write = vfio_pci_write_config;
 pdc->is_express = 1; /* We might be */
-- 
1.8.3.1






[Qemu-devel] [PATCH v7 00/12] vfio-pci: pass the aer error to guest

2016-05-17 Thread Zhou Jie
From: Chen Fan 

v6-v7:
   1. Stall any access to the device until resume is signaled.
  Do guest directed bus reset after receive resume notification.
  If don't get the resume notification after some timeout unplug the device.
   2. fix the patches 11/12 as Alex sugguestion.

v5-v6:
   1. register resume handler both qemu and kernel to ensure the
  reset in order.
   2. fix the patches 6/12, 7/12 as Alex and MST sugguestion.

v4-v5:
   1. add back the common function 0 hotplug code in pci core.
   2. fix a sporadic device stuck on D3 problem when doing aer recovery.
   3. fix patches 5/12 ~ 9/12 as Alex sugguestion.

v3-v4:
   1. rebase patchset to fit latest master branch.
   2. modifying patches 5/10 ~ 8/10 as Alex sugguestion(Thanks).

v2-v3:
   1. fix patch 4/9, 5/9 as Alex sugguestion.
   2. patches 5/9 ~ 8/9 are made to force limiting that all vfio functions
  are combined in the same way as on the host.

v1-v2:
   1. limit all devices on same bus in guest are on same bus in host in patch 

For now, for vfio pci passthough devices, when qemu receives
an error from host aer report, it just terminate the guest.
But usually user want to know what error occurred. So these patches add
aer capability support for vfio devices, and pass the error to guest.
Then let guest driver to recover from the error.

notes: this series patches enable aer support single/multi-function,
for multi-function, require all the function of the slot assigned to
VM and on the same slot.

Chen Fan (12):
  vfio: extract vfio_get_hot_reset_info as a single function
  vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
  vfio: add pcie extended capability support
  vfio: add aer support for vfio device
  vfio: refine function vfio_pci_host_match
  vfio: add check host bus reset is support or not
  pci: add a pci_function_is_valid callback to check function if valid
  vfio: add check aer functionality for hotplug device
  vfio: vote the function 0 to do host bus reset when aer occurred
  vfio-pci: pass the aer error to guest
  vfio: register aer resume notification handler for aer resume
  vfio: add 'aer' property to expose aercap

 hw/pci/pci.c   |  32 ++
 hw/vfio/pci.c  | 826 +
 hw/vfio/pci.h  |  10 +
 include/hw/pci/pci.h   |   1 +
 linux-headers/linux/vfio.h |   1 +
 5 files changed, 804 insertions(+), 66 deletions(-)

-- 
1.8.3.1






[Qemu-devel] [PATCH 07/12] pci: add a pci_function_is_valid callback to check function if valid

2016-05-17 Thread Zhou Jie
From: Chen Fan 

PCI hotplug requires that function 0 is added last to close the
slot.  Since vfio supporting AER, we require that the VM bus
contains the same set of devices as the host bus to support AER,
we can perform an AER validation test whenever a function 0 in
the VM is hot-added.

Signed-off-by: Chen Fan 
---
 hw/pci/pci.c | 32 
 include/hw/pci/pci.h |  1 +
 2 files changed, 33 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index bb605ef..acd48eb 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1841,6 +1841,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, 
uint8_t devfn)
 return bus->devices[devfn];
 }
 
+static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque)
+{
+PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d);
+Error **errp = opaque;
+
+if (*errp) {
+return;
+}
+
+if (!pc->is_valid_func) {
+return;
+}
+
+pc->is_valid_func(d, errp);
+}
+
 static void pci_qdev_realize(DeviceState *qdev, Error **errp)
 {
 PCIDevice *pci_dev = (PCIDevice *)qdev;
@@ -1883,6 +1899,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error 
**errp)
 pci_qdev_unrealize(DEVICE(pci_dev), NULL);
 return;
 }
+
+/*
+ *  If the function number is 0, indicate the closure of the slot.
+ *  then we get the chance to check all functions on same device
+ *  if valid.
+ */
+if (DEVICE(pci_dev)->hotplugged &&
+pci_get_function_0(pci_dev) == pci_dev) {
+pci_for_each_device(bus, pci_bus_num(bus),
+pci_function_is_valid, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+pci_qdev_unrealize(DEVICE(pci_dev), NULL);
+return;
+}
+}
 }
 
 static void pci_default_realize(PCIDevice *dev, Error **errp)
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index ef6ba51..848fa7b 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -197,6 +197,7 @@ typedef struct PCIDeviceClass {
 
 void (*realize)(PCIDevice *dev, Error **errp);
 int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
+void (*is_valid_func)(PCIDevice *dev, Error **errp);
 PCIUnregisterFunc *exit;
 PCIConfigReadFunc *config_read;
 PCIConfigWriteFunc *config_write;
-- 
1.8.3.1






Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume

2016-05-10 Thread Zhou Jie

Hi, Alex
What do you think about the following solution?
1. Detect support for resume notification.
   If host vfio driver does not have resume notifier flags,
   Directly fail to boot up VM as with aer enabled.
2. Immediately notify the VM on error detected.
3. Stall any access to the device until resume is signaled.
   Disable mmaps, drop writes, return -1 for reads.
4. Delay the guest directed bus reset.
   Don't reset bus in vfio_pci_reset function.
5. Wait for resume notification.
   If we don't get the resume notification from the host after
   some timeout, we would abort the guest directed bus reset
   altogether and make the device disappear,
   Initiating an unplug of the device to prevent it from further
   interacting with the VM.
6. After get the resume notification.
   Reset bus.
   It the second bus reset. Because the host did bus reset already.
   But as you said we shouldn't necessarily design the API that
   strictly around the current behavior of the Linux AER handler.

Sincerely,
Zhou Jie

On 2016/5/7 0:39, Alex Williamson wrote:

On Fri, 6 May 2016 09:38:41 +0800
Chen Fan  wrote:


On 04/26/2016 10:48 PM, Alex Williamson wrote:

On Tue, 26 Apr 2016 11:39:02 +0800
Chen Fan  wrote:


On 04/14/2016 09:02 AM, Chen Fan wrote:

On 04/12/2016 05:38 AM, Alex Williamson wrote:

On Tue, 5 Apr 2016 19:42:02 +0800
Cao jin  wrote:


From: Chen Fan

for supporting aer recovery, host and guest would run the same aer
recovery code, that would do the secondary bus reset if the error
is fatal, the aer recovery process:
1. error_detected
2. reset_link (if fatal)
3. slot_reset/mmio_enabled
4. resume

it indicates that host will do secondary bus reset to reset
the physical devices under bus in step 2, that would cause
devices in D3 status in a short time. but in qemu, we register
an error detected handler, that would be invoked as host broadcasts
the error-detected event in step 1, in order to avoid guest do
reset_link when host do reset_link simultaneously. it may cause
fatal error. we introduce a resmue notifier to assure host reset
completely. then do guest aer injection.

Why is it safe to continue running the VM between the error detected
notification and the resume notification?  We're just pushing back the
point at which we inject the AER into the guest, potentially negating
any benefit by allowing the VM to consume bad data.  Shouldn't we
instead be immediately notifying the VM on error detected, but stalling
any access to the device until resume is signaled?  How do we know that
resume will ever be signaled?  We have both the problem that we may be
running on an older kernel that won't support a resume notification and
the problem that seeing a resume notification depends on the host being
able to successfully complete a link reset after fatal error. We can
detect support for resume notification, but we still need a strategy
for never receiving it.  Thanks,

That's make sense, but I haven't came up with a good idea. do you have
any idea, Alex?

I don't know that there are any good solutions here.  We need to
respond to the current error notifier interrupt and not regress from
our support there.  I think that means that if we want to switch from a
simple halt-on-error to a mechanism for the guest to handle recovery,
we need to disable access to the device between being notified that the
error occurred and being notified to resume.  We can do that by
disabling mmaps to the device and preventing access via the slow path
handlers.  I don't know what the best solution is for preventing access,
do we block and pause the VM or do we drop writes and return -1 for
reads, that's something that needs to be determined.  We also need to
inject the AER into the VM at the point we're notified of an error
because the VM needs to know as soon as possible to stop using the
device or trusting any data from it.  The next coordination point would
be something like the resume notifier that you've added and there are
numerous questions around the interaction of that with the guest
handling.  Clearly we can't do a guest directed bus reset until we get
the resume notifier, so do we block that execution path in QEMU until
the resume notification is received?  What happens if we don't get that
notification?  Is there any way that we can rely on the host having
done a bus reset to the point where we don't need to act on the guest
directed reset?  These are all things that need to be figured out.
Thanks,

Maybe we can simply pause the vcpu running and avoid the VM to
access the device. and add two flags in VFIO_DEVICE_GET_INFO to query
whether the vfio pci driver has a resume notifier,
if it does not have resume notifier flags, we can directly fail to boot
up VM
as with aer enabled.


We can already tell if a resume interrupt is supported betw

[Qemu-devel] [PATCH v4] block: always compile-check debug prints

2016-04-28 Thread Zhou Jie
Files with conditional debug statements should ensure that the printf is
always compiled. This prevents bitrot of the format string of the debug
statement. And switch debug output to stderr.

Signed-off-by: Zhou Jie 
Reviewed-by: Eric Blake 
---
v3 -> v4:
 * Alter commit message to add "And switch debug output to stderr".

 block/curl.c | 10 --
 block/sheepdog.c | 13 -
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index 5a8f8b6..da9f5e8 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -36,10 +36,16 @@
 // #define DEBUG_VERBOSE
 
 #ifdef DEBUG_CURL
-#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
+#define DEBUG_CURL_PRINT 1
 #else
-#define DPRINTF(fmt, ...) do { } while (0)
+#define DEBUG_CURL_PRINT 0
 #endif
+#define DPRINTF(fmt, ...)\
+do { \
+if (DEBUG_CURL_PRINT) {  \
+fprintf(stderr, fmt, ## __VA_ARGS__);\
+}\
+} while (0)
 
 #if LIBCURL_VERSION_NUM >= 0x071000
 /* The multi interface timer callback was introduced in 7.16.0 */
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 33e0a33..9023686 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct 
SheepdogInode *inode)
 
 #undef DPRINTF
 #ifdef DEBUG_SDOG
-#define DPRINTF(fmt, args...)   \
-do {\
-fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
-} while (0)
+#define DEBUG_SDOG_PRINT 1
 #else
-#define DPRINTF(fmt, args...)
+#define DEBUG_SDOG_PRINT 0
 #endif
+#define DPRINTF(fmt, args...)   \
+do {\
+if (DEBUG_SDOG_PRINT) { \
+fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
+}   \
+} while (0)
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
 
-- 
2.5.5






[Qemu-devel] [PATCH v3] block: always compile-check debug prints

2016-04-28 Thread Zhou Jie
Files with conditional debug statements should ensure that the printf is
always compiled.
This prevents bitrot of the format string of the debug statement.

Signed-off-by: Zhou Jie 
Reviewed-by: Eric Blake 
---
v1 -> v2:
 * Keep the user-visible witness as defined/undefined,
   and create a secondary witness as the actual conditional.
 * Switch debug output to stderr.

v2 -> v3:
 * Alter commit message to add "Reviewed-by ...".

 block/curl.c | 10 --
 block/sheepdog.c | 13 -
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index 5a8f8b6..da9f5e8 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -36,10 +36,16 @@
 // #define DEBUG_VERBOSE
 
 #ifdef DEBUG_CURL
-#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
+#define DEBUG_CURL_PRINT 1
 #else
-#define DPRINTF(fmt, ...) do { } while (0)
+#define DEBUG_CURL_PRINT 0
 #endif
+#define DPRINTF(fmt, ...)\
+do { \
+if (DEBUG_CURL_PRINT) {  \
+fprintf(stderr, fmt, ## __VA_ARGS__);\
+}\
+} while (0)
 
 #if LIBCURL_VERSION_NUM >= 0x071000
 /* The multi interface timer callback was introduced in 7.16.0 */
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 33e0a33..9023686 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct 
SheepdogInode *inode)
 
 #undef DPRINTF
 #ifdef DEBUG_SDOG
-#define DPRINTF(fmt, args...)   \
-do {\
-fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
-} while (0)
+#define DEBUG_SDOG_PRINT 1
 #else
-#define DPRINTF(fmt, args...)
+#define DEBUG_SDOG_PRINT 0
 #endif
+#define DPRINTF(fmt, args...)   \
+do {\
+if (DEBUG_SDOG_PRINT) { \
+fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
+}   \
+} while (0)
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
 
-- 
2.5.5






[Qemu-devel] [PATCH v2] block: always compile-check debug prints

2016-04-28 Thread Zhou Jie
Files with conditional debug statements should ensure that the printf is
always compiled.
This prevents bitrot of the format string of the debug statement.

Signed-off-by: Zhou Jie 
---
 block/curl.c | 10 --
 block/sheepdog.c | 13 -
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index 5a8f8b6..da9f5e8 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -36,10 +36,16 @@
 // #define DEBUG_VERBOSE
 
 #ifdef DEBUG_CURL
-#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
+#define DEBUG_CURL_PRINT 1
 #else
-#define DPRINTF(fmt, ...) do { } while (0)
+#define DEBUG_CURL_PRINT 0
 #endif
+#define DPRINTF(fmt, ...)\
+do { \
+if (DEBUG_CURL_PRINT) {  \
+fprintf(stderr, fmt, ## __VA_ARGS__);\
+}\
+} while (0)
 
 #if LIBCURL_VERSION_NUM >= 0x071000
 /* The multi interface timer callback was introduced in 7.16.0 */
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 33e0a33..9023686 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct 
SheepdogInode *inode)
 
 #undef DPRINTF
 #ifdef DEBUG_SDOG
-#define DPRINTF(fmt, args...)   \
-do {\
-fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
-} while (0)
+#define DEBUG_SDOG_PRINT 1
 #else
-#define DPRINTF(fmt, args...)
+#define DEBUG_SDOG_PRINT 0
 #endif
+#define DPRINTF(fmt, args...)   \
+do {\
+if (DEBUG_SDOG_PRINT) { \
+fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \
+}   \
+} while (0)
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
 
-- 
2.5.5






[Qemu-devel] [PATCH] block: always compile-check debug prints

2016-04-28 Thread Zhou Jie
Files with conditional debug statements should ensure that the printf is
always compiled.
This prevents bitrot of the format string of the debug statement.

Signed-off-by: Zhou Jie 
---
 block/curl.c | 13 +++--
 block/sheepdog.c | 13 ++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index 5a8f8b6..6655108 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -32,14 +32,15 @@
 #include 
 #include "qemu/cutils.h"
 
-// #define DEBUG_CURL
+#define DEBUG_CURL 0
 // #define DEBUG_VERBOSE
 
-#ifdef DEBUG_CURL
-#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) do { } while (0)
-#endif
+#define DPRINTF(fmt, ...)\
+do { \
+if (DEBUG_CURL) {\
+printf(fmt, ## __VA_ARGS__); \
+}\
+} while (0)
 
 #if LIBCURL_VERSION_NUM >= 0x071000
 /* The multi interface timer callback was introduced in 7.16.0 */
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 33e0a33..47cca74 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -293,14 +293,13 @@ static inline size_t count_data_objs(const struct 
SheepdogInode *inode)
 }
 
 #undef DPRINTF
-#ifdef DEBUG_SDOG
-#define DPRINTF(fmt, args...)   \
-do {\
-fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
+#define DEBUG_SDOG 0
+#define DPRINTF(fmt, args...)   \
+do {\
+if (DEBUG_SDOG) {   \
+fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
+}   \
 } while (0)
-#else
-#define DPRINTF(fmt, args...)
-#endif
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
 
-- 
2.5.5






Re: [Qemu-devel] [PATCH v2] hw/net/opencores_eth: Allocating Large sized arrays to heap

2016-04-26 Thread Zhou Jie



On 2016/4/27 10:46, Max Filippov wrote:

Hi Zhou,

On Wed, Apr 27, 2016 at 10:18:56AM +0800, Zhou Jie wrote:

When I committed another patch which named as
"hw/net/virtio-net: Allocating Large sized arrays to heap" .

Christian Borntraeger said that 16k is usually perfectly fine
for a userspace stack and doing allocations in a hot path
might actually hurt performance.

Although the size is 65536 bytes here,
I think open_eth_start_xmit is in a hot path.
So, it is OK, if you think that this patch should not be applied.


With Linux as guest OS we shouldn't see any allocations
as it doesn't send huge packets, so I think this patch is fine.
I can take it through the xtensa tree if you don't have other
plan.


OK, Thanks

Sincerely,
Zhou Jie






Re: [Qemu-devel] [PATCH v2] hw/net/opencores_eth: Allocating Large sized arrays to heap

2016-04-26 Thread Zhou Jie

Hi Max,

   When I committed another patch which named as
   "hw/net/virtio-net: Allocating Large sized arrays to heap" .

   Christian Borntraeger said that 16k is usually perfectly fine
   for a userspace stack and doing allocations in a hot path
   might actually hurt performance.

   Although the size is 65536 bytes here,
   I think open_eth_start_xmit is in a hot path.
   So, it is OK, if you think that this patch should not be applied.

Sincerely,
Zhou Jie

On 2016/4/27 10:07, Zhou Jie wrote:

open_eth_start_xmit has a huge stack usage of 65536 bytes approx.
Moving large arrays to heap to reduce stack usage.

Signed-off-by: Zhou Jie 
---
  hw/net/opencores_eth.c | 11 ++-
  1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/hw/net/opencores_eth.c b/hw/net/opencores_eth.c
index c6094fb..fa0a4e7 100644
--- a/hw/net/opencores_eth.c
+++ b/hw/net/opencores_eth.c
@@ -483,7 +483,8 @@ static NetClientInfo net_open_eth_info = {

  static void open_eth_start_xmit(OpenEthState *s, desc *tx)
  {
-uint8_t buf[65536];
+uint8_t *buf = NULL;
+uint8_t buffer[0x600];
  unsigned len = GET_FIELD(tx->len_flags, TXD_LEN);
  unsigned tx_len = len;

@@ -498,6 +499,11 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx)

  trace_open_eth_start_xmit(tx->buf_ptr, len, tx_len);

+if (tx_len > 0x600) {
+buf = g_new(uint8_t, tx_len);
+} else {
+buf = buffer;
+}
  if (len > tx_len) {
  len = tx_len;
  }
@@ -506,6 +512,9 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx)
  memset(buf + len, 0, tx_len - len);
  }
  qemu_send_packet(qemu_get_queue(s->nic), buf, tx_len);
+if (tx_len > 0x600) {
+g_free(buf);
+}

  if (tx->len_flags & TXD_WR) {
  s->tx_desc = 0;



--

周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2...@cn.fujitsu.com






[Qemu-devel] [PATCH v2] hw/net/opencores_eth: Allocating Large sized arrays to heap

2016-04-26 Thread Zhou Jie
open_eth_start_xmit has a huge stack usage of 65536 bytes approx.
Moving large arrays to heap to reduce stack usage.

Signed-off-by: Zhou Jie 
---
 hw/net/opencores_eth.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/hw/net/opencores_eth.c b/hw/net/opencores_eth.c
index c6094fb..fa0a4e7 100644
--- a/hw/net/opencores_eth.c
+++ b/hw/net/opencores_eth.c
@@ -483,7 +483,8 @@ static NetClientInfo net_open_eth_info = {
 
 static void open_eth_start_xmit(OpenEthState *s, desc *tx)
 {
-uint8_t buf[65536];
+uint8_t *buf = NULL;
+uint8_t buffer[0x600];
 unsigned len = GET_FIELD(tx->len_flags, TXD_LEN);
 unsigned tx_len = len;
 
@@ -498,6 +499,11 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx)
 
 trace_open_eth_start_xmit(tx->buf_ptr, len, tx_len);
 
+if (tx_len > 0x600) {
+buf = g_new(uint8_t, tx_len);
+} else {
+buf = buffer;
+}
 if (len > tx_len) {
 len = tx_len;
 }
@@ -506,6 +512,9 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx)
 memset(buf + len, 0, tx_len - len);
 }
 qemu_send_packet(qemu_get_queue(s->nic), buf, tx_len);
+if (tx_len > 0x600) {
+g_free(buf);
+}
 
 if (tx->len_flags & TXD_WR) {
 s->tx_desc = 0;
-- 
2.5.5






Re: [Qemu-devel] [PATCH] hw/net/opencores_eth: Allocating Large sized arrays to heap

2016-04-26 Thread Zhou Jie


On 2016/4/26 12:12, Max Filippov wrote:

Hi Zhou,

On Tue, Apr 26, 2016 at 6:35 AM, Zhou Jie  wrote:

open_eth_start_xmit has a huge stack usage of 65536 bytes approx.
Moving large arrays to heap to reduce stack usage.


It's an exception, not the rule when full 65536 byte long buffer might be
needed. Can we do a little better change and not allocate and free this
buffer every time unconditionally, but instead make buf smaller (1536
bytes, maximal frame length when HUGEN bit is not set in MODER)
and only do allocation when that's not enough?


Thank you for your suggestion.
I will modify this patch.

Sincerely,
Zhou Jie

--

周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2...@cn.fujitsu.com






Re: [Qemu-devel] [PATCH] hw/net/virtio-net: Allocating Large sized arrays to heap

2016-04-26 Thread Zhou Jie

On 2016/4/26 20:42, Michael S. Tsirkin wrote:

On Tue, Apr 26, 2016 at 04:05:24PM +0800, Zhou Jie wrote:

virtio_net_flush_tx has a huge stack usage of 16392 bytes approx.
Moving large arrays to heap to reduce stack usage.

Signed-off-by: Zhou Jie 


I don't think it's appropriate for trivial.
Also - what's the point, exactly?

I think functions should not have very large stack frames.
For 64bit machine it will be 32k.
But according what Christian Borntraeger said, it doesn't matter.
So, considerate that virtio_net_flush_tx is in a hot path,
I agree to not change this.

Sincerely,
Zhou Jie




---
  hw/net/virtio-net.c | 15 +++
  1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 5798f87..cab7bbc 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1213,6 +1213,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  VirtIONet *n = q->n;
  VirtIODevice *vdev = VIRTIO_DEVICE(n);
  VirtQueueElement *elem;
+struct iovec *sg = NULL, *sg2 = NULL;
  int32_t num_packets = 0;
  int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
  if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
@@ -1224,10 +1225,12 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  return num_packets;
  }

+sg = g_new(struct iovec, VIRTQUEUE_MAX_SIZE);
+sg2 = g_new(struct iovec, VIRTQUEUE_MAX_SIZE + 1);
  for (;;) {
  ssize_t ret;
  unsigned int out_num;
-struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], 
*out_sg;
+struct iovec *out_sg;
  struct virtio_net_hdr_mrg_rxbuf mhdr;

  elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
@@ -1252,7 +1255,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  virtio_net_hdr_swap(vdev, (void *) &mhdr);
  sg2[0].iov_base = &mhdr;
  sg2[0].iov_len = n->guest_hdr_len;
-out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1,
+out_num = iov_copy(&sg2[1], VIRTQUEUE_MAX_SIZE,
 out_sg, out_num,
 n->guest_hdr_len, -1);
  if (out_num == VIRTQUEUE_MAX_SIZE) {
@@ -1269,10 +1272,10 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
   */
  assert(n->host_hdr_len <= n->guest_hdr_len);
  if (n->host_hdr_len != n->guest_hdr_len) {
-unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
+unsigned sg_num = iov_copy(sg, VIRTQUEUE_MAX_SIZE,
 out_sg, out_num,
 0, n->host_hdr_len);
-sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
+sg_num += iov_copy(sg + sg_num, VIRTQUEUE_MAX_SIZE - sg_num,
   out_sg, out_num,
   n->guest_hdr_len, -1);
  out_num = sg_num;
@@ -1284,6 +1287,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  if (ret == 0) {
  virtio_queue_set_notification(q->tx_vq, 0);
  q->async_tx.elem = elem;
+g_free(sg);
+g_free(sg2);
  return -EBUSY;
  }

@@ -1296,6 +1301,8 @@ drop:
  break;
  }
  }
+g_free(sg);
+g_free(sg2);
  return num_packets;
  }

--
2.5.5





.



--

周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2...@cn.fujitsu.com






Re: [Qemu-devel] [PATCH] hw/net/virtio-net: Allocating Large sized arrays to heap

2016-04-26 Thread Zhou Jie

On 2016/4/26 16:49, Christian Borntraeger wrote:

On 04/26/2016 10:05 AM, Zhou Jie wrote:

virtio_net_flush_tx has a huge stack usage of 16392 bytes approx.
Moving large arrays to heap to reduce stack usage.

Signed-off-by: Zhou Jie 
---
  hw/net/virtio-net.c | 15 +++
  1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 5798f87..cab7bbc 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1213,6 +1213,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  VirtIONet *n = q->n;
  VirtIODevice *vdev = VIRTIO_DEVICE(n);
  VirtQueueElement *elem;
+struct iovec *sg = NULL, *sg2 = NULL;
  int32_t num_packets = 0;
  int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
  if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
@@ -1224,10 +1225,12 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  return num_packets;
  }

+sg = g_new(struct iovec, VIRTQUEUE_MAX_SIZE);
+sg2 = g_new(struct iovec, VIRTQUEUE_MAX_SIZE + 1);



As I said in another mail, 16k is usually perfectly fine for a userspace
stack and doing allocations in a hot path might actually hurt performance.

Unless we have a real problem (e.g. very long call stack on a small thread
stack) I would prefer to not change this.
Do you have seen a real problem due to this?

No, I don't.
I think functions should not have very large stack frames.
For 64bit machine it will be 32k.
But according what you said, it doesn't matter.
So, considerate that virtio_net_flush_tx is in a hot path,
I agree to not change this.

Sincerely,
Zhou Jie



  for (;;) {
  ssize_t ret;
  unsigned int out_num;
-struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], 
*out_sg;
+struct iovec *out_sg;
  struct virtio_net_hdr_mrg_rxbuf mhdr;

  elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
@@ -1252,7 +1255,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  virtio_net_hdr_swap(vdev, (void *) &mhdr);
  sg2[0].iov_base = &mhdr;
  sg2[0].iov_len = n->guest_hdr_len;
-out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1,
+out_num = iov_copy(&sg2[1], VIRTQUEUE_MAX_SIZE,
 out_sg, out_num,
 n->guest_hdr_len, -1);
  if (out_num == VIRTQUEUE_MAX_SIZE) {
@@ -1269,10 +1272,10 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
   */
  assert(n->host_hdr_len <= n->guest_hdr_len);
  if (n->host_hdr_len != n->guest_hdr_len) {
-unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
+unsigned sg_num = iov_copy(sg, VIRTQUEUE_MAX_SIZE,
 out_sg, out_num,
 0, n->host_hdr_len);
-sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
+sg_num += iov_copy(sg + sg_num, VIRTQUEUE_MAX_SIZE - sg_num,
   out_sg, out_num,
   n->guest_hdr_len, -1);
  out_num = sg_num;
@@ -1284,6 +1287,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  if (ret == 0) {
  virtio_queue_set_notification(q->tx_vq, 0);
  q->async_tx.elem = elem;
+g_free(sg);
+g_free(sg2);
  return -EBUSY;
  }

@@ -1296,6 +1301,8 @@ drop:
  break;
  }
  }
+g_free(sg);
+g_free(sg2);
  return num_packets;
  }





.



--

周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2...@cn.fujitsu.com






Re: [Qemu-devel] [PATCH] net/tap: Allocating Large sized arrays to heap

2016-04-26 Thread Zhou Jie

On 2016/4/26 15:45, Christian Borntraeger wrote:

On 04/26/2016 03:26 AM, Zhou Jie wrote:

net_init_tap has a huge stack usage of 8192 bytes approx.
Moving large arrays to heap to reduce stack usage.


I am wondering. Why is 8k a problem for a user space program?

For 64bit machine it will be 16k.


Please note that malloc/new like allocations are much more expensive
than stack allocation in terms of performance. This does not matter
here, but in your other patch that deals with the xmit function, I would
not be surprised if that actually harms performance.

OK. I will note it.

Sincerely,
Zhou Jie



Christian





Signed-off-by: Zhou Jie 
---
  net/tap.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/tap.c b/net/tap.c
index 740e8a2..49817c7 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -769,8 +769,8 @@ int net_init_tap(const NetClientOptions *opts, const char 
*name,
  return -1;
  }
  } else if (tap->has_fds) {
-char *fds[MAX_TAP_QUEUES];
-char *vhost_fds[MAX_TAP_QUEUES];
+char **fds = g_new(char *, MAX_TAP_QUEUES);
+char **vhost_fds = g_new(char *, MAX_TAP_QUEUES);
  int nfds, nvhosts;

  if (tap->has_ifname || tap->has_script || tap->has_downscript ||
@@ -818,6 +818,8 @@ int net_init_tap(const NetClientOptions *opts, const char 
*name,
  return -1;
  }
  }
+g_free(fds);
+g_free(vhost_fds);
  } else if (tap->has_helper) {
  if (tap->has_ifname || tap->has_script || tap->has_downscript ||
  tap->has_vnet_hdr || tap->has_queues || tap->has_vhostfds) {








--

周潔
Dept 1
No. 6 Wenzhu Road,
Nanjing, 210012, China
TEL:+86+25-86630566-8557
FUJITSU INTERNAL:7998-8557
E-Mail:zhoujie2...@cn.fujitsu.com






[Qemu-devel] [PATCH] hw/net/virtio-net: Allocating Large sized arrays to heap

2016-04-26 Thread Zhou Jie
virtio_net_flush_tx has a huge stack usage of 16392 bytes approx.
Moving large arrays to heap to reduce stack usage.

Signed-off-by: Zhou Jie 
---
 hw/net/virtio-net.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 5798f87..cab7bbc 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1213,6 +1213,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 VirtIONet *n = q->n;
 VirtIODevice *vdev = VIRTIO_DEVICE(n);
 VirtQueueElement *elem;
+struct iovec *sg = NULL, *sg2 = NULL;
 int32_t num_packets = 0;
 int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
 if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
@@ -1224,10 +1225,12 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 return num_packets;
 }
 
+sg = g_new(struct iovec, VIRTQUEUE_MAX_SIZE);
+sg2 = g_new(struct iovec, VIRTQUEUE_MAX_SIZE + 1);
 for (;;) {
 ssize_t ret;
 unsigned int out_num;
-struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], 
*out_sg;
+struct iovec *out_sg;
 struct virtio_net_hdr_mrg_rxbuf mhdr;
 
 elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
@@ -1252,7 +1255,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 virtio_net_hdr_swap(vdev, (void *) &mhdr);
 sg2[0].iov_base = &mhdr;
 sg2[0].iov_len = n->guest_hdr_len;
-out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1,
+out_num = iov_copy(&sg2[1], VIRTQUEUE_MAX_SIZE,
out_sg, out_num,
n->guest_hdr_len, -1);
 if (out_num == VIRTQUEUE_MAX_SIZE) {
@@ -1269,10 +1272,10 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
  */
 assert(n->host_hdr_len <= n->guest_hdr_len);
 if (n->host_hdr_len != n->guest_hdr_len) {
-unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
+unsigned sg_num = iov_copy(sg, VIRTQUEUE_MAX_SIZE,
out_sg, out_num,
0, n->host_hdr_len);
-sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
+sg_num += iov_copy(sg + sg_num, VIRTQUEUE_MAX_SIZE - sg_num,
  out_sg, out_num,
  n->guest_hdr_len, -1);
 out_num = sg_num;
@@ -1284,6 +1287,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 if (ret == 0) {
 virtio_queue_set_notification(q->tx_vq, 0);
 q->async_tx.elem = elem;
+g_free(sg);
+g_free(sg2);
 return -EBUSY;
 }
 
@@ -1296,6 +1301,8 @@ drop:
 break;
 }
 }
+g_free(sg);
+g_free(sg2);
 return num_packets;
 }
 
-- 
2.5.5






[Qemu-devel] [PATCH] hw/arm/nseries: Allocating Large sized arrays to heap

2016-04-25 Thread Zhou Jie
n8x0_init has a huge stack usage of 65536 bytes approx.
Moving large arrays to heap to reduce stack usage.

Signed-off-by: Zhou Jie 
---
 hw/arm/nseries.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/arm/nseries.c b/hw/arm/nseries.c
index 5382505..c7068c0 100644
--- a/hw/arm/nseries.c
+++ b/hw/arm/nseries.c
@@ -1364,7 +1364,7 @@ static void n8x0_init(MachineState *machine,
 
 if (option_rom[0].name &&
 (machine->boot_order[0] == 'n' || !machine->kernel_filename)) {
-uint8_t nolo_tags[0x1];
+uint8_t *nolo_tags = g_new(uint8_t, 0x1);
 /* No, wait, better start at the ROM.  */
 s->mpu->cpu->env.regs[15] = OMAP2_Q2_BASE + 0x40;
 
@@ -1383,6 +1383,7 @@ static void n8x0_init(MachineState *machine,
 
 n800_setup_nolo_tags(nolo_tags);
 cpu_physical_memory_write(OMAP2_SRAM_BASE, nolo_tags, 0x1);
+g_free(nolo_tags);
 }
 }
 
-- 
2.5.5






[Qemu-devel] [PATCH] hw/net/opencores_eth: Allocating Large sized arrays to heap

2016-04-25 Thread Zhou Jie
open_eth_start_xmit has a huge stack usage of 65536 bytes approx.
Moving large arrays to heap to reduce stack usage.

Signed-off-by: Zhou Jie 
---
 hw/net/opencores_eth.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/net/opencores_eth.c b/hw/net/opencores_eth.c
index c6094fb..ba23922 100644
--- a/hw/net/opencores_eth.c
+++ b/hw/net/opencores_eth.c
@@ -483,7 +483,7 @@ static NetClientInfo net_open_eth_info = {
 
 static void open_eth_start_xmit(OpenEthState *s, desc *tx)
 {
-uint8_t buf[65536];
+uint8_t *buf = NULL;
 unsigned len = GET_FIELD(tx->len_flags, TXD_LEN);
 unsigned tx_len = len;
 
@@ -498,6 +498,7 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx)
 
 trace_open_eth_start_xmit(tx->buf_ptr, len, tx_len);
 
+buf = g_new(uint8_t, tx_len);
 if (len > tx_len) {
 len = tx_len;
 }
@@ -506,6 +507,7 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx)
 memset(buf + len, 0, tx_len - len);
 }
 qemu_send_packet(qemu_get_queue(s->nic), buf, tx_len);
+g_free(buf);
 
 if (tx->len_flags & TXD_WR) {
 s->tx_desc = 0;
-- 
2.5.5






[Qemu-devel] [PATCH] net/tap: Allocating Large sized arrays to heap

2016-04-25 Thread Zhou Jie
net_init_tap has a huge stack usage of 8192 bytes approx.
Moving large arrays to heap to reduce stack usage.

Signed-off-by: Zhou Jie 
---
 net/tap.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/tap.c b/net/tap.c
index 740e8a2..49817c7 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -769,8 +769,8 @@ int net_init_tap(const NetClientOptions *opts, const char 
*name,
 return -1;
 }
 } else if (tap->has_fds) {
-char *fds[MAX_TAP_QUEUES];
-char *vhost_fds[MAX_TAP_QUEUES];
+char **fds = g_new(char *, MAX_TAP_QUEUES);
+char **vhost_fds = g_new(char *, MAX_TAP_QUEUES);
 int nfds, nvhosts;
 
 if (tap->has_ifname || tap->has_script || tap->has_downscript ||
@@ -818,6 +818,8 @@ int net_init_tap(const NetClientOptions *opts, const char 
*name,
 return -1;
 }
 }
+g_free(fds);
+g_free(vhost_fds);
 } else if (tap->has_helper) {
 if (tap->has_ifname || tap->has_script || tap->has_downscript ||
 tap->has_vnet_hdr || tap->has_queues || tap->has_vhostfds) {
-- 
2.5.5






[Qemu-devel] [PATCH] Added negative check for get_image_size()

2016-04-24 Thread Zhou Jie
This patch adds check for negative return value from get_image_size(),
where it is missing. It avoids unnecessary two function calls.

Signed-off-by: Zhou Jie 
---
 hw/ppc/spapr.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index feaab08..ccea633 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1842,6 +1842,10 @@ static void ppc_spapr_init(MachineState *machine)
 exit(1);
 }
 spapr->rtas_size = get_image_size(filename);
+if (spapr->rtas_size < 0) {
+error_report("Could not get size of LPAR rtas '%s'", filename);
+exit(1);
+}
 spapr->rtas_blob = g_malloc(spapr->rtas_size);
 if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
 error_report("Could not load LPAR rtas '%s'", filename);
-- 
2.5.5