Re: [Qemu-devel] [PATCH v3] vfio: add AER process
ping On 2016/8/15 10:53, Zhou Jie wrote: ping On 2016/8/2 11:57, Zhou Jie wrote: During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- v2-v3: 1. Call init_completion() in vfio_pci_probe. 2. Call reinit_completion() in vfio_pci_aer_err_detected. 3. Remove unnecessary brackets. v1-v2: 1. Add aer process to vfio driver. drivers/vfio/pci/vfio_pci.c | 48 + drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index d624a52..4c246a1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; +if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || +cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { +int ret; +ret = wait_for_completion_interruptible( +&vdev->aer_error_completion); +if (ret) +return ret; +} + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; +info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; +if (vdev->aer_error_in_progress) +info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: +if (vdev->aer_error_in_progress && iswrite) { +int ret; +ret = wait_for_completion_interruptible( +&vdev->aer_error_completion); +if (ret) +return ret; +} return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1228,6 +1248,7 @@ static 
int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); +init_completion(&vdev->aer_error_completion); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { @@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); +vdev->aer_error_in_progress = true; +reinit_completion(&vdev->aer_error_completion); +vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | +VFIO_IRQ_SET_ACTION_TRIGGER, +vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ +struct vfio_pci_device *vdev; +struct vfio_device *device; + +device = vfio_device_get_from_dev(&pdev->dev); +if (device == NULL) +return; + +vdev = vfio_device_data(device); +if (vdev == NULL) { +vfio_device_put(device); +return; +} + +vdev->aer_error_in_progress = false; +complete_all(&vdev->aer_error_completion); + +vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, +.resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 2128de8..7430d92 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -91,6 +91,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; +boolaer_error_in_progress; +struct completionaer_error_completion; struct pci_saved_state*pci_saved_state; intrefcnt; struct eventfd_ctx*err_trigger; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 255a211..59b9cf6 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ 
-198,6 +198,8 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI(1 << 1)/* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */ #define VFIO_DEVICE_FLAG
Re: [Qemu-devel] [PATCH v3] vfio: add AER process
ping On 2016/8/2 11:57, Zhou Jie wrote: During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- v2-v3: 1. Call init_completion() in vfio_pci_probe. 2. Call reinit_completion() in vfio_pci_aer_err_detected. 3. Remove unnecessary brackets. v1-v2: 1. Add aer process to vfio driver. drivers/vfio/pci/vfio_pci.c | 48 + drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index d624a52..4c246a1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) + return ret; + } + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; + if (vdev->aer_error_in_progress) + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: + if (vdev->aer_error_in_progress && iswrite) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) + return ret; + } return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct 
pci_dev *pdev, const struct pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); + init_completion(&vdev->aer_error_completion); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { @@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); + vdev->aer_error_in_progress = true; + reinit_completion(&vdev->aer_error_completion); + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return; + } + + vdev->aer_error_in_progress = false; + complete_all(&vdev->aer_error_completion); + + vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, + .resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 2128de8..7430d92 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -91,6 +91,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; + boolaer_error_in_progress; + struct completion aer_error_completion; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; diff --git
[Qemu-devel] [PATCH v3] vfio: add AER process
During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- v2-v3: 1. Call init_completion() in vfio_pci_probe. 2. Call reinit_completion() in vfio_pci_aer_err_detected. 3. Remove unnecessary brackets. v1-v2: 1. Add aer process to vfio driver. drivers/vfio/pci/vfio_pci.c | 48 + drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index d624a52..4c246a1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) + return ret; + } + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; + if (vdev->aer_error_in_progress) + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: + if (vdev->aer_error_in_progress && iswrite) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) + return ret; + } return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct 
pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); + init_completion(&vdev->aer_error_completion); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { @@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); + vdev->aer_error_in_progress = true; + reinit_completion(&vdev->aer_error_completion); + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return; + } + + vdev->aer_error_in_progress = false; + complete_all(&vdev->aer_error_completion); + + vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, + .resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 2128de8..7430d92 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -91,6 +91,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; + boolaer_error_in_progress; + struct completion aer_error_completion; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; diff --git a/include/uapi/linux/vfio.h b/includ
Re: [Qemu-devel] [PATCH v2 2/2] vfio: add AER process
Hi, Alex Clearly this has only been tested for a single instance of an AER error event and resume per device. Are the things you're intending to block actually blocked for subsequent events? Note how complete_all() fills the done field to let all current and future waiters go through and nowhere is there a call to reinit_completion() to drain that path. Thanks, Alex Do you mean this condition? For device 1: error1 occurs error1 resumes error2 occurs error2 resumes error3 occurs error3 resumes In current code, I do complete_all() when error1 resumes. And this will unblock the device when error2 and error3 are still be processed. So walk me through how this works. On vfio_pci_open() we call init_completion(), which sets aer_error_completion.done equal to zero (BTW, a user can open the device file descriptor multiple times, so there's already a bug here). I will call init_completion() in vfio_pci_probe. Let's assume that an error occurs and the user stalls a single access on wait_for_completion_interruptible(). The bulk of this function happens here: static inline long __sched do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { DECLARE_WAITQUEUE(wait, current); __add_wait_queue_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = action(timeout); spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); if (!x->done) return timeout; } x->done--; return timeout ?: 1; } So it waits within that do{}while loop for a completion, interruption, or timeout. 
Then we have: void complete_all(struct completion *x) { unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; __wake_up_locked(&x->wait, TASK_NORMAL, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } So aer_error_completion.done gets incremented to let a couple billion completion waiters through... Show me how another call to wait_for_completion_interruptible() will ever block again within our lifetime when the actual wait of do_wait_for_common() is only entered when 'done' count is equal to zero. This seems to be why reinit_completion() exists, but it's not used here. Thanks, Alex I will call reinit_completion() in vfio_pci_aer_err_detected when an aer error is detected. Thank you very much. Sincerely ZhouJie
Re: [Qemu-devel] [PATCH v2 2/2] vfio: add AER process
Hi, Alex On 2016/7/30 1:12, Alex Williamson wrote: On Tue, 19 Jul 2016 15:32:43 +0800 Zhou Jie wrote: From: Chen Fan During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- drivers/vfio/pci/vfio_pci.c | 30 ++ drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 34 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d12b03..dd96b60 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data) return -ENODEV; mutex_lock(&driver_lock); + init_completion(&vdev->aer_error_completion); if (!vdev->refcnt) { ret = vfio_pci_enable(vdev); @@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) { + return ret; + } No brackets necessary. 
+ } + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; + if (vdev->aer_error_in_progress) + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: + if (vdev->aer_error_in_progress && iswrite) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) { + return ret; + } + } return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1226,6 +1249,10 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); + vdev->aer_error_in_progress = true; + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev) } mutex_lock(&vdev->igate); + + vdev->aer_error_in_progress = false; + complete_all(&vdev->aer_error_completion); if (vdev->resume_trigger) eventfd_signal(vdev->resume_trigger, 1); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 80d4ddd..2f151f5 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -84,6 +84,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; + boolaer_error_in_progress; + struct completion aer_error_completion; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 34ab138..276ce50 100644 --- a/include/uapi/linux/vfio.h +++ 
b/include/uapi/linux/vfio.h @@ -198,6 +198,8 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_AERPROCESS (1 << 4) /* support aer error progress */ +#define VFIO_DEVICE_FLAGS_INAERPROCESS (1 << 5)/* status in aer error progress */ __u32 num_regions;/* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; Clearly this has only been tested for a single instance of an AER error event and
Re: [Qemu-devel] [PATCH v2 1/2] vfio: resume notifier
Hi, Alex On 2016/7/30 1:11, Alex Williamson wrote: On Tue, 19 Jul 2016 15:52:45 +0800 Zhou Jie wrote: From: Chen Fan An empty commit log is unacceptable for all but the most trivial patches. There's also no sign-off on this patch. Sorry. I should note it. I also don't know why we need this since you previously found that for QEMU, ordering of error versus resume notifications is not guaranteed, which is why I thought we went with a status flag within the struct vfio_device_info. I'm not adding an interrupt to the user that has no users. This was not part of the most recent discussion we had about this, so I'm lost why this patch exists. Thanks, Alex I will remove the resume interrupt. Sincerely ZhouJie
Re: [Qemu-devel] [PATCH v2 0/2] vfio: add aer process
ping On 2016/7/19 16:13, Zhou Jie wrote: From: Chen Fan v1-v2: 1. Add aer process to vfio driver. Chen Fan (2): vfio : add aer process vfio : resume notifier drivers/vfio/pci/vfio_pci.c | 58 - drivers/vfio/pci/vfio_pci_intrs.c | 18 drivers/vfio/pci/vfio_pci_private.h | 3 ++ include/uapi/linux/vfio.h | 3 ++ 4 files changed, 81 insertions(+), 1 deletion(-)
[Qemu-devel] [PATCH v2 0/2] vfio: add aer process
From: Chen Fan v1-v2: 1. Add aer process to vfio driver. Chen Fan (2): vfio : add aer process vfio : resume notifier drivers/vfio/pci/vfio_pci.c | 58 - drivers/vfio/pci/vfio_pci_intrs.c | 18 drivers/vfio/pci/vfio_pci_private.h | 3 ++ include/uapi/linux/vfio.h | 3 ++ 4 files changed, 81 insertions(+), 1 deletion(-) -- 1.8.3.1
[Qemu-devel] [PATCH v2 1/2] vfio: resume notifier
From: Chen Fan --- drivers/vfio/pci/vfio_pci.c | 28 +++- drivers/vfio/pci/vfio_pci_intrs.c | 18 ++ drivers/vfio/pci/vfio_pci_private.h | 1 + include/uapi/linux/vfio.h | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 188b1ff..2d12b03 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -363,7 +363,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX || + irq_type == VFIO_PCI_RESUME_IRQ_INDEX) { if (pci_is_pcie(vdev->pdev)) return 1; } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { @@ -731,6 +732,7 @@ static long vfio_pci_ioctl(void *device_data, case VFIO_PCI_REQ_IRQ_INDEX: break; case VFIO_PCI_ERR_IRQ_INDEX: + case VFIO_PCI_RESUME_IRQ_INDEX: if (pci_is_pcie(vdev->pdev)) break; /* pass thru to return error */ @@ -1234,8 +1236,32 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return; + } + + mutex_lock(&vdev->igate); + if (vdev->resume_trigger) + eventfd_signal(vdev->resume_trigger, 1); + + mutex_unlock(&vdev->igate); + vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, + .resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 15ecfc9..3a01a62 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -617,6 +617,16 
@@ static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); } +static int vfio_pci_set_resume_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_RESUME_IRQ_INDEX) + return -EINVAL; + + return vfio_pci_set_ctx_trigger_single(&vdev->resume_trigger, flags, data); +} + static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) @@ -676,6 +686,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; + case VFIO_PCI_RESUME_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (pci_is_pcie(vdev->pdev)) + func = vfio_pci_set_resume_trigger; + break; + } + break; } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 016c14a..80d4ddd 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -88,6 +88,7 @@ struct vfio_pci_device { int refcnt; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *resume_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 255a211..34ab138 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -433,6 +433,7 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, + VFIO_PCI_RESUME_IRQ_INDEX, VFIO_PCI_NUM_IRQS }; -- 1.8.3.1
[Qemu-devel] [PATCH v9 10/11] vfio: Add waiting for host aer error progress
From: Chen Fan For supporting aer recovery, host and guest would run the same aer recovery code, that would do the secondary bus reset if the error is fatal, the aer recovery process: 1. error_detected 2. reset_link (if fatal) 3. slot_reset/mmio_enabled 4. resume It indicates that host will do secondary bus reset to reset the physical devices under bus in step 2, that would cause devices in D3 status in a short time. But in qemu, we register an error detected handler, that would be invoked as host broadcasts the error-detected event in step 1, in order to avoid guest do reset_link when host do reset_link simultaneously. it may cause fatal error. we poll the vfio_device_info to assure host reset completely. In qemu, the aer recovery process: 1. Detect support for aer error progress If host vfio driver does not support for aer error progress, directly fail to boot up VM as with aer enabled. 2. Immediately notify the VM on error detected. 3. Wait for host aer error progress Poll the vfio_device_info, If it is still in aer error progress after some timeout, we would abort the guest directed bus reset altogether and unplug of the device to prevent it from further interacting with the VM. 4. Reset bus. Signed-off-by: Chen Fan Signed-off-by: Zhou Jie --- hw/vfio/pci.c | 51 +- hw/vfio/pci.h | 1 + linux-headers/linux/vfio.h | 4 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0e42786..777245c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -35,6 +35,12 @@ #define MSIX_CAP_LENGTH 12 +/* + * Timeout for waiting host aer error process, it is 3 seconds. + * For hardware bus reset 3 seconds will be enough. 
+ */ +#define PCI_AER_PROCESS_TIMEOUT 300 + static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); @@ -1913,6 +1919,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) VFIOGroup *group; int ret, i, devfn, range_limit; +if (!(vdev->vbasedev.flags & VFIO_DEVICE_FLAGS_AERPROCESS)) { +error_setg(errp, "vfio: Cannot enable AER for device %s," + " host vfio driver does not support for" + " aer error progress", + vdev->vbasedev.name); +return; +} + ret = vfio_get_hot_reset_info(vdev, &info); if (ret) { error_setg(errp, "vfio: Cannot enable AER for device %s," @@ -2679,6 +2693,11 @@ static void vfio_err_notifier_handler(void *opaque) msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN : PCI_ERR_ROOT_CMD_NONFATAL_EN; +if (isfatal) { +PCIDevice *dev_0 = pci_get_function_0(dev); +VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0); +vdev_0->pci_aer_error_signaled = true; +} pcie_aer_msg(dev, &msg); return; } @@ -3163,6 +3182,19 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_bars_exit(vdev); } +static int vfio_aer_error_is_in_process(VFIOPCIDevice *vdev) +{ +struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; +int ret; + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_INFO, &dev_info); +if (ret) { +error_report("vfio: error getting device info: %m"); +return ret; +} +return dev_info.flags & VFIO_DEVICE_FLAGS_INAERPROCESS ? 
1 : 0; +} + static void vfio_pci_reset(DeviceState *dev) { PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev); @@ -3176,7 +3208,24 @@ static void vfio_pci_reset(DeviceState *dev) if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) & PCI_BRIDGE_CTL_BUS_RESET)) { if (pci_get_function_0(pdev) == pdev) { -vfio_pci_hot_reset(vdev, vdev->single_depend_dev); +if (!vdev->pci_aer_error_signaled) { +vfio_pci_hot_reset(vdev, vdev->single_depend_dev); +} else { +int i; +for (i = 0; i < 1000; i++) { +if (!vfio_aer_error_is_in_process(vdev)) { +break; +} +g_usleep(PCI_AER_PROCESS_TIMEOUT / 1000); +} + +if (i == 1000) { +qdev_unplug(&vdev->pdev.qdev, NULL); +} else { +vfio_pci_hot_reset(vdev, vdev->single_depend_dev); +} +vdev->pci_aer_error_signaled = false; +} } return; } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index ed14322..c9e0202 100644 --- a/hw/vfio/pci.h +++ b/hw/vf
[Qemu-devel] [PATCH v9 04/11] vfio: refine function vfio_pci_host_match
From: Chen Fan Signed-off-by: Chen Fan --- hw/vfio/pci.c | 21 + 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 11c895c..21fd801 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2060,14 +2060,27 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_intx_enable(vdev); } +static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr) +{ +if (strlen(name) != 12 || +sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain, + &addr->bus, &addr->slot, &addr->function) != 4) { +return -EINVAL; +} + +return 0; +} + static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) { -char tmp[13]; +PCIHostDeviceAddress tmp; -sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain, -addr->bus, addr->slot, addr->function); +if (vfio_pci_name_to_addr(name, &tmp)) { +return false; +} -return (strcmp(tmp, name) == 0); +return (tmp.domain == addr->domain && tmp.bus == addr->bus && +tmp.slot == addr->slot && tmp.function == addr->function); } static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) -- 1.8.3.1
[Qemu-devel] [PATCH v9 09/11] vfio-pci: pass the aer error to guest
From: Chen Fan when the vfio device encounters an uncorrectable error in host, the vfio_pci driver will signal the eventfd registered by this vfio device, resulting in the qemu eventfd handler getting invoked. this patch is to pass the error to guest and let the guest driver recover from the error. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 60 +-- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0521652..0e42786 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2627,18 +2627,66 @@ static void vfio_put_device(VFIOPCIDevice *vdev) static void vfio_err_notifier_handler(void *opaque) { VFIOPCIDevice *vdev = opaque; +PCIDevice *dev = &vdev->pdev; +Error *local_err = NULL; +PCIEAERMsg msg = { +.severity = 0, +.source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn, +}; if (!event_notifier_test_and_clear(&vdev->err_notifier)) { return; } + +if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { +goto stop; +} + +/* + * in case the real hardware configuration has been changed, + * here we should recheck the bus reset capability. + */ +vfio_check_hot_bus_reset(vdev, &local_err); +if (local_err) { +error_report_err(local_err); +goto stop; +} + +/* + * we should read the error details from the real hardware + * configuration spaces, here we only need to do is signaling + * to guest an uncorrectable error has occurred. + */ +if (dev->exp.aer_cap) { +uint8_t *aer_cap = dev->config + dev->exp.aer_cap; +uint32_t uncor_status; +bool isfatal; + +uncor_status = vfio_pci_read_config(dev, + dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4); + +/* + * if the error is not emitted by this device, we can + * just ignore it. + */ +if (!(uncor_status & ~0UL)) { +return; +} + +isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER); + +msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN : + PCI_ERR_ROOT_CMD_NONFATAL_EN; + +pcie_aer_msg(dev, &msg); +return; +} + +stop: /* - * TBD. 
Retrieve the error details and decide what action - * needs to be taken. One of the actions could be to pass - * the error to the guest and have the guest driver recover - * from the error. This requires that PCIe capabilities be - * exposed to the guest. For now, we just terminate the - * guest to contain the error. + * If the aer capability is not exposed to the guest. we just + * terminate the guest to contain the error. */ error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name); -- 1.8.3.1
[Qemu-devel] [PATCH v9 11/11] vfio: add 'aer' property to expose aercap
From: Chen Fan add 'aer' property to let user able to decide whether expose the aer capability. by default we should disable aer feature, because it needs configuration restrictions. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 777245c..fdc27e6 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3305,6 +3305,8 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, sub_device_id, PCI_ANY_ID), DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0), +DEFINE_PROP_BIT("aer", VFIOPCIDevice, features, +VFIO_FEATURE_ENABLE_AER_BIT, false), /* * TODO - support passed fds... is this necessary? * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), -- 1.8.3.1
[Qemu-devel] [PATCH v9 03/11] vfio: add aer support for vfio device
From: Chen Fan Calling pcie_aer_init to initilize aer related registers for vfio device, then reload physical related registers to expose device capability. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 75 ++- hw/vfio/pci.h | 3 +++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 6a6160b..11c895c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1854,6 +1854,66 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver, + int pos, uint16_t size) +{ +PCIDevice *pdev = &vdev->pdev; +PCIDevice *dev_iter; +uint8_t type; +uint32_t errcap; + +if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { +pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR, +cap_ver, pos, size); +return 0; +} + +dev_iter = pci_bridge_get_device(pdev->bus); +if (!dev_iter) { +goto error; +} + +while (dev_iter) { +if (!pci_is_express(dev_iter)) { +goto error; +} + +type = pcie_cap_get_type(dev_iter); +if ((type != PCI_EXP_TYPE_ROOT_PORT && + type != PCI_EXP_TYPE_UPSTREAM && + type != PCI_EXP_TYPE_DOWNSTREAM)) { +goto error; +} + +if (!dev_iter->exp.aer_cap) { +goto error; +} + +dev_iter = pci_bridge_get_device(dev_iter->bus); +} + +errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4); +/* + * The ability to record multiple headers is depending on + * the state of the Multiple Header Recording Capable bit and + * enabled by the Multiple Header Recording Enable bit. 
+ */ +if ((errcap & PCI_ERR_CAP_MHRC) && +(errcap & PCI_ERR_CAP_MHRE)) { +pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT; +} else { +pdev->exp.aer_log.log_max = 0; +} + +pcie_cap_deverr_init(pdev); +return pcie_aer_init(pdev, pos, size); + +error: +error_report("vfio: Unable to enable AER for device %s, parent bus " + "does not support AER signaling", vdev->vbasedev.name); +return -1; +} + static int vfio_add_ext_cap(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; @@ -1861,6 +1921,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) uint16_t cap_id, next, size; uint8_t cap_ver; uint8_t *config; +int ret = 0; /* Only add extended caps if we have them and the guest can see them */ if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) || @@ -1914,6 +1975,9 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) PCI_EXT_CAP_NEXT_MASK); switch (cap_id) { +case PCI_EXT_CAP_ID_ERR: +ret = vfio_setup_aer(vdev, cap_ver, next, size); +break; case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */ trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next); break; @@ -1921,6 +1985,9 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) pcie_add_capability(pdev, cap_id, cap_ver, next, size); } +if (ret) { +goto out; +} } /* Cleanup chain head ID if necessary */ @@ -1928,8 +1995,9 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); } +out: g_free(config); -return 0; +return ret; } static int vfio_add_capabilities(VFIOPCIDevice *vdev) @@ -2698,6 +2766,11 @@ static int vfio_initfn(PCIDevice *pdev) goto out_teardown; } +if ((vdev->features & VFIO_FEATURE_ENABLE_AER) && +!pdev->exp.aer_cap) { +goto out_teardown; +} + if (vdev->vga) { vfio_vga_quirk_setup(vdev); } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 7d482d9..5483044 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -15,6 +15,7 @@ #include "qemu-common.h" #include "exec/memory.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bridge.h" 
#include "hw/vfio/vfio-common.h" #include "qemu/event_notifier.h" #include "qemu/queue.h" @@ -132,6 +133,8 @@ typedef struct VFIOPCIDevice { #define VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT 2 #define VFIO_FEATURE_ENABLE_IGD_OPREGION \ (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT) +#define VFIO_FEATURE_ENABLE_AER_BIT 3 +#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT) int32_t bootindex; uint32_t igd_gms; uint8_t pm_cap; -- 1.8.3.1
[Qemu-devel] [PATCH v9 05/11] vfio: add check whether host bus reset is supported or not
From: Chen Fan When assigning a vfio device with AER enabled, we must check whether the device supports a host bus reset (ie. hot reset) as this may be used by the guest OS in order to recover the device from an AER error. QEMU must therefore have the ability to perform a physical host bus reset using the existing vfio APIs in response to a virtual bus reset in the VM. A physical bus reset affects all of the devices on the host bus, therefore we place a few simplifying configuration restriction on the VM: - All physical devices affected by a bus reset must be assigned to the VM with AER enabled on each and be configured on the same virtual bus in the VM. - No devices unaffected by the bus reset, be they physical, emulated, or paravirtual may be configured on the same virtual bus as a device supporting AER signaling through vfio. In other words users wishing to enable AER on a multifunction device need to assign all functions of the device to the same virtual bus and enable AER support for each device. The easiest way to accomplish this is to identity map the physical functions to virtual functions with multifunction enabled on the virtual device. 
Signed-off-by: Chen Fan --- hw/vfio/pci.c | 278 +- hw/vfio/pci.h | 1 + 2 files changed, 256 insertions(+), 23 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 21fd801..242c1e4 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1693,6 +1693,42 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) } } +static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr) +{ +if (strlen(name) != 12 || +sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain, + &addr->bus, &addr->slot, &addr->function) != 4) { +return -EINVAL; +} + +return 0; +} + +static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) +{ +PCIHostDeviceAddress tmp; + +if (vfio_pci_name_to_addr(name, &tmp)) { +return false; +} + +return (tmp.domain == addr->domain && tmp.bus == addr->bus && +tmp.slot == addr->slot && tmp.function == addr->function); +} + +static bool vfio_pci_host_match_slot(PCIHostDeviceAddress *addr, + const char *name) +{ +PCIHostDeviceAddress tmp; + +if (vfio_pci_name_to_addr(name, &tmp)) { +return false; +} + +return (tmp.domain == addr->domain && tmp.bus == addr->bus && +tmp.slot == addr->slot); +} + /* * return negative with errno, return 0 on success. * if success, the point of ret_info fill with the affected device reset info. 
@@ -1854,6 +1890,200 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static int vfio_device_range_limit(PCIBus *bus) +{ +PCIDevice *br; + +br = pci_bridge_get_device(bus); +if (!br || +!pci_is_express(br) || +!(br->exp.exp_cap) || +pcie_cap_is_arifwd_enabled(br)) { +return 255; +} + +return 8; +} + +static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) +{ +PCIBus *bus = vdev->pdev.bus; +struct vfio_pci_hot_reset_info *info = NULL; +struct vfio_pci_dependent_device *devices; +VFIOGroup *group; +int ret, i, devfn, range_limit; + +ret = vfio_get_hot_reset_info(vdev, &info); +if (ret) { +error_setg(errp, "vfio: Cannot enable AER for device %s," + " device does not support hot reset.", + vdev->vbasedev.name); +return; +} + +/* List all affected devices by bus reset */ +devices = &info->devices[0]; + +/* Verify that we have all the groups required */ +for (i = 0; i < info->count; i++) { +PCIHostDeviceAddress host; +VFIOPCIDevice *tmp; +VFIODevice *vbasedev_iter; +bool found = false; + +host.domain = devices[i].segment; +host.bus = devices[i].bus; +host.slot = PCI_SLOT(devices[i].devfn); +host.function = PCI_FUNC(devices[i].devfn); + +/* Skip the current device */ +if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { +continue; +} + +/* Ensure we own the group of the affected device */ +QLIST_FOREACH(group, &vfio_group_list, next) { +if (group->groupid == devices[i].group_id) { +break; +} +} + +if (!group) { +error_setg(errp, "vfio: Cannot enable AER for device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); +goto out; +} + +/* Ensure affected devices for reset on the same bus */ +QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { +if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { +continue; +} +
[Qemu-devel] [PATCH v9 08/11] vfio: vote function 0 to do the host bus reset when an AER error occurs
From: Chen Fan Due to all devices assigned to VM on the same way as host if enable aer, so we can easily do the hot reset by selecting the function #0 to do the hot reset. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 14 ++ hw/vfio/pci.h | 1 + 2 files changed, 15 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 8bcb26b..0521652 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1924,6 +1924,8 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) /* List all affected devices by bus reset */ devices = &info->devices[0]; +vdev->single_depend_dev = (info->count == 1); + /* Verify that we have all the groups required */ for (i = 0; i < info->count; i++) { PCIHostDeviceAddress host; @@ -3120,6 +3122,18 @@ static void vfio_pci_reset(DeviceState *dev) trace_vfio_pci_reset(vdev->vbasedev.name); +if (vdev->features & VFIO_FEATURE_ENABLE_AER) { +PCIDevice *br = pci_bridge_get_device(pdev->bus); + +if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) & + PCI_BRIDGE_CTL_BUS_RESET)) { +if (pci_get_function_0(pdev) == pdev) { +vfio_pci_hot_reset(vdev, vdev->single_depend_dev); +} +return; +} +} + vfio_pci_pre_reset(vdev); if (vdev->resetfn && !vdev->resetfn(vdev)) { diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index e2faffb..ed14322 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -147,6 +147,7 @@ typedef struct VFIOPCIDevice { bool no_kvm_intx; bool no_kvm_msi; bool no_kvm_msix; +bool single_depend_dev; } VFIOPCIDevice; uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); -- 1.8.3.1
[Qemu-devel] [PATCH v9 07/11] vfio: add AER functionality check for hotplugged devices
From: Chen Fan when function 0 is hot-added, we can check the vfio device whether support hot bus reset. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 242c1e4..8bcb26b 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3149,6 +3149,19 @@ post_reset: vfio_pci_post_reset(vdev); } +static void vfio_pci_is_valid(PCIDevice *dev, Error **errp) +{ +VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, dev); +Error *local_err = NULL; + +if (vdev->features & VFIO_FEATURE_ENABLE_AER) { +vfio_check_hot_bus_reset(vdev, &local_err); +if (local_err) { +error_propagate(errp, local_err); +} +} +} + static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); @@ -3206,6 +3219,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->init = vfio_initfn; pdc->exit = vfio_exitfn; +pdc->is_valid_func = vfio_pci_is_valid; pdc->config_read = vfio_pci_read_config; pdc->config_write = vfio_pci_write_config; pdc->is_express = 1; /* We might be */ -- 1.8.3.1
[Qemu-devel] [PATCH v9 00/11] vfio-pci: pass the aer error to guest
From: Chen Fan v8-v9: 1. Don't use resume notification. Host vfio driver will process aer. Poll the vfio_device_info to assure host reset completely. v7-v8: 1. Use bitmap to record error and resume notification. v6-v7: 1. Stall any access to the device until resume is signaled. Do guest directed bus reset after receive resume notification. If don't get the resume notification after some timeout unplug the device. 2. fix the patches 11/12 as Alex sugguestion. v5-v6: 1. register resume handler both qemu and kernel to ensure the reset in order. 2. fix the patches 6/12, 7/12 as Alex and MST sugguestion. v4-v5: 1. add back the common function 0 hotplug code in pci core. 2. fix a sporadic device stuck on D3 problem when doing aer recovery. 3. fix patches 5/12 ~ 9/12 as Alex sugguestion. v3-v4: 1. rebase patchset to fit latest master branch. 2. modifying patches 5/10 ~ 8/10 as Alex sugguestion(Thanks). v2-v3: 1. fix patch 4/9, 5/9 as Alex sugguestion. 2. patches 5/9 ~ 8/9 are made to force limiting that all vfio functions are combined in the same way as on the host. v1-v2: 1. limit all devices on same bus in guest are on same bus in host in patch For now, for vfio pci passthough devices, when qemu receives an error from host aer report, it just terminate the guest. But usually user want to know what error occurred. So these patches add aer capability support for vfio devices, and pass the error to guest. Then let guest driver to recover from the error. notes: this series patches enable aer support single/multi-function, for multi-function, require all the function of the slot assigned to VM and on the same slot. 
Chen Fan (11): vfio: extract vfio_get_hot_reset_info as a single function vfio: squeeze out vfio_pci_do_hot_reset for support bus reset vfio: add aer support for vfio device vfio: refine function vfio_pci_host_match vfio: add check host bus reset is support or not pci: add a pci_function_is_valid callback to check function if valid vfio: add check aer functionality for hotplug device vfio: vote the function 0 to do host bus reset when aer occurred vfio-pci: pass the aer error to guest vfio: Add waiting for host aer error progress vfio: add 'aer' property to expose aercap hw/pci/pci.c | 32 +++ hw/vfio/pci.c | 620 - hw/vfio/pci.h | 6 + include/hw/pci/pci.h | 1 + linux-headers/linux/vfio.h | 4 + 5 files changed, 597 insertions(+), 66 deletions(-) -- 1.8.3.1
[Qemu-devel] [PATCH v9 02/11] vfio: squeeze out vfio_pci_do_hot_reset to support bus reset
From: Chen Fan Squeeze out vfio_pci_do_hot_reset to do host bus reset when AER recovery. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 75 +++ 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c1c7d31..6a6160b 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1738,6 +1738,48 @@ error: return ret; } +static int vfio_pci_do_hot_reset(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info *info) +{ +VFIOGroup *group; +struct vfio_pci_hot_reset *reset; +int32_t *fds; +int ret, i, count; +struct vfio_pci_dependent_device *devices; + +/* Determine how many group fds need to be passed */ +count = 0; +devices = &info->devices[0]; +QLIST_FOREACH(group, &vfio_group_list, next) { +for (i = 0; i < info->count; i++) { +if (group->groupid == devices[i].group_id) { +count++; +break; +} +} +} + +reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); +reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); +fds = &reset->group_fds[0]; + +/* Fill in group fds */ +QLIST_FOREACH(group, &vfio_group_list, next) { +for (i = 0; i < info->count; i++) { +if (group->groupid == devices[i].group_id) { +fds[reset->count++] = group->fd; +break; +} +} +} + +/* Bus reset! */ +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); +g_free(reset); + +return ret; +} + static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) { PCIDevice *pdev = &vdev->pdev; @@ -1965,9 +2007,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) VFIOGroup *group; struct vfio_pci_hot_reset_info *info = NULL; struct vfio_pci_dependent_device *devices; -struct vfio_pci_hot_reset *reset; -int32_t *fds; -int ret, i, count; +int ret, i; bool multi = false; trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); @@ -2045,34 +2085,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) goto out_single; } -/* Determine how many group fds need to be passed */ -count = 0; -QLIST_FOREACH(group, &vfio_group_list, next) { -for (i = 0; i < info->count; i++) { -if (group->groupid == devices[i].group_id) { -count++; -break; -} -} -} - -reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); -reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); -fds = &reset->group_fds[0]; - -/* Fill in group fds */ -QLIST_FOREACH(group, &vfio_group_list, next) { -for (i = 0; i < info->count; i++) { -if (group->groupid == devices[i].group_id) { -fds[reset->count++] = group->fd; -break; -} -} -} - -/* Bus reset! */ -ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); -g_free(reset); +ret = vfio_pci_do_hot_reset(vdev, info); trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, ret ? "%m" : "Success"); -- 1.8.3.1
[Qemu-devel] [PATCH v9 06/11] pci: add a pci_function_is_valid callback to check whether a function is valid
From: Chen Fan PCI hotplug requires that function 0 is added last to close the slot. Since vfio supporting AER, we require that the VM bus contains the same set of devices as the host bus to support AER, we can perform an AER validation test whenever a function 0 in the VM is hot-added. Signed-off-by: Chen Fan --- hw/pci/pci.c | 32 include/hw/pci/pci.h | 1 + 2 files changed, 33 insertions(+) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 149994b..b292b42 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1940,6 +1940,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn) return bus->devices[devfn]; } +static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque) +{ +PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d); +Error **errp = opaque; + +if (*errp) { +return; +} + +if (!pc->is_valid_func) { +return; +} + +pc->is_valid_func(d, errp); +} + static void pci_qdev_realize(DeviceState *qdev, Error **errp) { PCIDevice *pci_dev = (PCIDevice *)qdev; @@ -1982,6 +1998,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) pci_qdev_unrealize(DEVICE(pci_dev), NULL); return; } + +/* + * If the function number is 0, indicate the closure of the slot. + * then we get the chance to check all functions on same device + * if valid. 
+ */ +if (DEVICE(pci_dev)->hotplugged && +pci_get_function_0(pci_dev) == pci_dev) { +pci_for_each_device(bus, pci_bus_num(bus), +pci_function_is_valid, &local_err); +if (local_err) { +error_propagate(errp, local_err); +pci_qdev_unrealize(DEVICE(pci_dev), NULL); +return; +} +} } static void pci_default_realize(PCIDevice *dev, Error **errp) diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 9ed1624..68b6848 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -198,6 +198,7 @@ typedef struct PCIDeviceClass { void (*realize)(PCIDevice *dev, Error **errp); int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */ +void (*is_valid_func)(PCIDevice *dev, Error **errp); PCIUnregisterFunc *exit; PCIConfigReadFunc *config_read; PCIConfigWriteFunc *config_write; -- 1.8.3.1
[Qemu-devel] [PATCH v9 01/11] vfio: extract vfio_get_hot_reset_info as a single function
From: Chen Fan The function is used to get affected devices by bus reset. So here extract it, and can used for aer soon. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 66 +++ 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 44783c5..c1c7d31 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1693,6 +1693,51 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) } } +/* + * return negative with errno, return 0 on success. + * if success, the point of ret_info fill with the affected device reset info. + * + */ +static int vfio_get_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **ret_info) +{ +struct vfio_pci_hot_reset_info *info; +int ret, count; + +*ret_info = NULL; + +info = g_malloc0(sizeof(*info)); +info->argsz = sizeof(*info); + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); +if (ret && errno != ENOSPC) { +ret = -errno; +goto error; +} + +count = info->count; + +info = g_realloc(info, sizeof(*info) + + (count * sizeof(struct vfio_pci_dependent_device))); +info->argsz = sizeof(*info) + + (count * sizeof(struct vfio_pci_dependent_device)); + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); +if (ret) { +ret = -errno; +error_report("vfio: hot reset info failed: %m"); +goto error; +} + +*ret_info = info; +info = NULL; + +return 0; +error: +g_free(info); +return ret; +} + static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) { PCIDevice *pdev = &vdev->pdev; @@ -1918,7 +1963,7 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) { VFIOGroup *group; -struct vfio_pci_hot_reset_info *info; +struct vfio_pci_hot_reset_info *info = NULL; struct vfio_pci_dependent_device *devices; struct vfio_pci_hot_reset *reset; int32_t *fds; @@ -1930,12 +1975,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) vfio_pci_pre_reset(vdev); 
vdev->vbasedev.needs_reset = false; -info = g_malloc0(sizeof(*info)); -info->argsz = sizeof(*info); - -ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); -if (ret && errno != ENOSPC) { -ret = -errno; +ret = vfio_get_hot_reset_info(vdev, &info); +if (ret) { if (!vdev->has_pm_reset) { error_report("vfio: Cannot reset device %s, " "no available reset mechanism.", vdev->vbasedev.name); @@ -1943,18 +1984,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) goto out_single; } -count = info->count; -info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); -info->argsz = sizeof(*info) + (count * sizeof(*devices)); devices = &info->devices[0]; - -ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); -if (ret) { -ret = -errno; -error_report("vfio: hot reset info failed: %m"); -goto out_single; -} - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); /* Verify that we have all the groups required */ -- 1.8.3.1
[Qemu-devel] [PATCH v2 2/2] vfio: add aer process
From: Chen Fan During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- drivers/vfio/pci/vfio_pci.c | 30 ++ drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 34 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d12b03..dd96b60 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data) return -ENODEV; mutex_lock(&driver_lock); + init_completion(&vdev->aer_error_completion); if (!vdev->refcnt) { ret = vfio_pci_enable(vdev); @@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) { + return ret; + } + } + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; + if (vdev->aer_error_in_progress) + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: + if (vdev->aer_error_in_progress && iswrite) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) { + return ret; + } + } return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1226,6 +1249,10 @@ static pci_ers_result_t 
vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); + vdev->aer_error_in_progress = true; + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev) } mutex_lock(&vdev->igate); + + vdev->aer_error_in_progress = false; + complete_all(&vdev->aer_error_completion); if (vdev->resume_trigger) eventfd_signal(vdev->resume_trigger, 1); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 80d4ddd..2f151f5 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -84,6 +84,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; + boolaer_error_in_progress; + struct completion aer_error_completion; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 34ab138..276ce50 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -198,6 +198,8 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI (1 << 1)/* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_AERPROCESS (1 << 4) /* support aer error progress */ +#define VFIO_DEVICE_FLAGS_INAERPROCESS (1 << 5)/* status in aer error progress */ __u32 num_regions;/* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; -- 1.8.3.1
[Qemu-devel] [PATCH v2 1/2] vfio: resume notifier
From: root --- drivers/vfio/pci/vfio_pci.c | 28 +++- drivers/vfio/pci/vfio_pci_intrs.c | 18 ++ drivers/vfio/pci/vfio_pci_private.h | 1 + include/uapi/linux/vfio.h | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 188b1ff..2d12b03 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -363,7 +363,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX || + irq_type == VFIO_PCI_RESUME_IRQ_INDEX) { if (pci_is_pcie(vdev->pdev)) return 1; } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { @@ -731,6 +732,7 @@ static long vfio_pci_ioctl(void *device_data, case VFIO_PCI_REQ_IRQ_INDEX: break; case VFIO_PCI_ERR_IRQ_INDEX: + case VFIO_PCI_RESUME_IRQ_INDEX: if (pci_is_pcie(vdev->pdev)) break; /* pass thru to return error */ @@ -1234,8 +1236,32 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return; + } + + mutex_lock(&vdev->igate); + if (vdev->resume_trigger) + eventfd_signal(vdev->resume_trigger, 1); + + mutex_unlock(&vdev->igate); + vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, + .resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 15ecfc9..3a01a62 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -617,6 +617,16 @@ 
static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); } +static int vfio_pci_set_resume_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_RESUME_IRQ_INDEX) + return -EINVAL; + + return vfio_pci_set_ctx_trigger_single(&vdev->resume_trigger, flags, data); +} + static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) @@ -676,6 +686,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; + case VFIO_PCI_RESUME_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (pci_is_pcie(vdev->pdev)) + func = vfio_pci_set_resume_trigger; + break; + } + break; } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 016c14a..80d4ddd 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -88,6 +88,7 @@ struct vfio_pci_device { int refcnt; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *resume_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 255a211..34ab138 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -433,6 +433,7 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, + VFIO_PCI_RESUME_IRQ_INDEX, VFIO_PCI_NUM_IRQS }; -- 1.8.3.1
[Qemu-devel] [PATCH v2 0/2] vfio: add aer process
From: Chen Fan v1-v2: 1. Add aer process to vfio driver. Chen Fan (1): vfio : add aer process root (1): vfio : resume notifier drivers/vfio/pci/vfio_pci.c | 58 - drivers/vfio/pci/vfio_pci_intrs.c | 18 drivers/vfio/pci/vfio_pci_private.h | 3 ++ include/uapi/linux/vfio.h | 3 ++ 4 files changed, 81 insertions(+), 1 deletion(-) -- 1.8.3.1
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, I will use workable state support flag to let user know whether the kenerl support block feature. And make configure space writing and ioctl function blocked. And what of my suggestion that a user may desire to poll the state of the device? I will also add a poll function to vfio_fops. Can you explain how this will work? I was only suggesting that one of the flag bits in vfio_device_info be allocated to report the current state of blocking and the user could poll by repeatedly calling the DEVICE_INFO ioctl. Are you thinking of using POLLOUT/POLLIN? I'm not sure if those are a perfect match since it's really only the PCI config region and a few ioctls where access is blocked, other operations may proceed normally. Sorry, I had misunderstood your meaning. I will use one of the flag bits in vfio_device_info to report the current state of blocking. The following code will be modified. 1. vfio_pci_ioctl add a flag in vfio_device_info for workable_state support 2. vfio_pci_ioctl During err occurs and resume: if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET) block for workable_state clearing 3. vfio_pci_write During err occurs and resume: block write to configure space 4. vfio_pci_aer_err_detected Set workable_state to false in "struct vfio_pci_device" teardown the interrupt 5. vfio_pci_aer_resume Set workable_state to true in "struct vfio_pci_device" 6. vfio_fops Add poll function I would still suggest that the name "workable_state" is quite vague. Something like aer_error_in_progress is much more specific. Thanks, OK, I will alter the name. Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, I will use workable state support flag to let user know whether the kenerl support block feature. And make configure space writing and ioctl function blocked. And what of my suggestion that a user may desire to poll the state of the device? I will also add a poll function to vfio_fops. A user does know what the vfio driver has done if you define the behavior that on an AER error reported event, as signaled to the user via the error notification interrupt, vfio-pci will teardown device interrupts to an uninitialized state. The difference between the command register approach you suggest and the teardown I suggest is that the command register is simply masking interrupt deliver while the teardown approach returns the device to an uninitialized interrupt state. Take a look at the device state when a bus reset occurs, what state is saved and restored and what is left at a default PCI value. The command register is saved and restored, so any manipulation we do of it is racing the host kernel AER handling and bus reset. What about MSI and MSI-X? Looks to me like those are left at the PCI default initialization state, so now after an AER error we have irq handlers and eventfds configured, while in fact the device has been de-programmed. To handle that we're expecting users to teardown the interrupt state and re-establish it? Again, why not just teardown the interrupt state ourselves? I dont' see the value in simply masking the command register, especially when it doesn't handle the no-DisINTx case. I understand. Thank you very much to explain this to me. I will teardown the interrupt state. We cannot depend on the behavior of any given driver and the fact that the guest driver may teardown interrupts anyway is not a justification that vfio shouldn't be doing this to make the device state presented to the user consistent. Thanks, I understand. The following code will be modified. 1. vfio_pci_ioctl add a flag in vfio_device_info for workable_state support 2. 
vfio_pci_ioctl During err occurs and resume: if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET) block for workable_state clearing 3. vfio_pci_write During err occurs and resume: block write to configure space 4. vfio_pci_aer_err_detected Set workable_state to false in "struct vfio_pci_device" teardown the interrupt 5. vfio_pci_aer_resume Set workable_state to true in "struct vfio_pci_device" 6. vfio_fops Add poll function Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, The variable clearly isn't visible to the user, so the user can know whether the kernel supports this feature, but not whether the feature is currently active. Perhaps there's no way to avoid races completely, but don't you expect that if we define that certain operations are blocked after an error notification that a user may want some way to poll for whether the block is active after a reset rather than simply calling a blocked interface to probe for it? Yes, I will use access blocked function, not the variable. I don't understand what this means. I will use workable state support flag to let user know whether the kenerl support block feature. And make configure space writing and ioctl function blocked. Is that acceptable? This is part of the problem I have with silently disabling interrupt delivery via the command register across reset. It seems more non-deterministic than properly disabling interrupts and requiring the user to reinitialize them after error. What is "properly disabling interrupts"? User don't know what vfio driver has done. What's the difference of "disabling interrupt delivery via the command register" and "doing a teardown of interrupts"? The interrupts will still lost silently. Or does it make more sense to simply disable the interrupts as done in vfio_pci_disable() and define that the user needs to re-establish interrupts before continuing after an error event? Thanks, If user invoked the vfio_pci_disable by ioctl function. I'm in no way suggesting that a user invoke vfio_pci_disable(), I'm just trying to point out that vfio_pci_disable() already does a teardown of interrupts, similar to what seems to be required here. Yes, user should re-establish interrupts before continuing after an error event. So if we define that users should re-establish interrupts after an error event, then what's the point of only doing command register masking of the interrupts and requiring the user to both tear-down the interrupts and re-establish them? 
Thanks, Take "Intel(R) 10GbE PCI Express Virtual Function" as an example. In drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c static const struct pci_error_handlers ixgbevf_err_handler = { .error_detected = ixgbevf_io_error_detected, .slot_reset = ixgbevf_io_slot_reset, .resume = ixgbevf_io_resume, }; The user tears down the interrupts in the ixgbevf_io_error_detected function, and brings up the interrupts in ixgbevf_io_resume. The guest OS driver will both tear down the interrupts and re-establish them, because it doesn't know what the host vfio driver has done. I disable the interrupts to prevent them from interfering with the device reset. Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, The variable clearly isn't visible to the user, so the user can know whether the kernel supports this feature, but not whether the feature is currently active. Perhaps there's no way to avoid races completely, but don't you expect that if we define that certain operations are blocked after an error notification that a user may want some way to poll for whether the block is active after a reset rather than simply calling a blocked interface to probe for it? Yes, I will use access blocked function, not the variable. As we've discussed before, the AER notification needs to be relayed to the user without delay, otherwise we only increase the gap where the user might consume bogus data. It also only seems reasonable to modify the behavior of the interfaces (ie. blocking) if the user is notified, which would be through the existing error notifier. We can never depend on a specific behavior from the user, we may be dealing with a malicious user. We already disable interrupts in vfio_pci_disable() simply by calling the ioctl function directly. Sorry, I want to know where is vfio_pci_disable invoked. I can't find it in ioctl function. If we simply disable and re-enable interrupts as you propose, how does the user deal with edge triggered interrupts that may have occurred during that period? Are they lost? Should we instead leave the interrupts enabled but skip eventfd_signal() in the interrupts handlers, queuing interrupts for re-delivery after the device is resumed? Yes, they will lost. Or does it make more sense to simply disable the interrupts as done in vfio_pci_disable() and define that the user needs to re-establish interrupts before continuing after an error event? Thanks, If user invoked the vfio_pci_disable by ioctl function. Yes, user should re-establish interrupts before continuing after an error event. Sincerely Zhoujie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, The following code will be modified. 1. vfio_pci_ioctl add a flag in vfio_device_info for workable_state support return workable_state in "struct vfio_pci_device" when user get info Seems like two flags are required, one to indicate the presence of this feature and another to indicate the state. I'd prefer something like access_blocked. User can get the state by state flag. And also maybe blocked by ioctl or write functons. User has choice to invoke which functions. Let's imagine there's one flag bit, there are two possible polarities, a) the bit is set when access is available, b) the bit is set when access is blocked. Let's examine a), if the bit is not set, does that means that access is not available or does that mean the kernel doesn't support that feature? There's no way to know. Fail. So we switch to b), an error occurs, the bit is not set, does that mean access is blocked or does that mean that the kernel we're using doesn't support the feature. Fail. If there's a way to do this with one bit, please explain it to me. Relying on a function to block, which may not be a valid assumption on the kernel we're using also fails. Userspace must be able to know, in a deterministic and backwards compatible way, the features of the kernel and the behavior to expect. Sorry, I didn't say clearly. There is one flag and one variable. 1) workable state support flag This flag let user know whether the kenerl support block feature. 2) workable state variable in "struct vfio_pci_device" This variable let user know whether the device is blocked. 2. vfio_pci_ioctl During err occurs and resume: if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET) block for workable_state clearing 3. vfio_pci_write During err occurs and resume: ignore and return 0 This is contradictory to your comment "Do nothing for bar regions". 
ISTR that returning 0 for read/write calls is an easy way to break users since we've return neither the desired bytes nor an error code. No, there is not change for read. Just return 0 for write. Return 0 mean that there is no byte has been written. Consider that the aer has occurred, it is better to not to write any thing to device. User can still read/write bar regions by mmap address, this may generate some date errors, but it doesn't matter as device is going to be reset. My statement still stands, you state "Do nothing for bar regions" and "return 0 for write". Those are contradictory and as I indicate, returning 0 is problematic for userspace. Additionally, making vfio_pci_write return zero while still allowing writes through the BAR mmap is inconsistent. I understand. I will block writing to configure space. And don't change for writing bar regions. 4. vfio_pci_aer_err_detected Set workable_state to false in "struct vfio_pci_device" Disable INTx: If Disable INTx is support disable by PCI_COMMAND else disable by disable_irq function Disable MSI: disable by clearing the "Bus Master Enable" bit of PCI_COMMAND I've suggested repeatedly to properly teardown these interrupts. I disagree with your proposed approach here. If the device is intended to be in a state that requires re-initialization then the interrupt setup should be part of that. I have traced the INTx functions. -vfio_pci_intx_unmask_handler -vfio_pci_intx_mask -vfio_intx_set_signal They are invoked by User. -vfio_pci_write -vfio_pci_ioctl During err occurs and resume above functions are blocked. So, User cann't set the INTx. I will disable the INTx in vfio_pci_aer_err_detected. And reset the INTx in vfio_pci_aer_resume according the original user setting(vdev->ctx[0].masked). Again, you're missing the point. If the device is expected to be reinitialized after resume, why don't we return the device to an initial state where interrupts are not configured? 
I think this presents inconsistent behavior to the user. About returning the device to an initial state where interrupts are not configured. Do you mean free_irq? The user calls request_irq via vfio_pci_ioctl. If we call free_irq, how can the user know it? Do you think the guest will reinitialize the device after resume, so it doesn't matter? Maybe we should not make assumptions about what the guest will do. What I want to do in the vfio driver is as follows. 1. aer occurs 2. Disable INTx and MSI 3. aer driver resets the device 4. Restore INTx and MSI 5. user processes the aer event Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, Due to weekend and holiday in my country, there were zero regular working hours between your emails. I wish you had a good time. The following code will be modified. 1. vfio_pci_ioctl add a flag in vfio_device_info for workable_state support return workable_state in "struct vfio_pci_device" when user get info Seems like two flags are required, one to indicate the presence of this feature and another to indicate the state. I'd prefer something like access_blocked. User can get the state by state flag. And also maybe blocked by ioctl or write functons. User has choice to invoke which functions. 2. vfio_pci_ioctl During err occurs and resume: if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET) block for workable_state clearing 3. vfio_pci_write During err occurs and resume: ignore and return 0 This is contradictory to your comment "Do nothing for bar regions". ISTR that returning 0 for read/write calls is an easy way to break users since we've return neither the desired bytes nor an error code. No, there is not change for read. Just return 0 for write. Return 0 mean that there is no byte has been written. Consider that the aer has occurred, it is better to not to write any thing to device. User can still read/write bar regions by mmap address, this may generate some date errors, but it doesn't matter as device is going to be reset. 4. vfio_pci_aer_err_detected Set workable_state to false in "struct vfio_pci_device" Disable INTx: If Disable INTx is support disable by PCI_COMMAND else disable by disable_irq function Disable MSI: disable by clearing the "Bus Master Enable" bit of PCI_COMMAND I've suggested repeatedly to properly teardown these interrupts. I disagree with your proposed approach here. If the device is intended to be in a state that requires re-initialization then the interrupt setup should be part of that. I have traced the INTx functions. 
-vfio_pci_intx_unmask_handler -vfio_pci_intx_mask -vfio_intx_set_signal They are invoked by User. -vfio_pci_write -vfio_pci_ioctl During err occurs and resume above functions are blocked. So, User cann't set the INTx. I will disable the INTx in vfio_pci_aer_err_detected. And reset the INTx in vfio_pci_aer_resume according the original user setting(vdev->ctx[0].masked). 5. vfio_pci_aer_resume Set workable_state to true in "struct vfio_pci_device" About INTx: According to the value of "vdev->ctx[0].masked" to decide whether to enable INTx About MSI: After reset the "Bus Master Enable" bit is default to 0. So user should process this after reset. Again, I think this is error prone, teardown the interrupts and define that the device state, including interrupts, needs to be reinitialized after error. Why are you not incorporating this feedback? Thanks, The reinitialization depend on user. For VFIO driver the process is following. 1. aer occurs 2. disable the following functions of the device write(except the bar regions), ioctl and interrupt 3. aer driver reset the device 4. renable the device for user 5. user process the aer event Maybe reset the device and reinitialization What I do is make sure the following points. 1. Host can reset the device between step 2 and 4. 2. The user settings is the same at step 1 and 5. Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
ping On 2016/7/3 12:00, Zhou Jie wrote: Hi Alex, On 2016/6/30 9:45, Zhou Jie wrote: Hi Alex, On 2016/6/30 2:22, Alex Williamson wrote: On Wed, 29 Jun 2016 16:54:05 +0800 Zhou Jie wrote: Hi Alex, And yet we have struct pci_dev.broken_intx_masking and we test for working DisINTx via pci_intx_mask_supported() rather than simply looking for a PCIe device. Some devices are broken and some simply don't follow the spec, so you're going to need to deal with that or exclude those devices. For those devices I have no way to disable the INTx. disable_irq()? Clearly vfio-pci already manages these types of devices and can disable INTx. This is why I keep suggesting that maybe tearing the interrupt setup down completely is a more complete and reliable approach than masking in the command register. Unless we're going to exclude such devices from supporting this mode (which I don't condone), we must deal with them. Thank you for tell me that. Yes, I can use disable_irq to disable the pci device irq. But should I enable the irq after reset? I will dig into it. I will alter the VFIO driver like following. During err occurs and resume: 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. The following code will be modified. 1. vfio_pci_ioctl add a flag in vfio_device_info for workable_state support return workable_state in "struct vfio_pci_device" when user get info 2. vfio_pci_ioctl During err occurs and resume: if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET) block for workable_state clearing 3. vfio_pci_write During err occurs and resume: ignore and return 0 4. vfio_pci_aer_err_detected Set workable_state to false in "struct vfio_pci_device" Disable INTx: If Disable INTx is support disable by PCI_COMMAND else disable by disable_irq function Disable MSI: disable by clearing the "Bus Master Enable" bit of PCI_COMMAND 5. 
vfio_pci_aer_resume Set workable_state to true in "struct vfio_pci_device" About INTx: According to the value of "vdev->ctx[0].masked" to decide whether to enable INTx About MSI: After reset the "Bus Master Enable" bit is default to 0. So user should process this after reset. Sincerely Zhou Jie -- 周潔 Dept 1 No. 6 Wenzhu Road, Nanjing, 210012, China TEL:+86+25-86630566-8557 FUJITSU INTERNAL:7998-8557 E-Mail:zhoujie2...@cn.fujitsu.com
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, On 2016/6/30 9:45, Zhou Jie wrote: Hi Alex, On 2016/6/30 2:22, Alex Williamson wrote: On Wed, 29 Jun 2016 16:54:05 +0800 Zhou Jie wrote: Hi Alex, And yet we have struct pci_dev.broken_intx_masking and we test for working DisINTx via pci_intx_mask_supported() rather than simply looking for a PCIe device. Some devices are broken and some simply don't follow the spec, so you're going to need to deal with that or exclude those devices. For those devices I have no way to disable the INTx. disable_irq()? Clearly vfio-pci already manages these types of devices and can disable INTx. This is why I keep suggesting that maybe tearing the interrupt setup down completely is a more complete and reliable approach than masking in the command register. Unless we're going to exclude such devices from supporting this mode (which I don't condone), we must deal with them. Thank you for tell me that. Yes, I can use disable_irq to disable the pci device irq. But should I enable the irq after reset? I will dig into it. I will alter the VFIO driver like following. During err occurs and resume: 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. The following code will be modified. 1. vfio_pci_ioctl add a flag in vfio_device_info for workable_state support return workable_state in "struct vfio_pci_device" when user get info 2. vfio_pci_ioctl During err occurs and resume: if (cmd == VFIO_DEVICE_SET_IRQS || VFIO_DEVICE_RESET || VFIO_DEVICE_GET_PCI_HOT_RESET_INFO || VFIO_DEVICE_PCI_HOT_RESET) block for workable_state clearing 3. vfio_pci_write During err occurs and resume: ignore and return 0 4. vfio_pci_aer_err_detected Set workable_state to false in "struct vfio_pci_device" Disable INTx: If Disable INTx is support disable by PCI_COMMAND else disable by disable_irq function Disable MSI: disable by clearing the "Bus Master Enable" bit of PCI_COMMAND 5. 
vfio_pci_aer_resume Set workable_state to true in "struct vfio_pci_device" About INTx: According to the value of "vdev->ctx[0].masked" to decide whether to enable INTx About MSI: After reset the "Bus Master Enable" bit is default to 0. So user should process this after reset. Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, On 2016/6/30 2:22, Alex Williamson wrote: On Wed, 29 Jun 2016 16:54:05 +0800 Zhou Jie wrote: Hi Alex, And yet we have struct pci_dev.broken_intx_masking and we test for working DisINTx via pci_intx_mask_supported() rather than simply looking for a PCIe device. Some devices are broken and some simply don't follow the spec, so you're going to need to deal with that or exclude those devices. For those devices I have no way to disable the INTx. disable_irq()? Clearly vfio-pci already manages these types of devices and can disable INTx. This is why I keep suggesting that maybe tearing the interrupt setup down completely is a more complete and reliable approach than masking in the command register. Unless we're going to exclude such devices from supporting this mode (which I don't condone), we must deal with them. Thank you for tell me that. Yes, I can use disable_irq to disable the pci device irq. But should I enable the irq after reset? I will dig into it. Sincerely Zhou Jie How does that happen, aren't we notifying the user at the point the error occurs, while the device is still in the process or being reset? My question is how does the user know that the host reset is complete in order to begin their own re-initialization? I will add a state in "struct vfio_pci_device". The state is set when the device can not work such as a aer error occured. And the state is clear when the device can work such as resume received. Return the state when user get info by vfio_pci_ioctl. The interrupt status will be cleared by hardware. So the hardware is the same as the state when the vfio device fd is opened. The PCI-core in Linux will save and restore the device state around reset, how do we know that vfio-pci itself is not racing that reset and whether PCI-core will restore the state including our interrupt masking or a state without it? Do we need to restore the state to the one we saved when we originally opened the device? 
Shouldn't that mean we teardown the interrupt setup the user had prior to the error event? For above you said. Maybe disable the interrupt is not a good idea. Think about what will happend in the interrupt handler. Maybe read/write configure space and region bar. I will make the configure space read only. Do nothing for region bar which used by userd. I'm thinking that vfio-pci will be attempting to mask the interrupts via the PCI command register, which is potentially in a state of flux due to the host reset and yet we're somehow expecting that our write to the command register sticks. We certainly have the ability to a) discard interrupts received between AER error and resume, and b) if we want to be consistent with requiring the user to reinitialize the device, then the user interrupt setup should likely be torn down. Thanks,
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, And yet we have struct pci_dev.broken_intx_masking and we test for working DisINTx via pci_intx_mask_supported() rather than simply looking for a PCIe device. Some devices are broken and some simply don't follow the spec, so you're going to need to deal with that or exclude those devices. For those devices I have no way to disable the INTx. How does that happen, aren't we notifying the user at the point the error occurs, while the device is still in the process or being reset? My question is how does the user know that the host reset is complete in order to begin their own re-initialization? I will add a state in "struct vfio_pci_device". The state is set when the device can not work such as a aer error occured. And the state is clear when the device can work such as resume received. Return the state when user get info by vfio_pci_ioctl. The interrupt status will be cleared by hardware. So the hardware is the same as the state when the vfio device fd is opened. The PCI-core in Linux will save and restore the device state around reset, how do we know that vfio-pci itself is not racing that reset and whether PCI-core will restore the state including our interrupt masking or a state without it? Do we need to restore the state to the one we saved when we originally opened the device? Shouldn't that mean we teardown the interrupt setup the user had prior to the error event? For above you said. Maybe disable the interrupt is not a good idea. Think about what will happend in the interrupt handler. Maybe read/write configure space and region bar. I will make the configure space read only. Do nothing for region bar which used by userd. How will the user know when the device is ready to be reset? Which of the ioctls that you're blocking can they poll w/o any unwanted side-effects or awkward interactions? Should flag bits in the device info ioctl indicate not only support for this behavior but also the current status? 
Thanks, I can block the reset ioctl and config write. I will not add a flag for the device's current status, because I don't depend on the user to prevent awkward interactions. Ok, so that's a reason to block rather than return -EAGAIN. Still we need some way to indicate to the user whether the device supports this new interaction rather than the existing behavior. Thanks, Because writing config space may happen in an interrupt handler, I think blocking is not a good choice. Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, On 2016/6/28 11:58, Alex Williamson wrote: On Tue, 28 Jun 2016 11:26:33 +0800 Zhou Jie wrote: Hi Alex, The INTx/MSI part needs further definition for the user. Are we actually completely tearing down interrupts with the expectation that the user will re-enable them or are we just masking them such that the user needs to unmask? Also note that not all devices support DisINTx. After reset, the "Bus Master Enable" bit of "Command Register" should be cleared, so MSI/MSI- X interrupt Messages is still disabled. After reset, the "Interrupt Disable" bit of "Command Register" should be cleared, so INTx interrupts is enabled. If the device doesn't support INTx, "Interrupt Disable" bit will hardware to 0, it is OK here. After fatal-error occurs, the user should reset the device and reinitialize the device. So I disable the interrupt before host reset the device, and let user to do the reinitialization. I'm dubious here. When DisINTx is not supported by the device or it's marked broken in host quirks, then we can't trust the device to stop sending INTx. It's hardwired to zero, meaning that it doesn't work or it's been found to be broken in other ways. So COMMAND register masking is not sufficient for all devices. For Endpoints that generate INTx interrupts, this bit is required. For Endpoints that do not generate IN Tx interrupts this bit is optional. If not implemented, this bit must be hardwired to 0b. For Root Ports, Switch Ports, and Bridges that generate INTx interrupts on their own behalf, this bit is required. The above is from "7.5.1.1." of "PCI Express Base Specification 3.1a". So I think "Interrupt Disable" bit must be supported by the device which can generate INTx interrupts. Also, any time we start changing the state of the device from what the user expects, we risk consistency problems. We need to consider how the user last saw the device and whether we can legitimately expect them to handle the device in a new state. 
If we expect the user to re-initialize the device then would it be more correct to teardown all interrupt signaling such that the device is effectively in the same state as initial handoff when the vfio device fd is opened? Before the user re-initialize the device, host has reseted the device. The interrupt status will be cleared by hardware. So the hardware is the same as the state when the vfio device fd is opened. How will the user know when the device is ready to be reset? Which of the ioctls that you're blocking can they poll w/o any unwanted side-effects or awkward interactions? Should flag bits in the device info ioctl indicate not only support for this behavior but also the current status? Thanks, I can block the reset ioctl and config write. I will not add flag for the device current status, because I don't depend on user to prevent awkward interactions. Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, The INTx/MSI part needs further definition for the user. Are we actually completely tearing down interrupts with the expectation that the user will re-enable them or are we just masking them such that the user needs to unmask? Also note that not all devices support DisINTx. After reset, the "Bus Master Enable" bit of the "Command Register" should be cleared, so MSI/MSI-X interrupt messages are still disabled. After reset, the "Interrupt Disable" bit of the "Command Register" should be cleared, so INTx interrupts are enabled. If the device doesn't support INTx, the "Interrupt Disable" bit will be hardwired to 0, so it is OK here. After a fatal error occurs, the user should reset the device and reinitialize the device. So I disable the interrupt before the host resets the device, and let the user do the reinitialization. Otherwise it seems like a reasonable approach, but I can't guarantee we won't find new issues along the way. For instance we'll need to test how -EAGAIN returns interact with existing QEMU and maybe decide whether there are cases that are better handled by doing an interruptible wait. Thanks, I will dig into it. Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, We should never depend on the guest driver to behave in a certain way, but we need to prioritize what that actually means. vfio in the kernel has a responsibility first and foremost to the host kernel. User owned devices cannot be allowed to exploit or interfere with the host regardless of user behavior. The next priority is correct operation for the user. When the host kernel is handling the AER event between the error and resume notifies, it doesn't have device specific drivers, it's manipulating the device as a generic PCI device. That makes me think that vfio should not allow the user to interact (interfere) with the device during that process and that such interference can be limited to standard PCI level interactions. That means config space, and things that operate on config space (like interrupt ioctls and resets). On the QEMU side, we've sent a notification that an error occurred, how the user and the guest respond to that is beyond the concern of vfio in the kernel. If the user/guest driver continues to interact with resources on the device, that's fine, but I think vfio in the kernel does need to prevent the user from interfering with the PCI state of the device for that brief window when we know the host kernel is operating on the device. Otherwise the results are unpredictable and therefore unsupportable. Does that make sense? Thanks, I understand. I want to alter the VFIO driver like following. During err occurs and resume: 1. Make config space read only. Ignore config space writing to prevent the user from interfering with the PCI state of the device. User can get the error infomation by reading the config space. 2. Disable INTx and MSI Write "Command Register" to disable INTx and MSI. 3. Do nothing for bar regions. Guest driver may access bar regions. It doesn't matter as device is going to be reset. The following code will be modified. 1. vfio_pci_ioctl add flag for aer support 2. 
vfio_pci_ioctl During err occurs and resume: if (cmd == VFIO_DEVICE_SET_IRQS) return EAGAIN if (cmd == VFIO_DEVICE_RESET) return EAGAIN if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) return EAGAIN if (cmd == VFIO_DEVICE_PCI_HOT_RESET) return EAGAIN 3. vfio_pci_write During err occurs and resume: block 4. vfio_pci_aer_err_detected Set aer state in "struct vfio_pci_device" Write "Command Register" to disable INTx and MSI. 5. vfio_pci_aer_resume Clear aer state in "struct vfio_pci_device" I don't need to enable INTx and MSI. The device will be initalized by guest driver. Sincerely Zhoujie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, On 2016/6/22 13:45, Zhou Jie wrote: Hi Alex, In vfio I have some questions. 1. How can I disable the access by mmap? We can disable all access to vfio fd by returning a EAGAIN error if user try to access it during the reset period until the host reset finished. But about the bar region which is maped by vfio_pci_mmap. How can I disable it in vfio driver? Even there is a way to do it, how about the complexity to recovery the mmap? That's exactly the "sticky point" I refer to above, you'd need to solve that problem. MST would probably still argue that we don't need to disable all those interfaces, a userspace driver can already do things like disable mmio space and then attempt to read from the mmio space of the device. You said we should not depend on user to protect the device be accessed during the reset period. So maybe the problem can be simplified to non-device specific interfaces, like config space access plus ioctls. I don't understand what's your mean. When a fatal aer error occurs the process is following. For host aer driver detect aer error -> vfio driver send aer error -> aer driver reset bus -> qemu report aer error For guest -> aer driver detect aer error -> aer driver reset bus -> device driver maybe disable device I am not sure if all the device driver disable device when a fatal aer error occurs. Should we depend on the guest device driver to protect the device be accessed during the reset period? Sincerely Zhoujie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, In vfio I have some questions. 1. How can I disable the access by mmap? We can disable all access to the vfio fd by returning an EAGAIN error if the user tries to access it during the reset period, until the host reset has finished. But what about the bar region which is mapped by vfio_pci_mmap? How can I disable it in the vfio driver? Even if there is a way to do it, how about the complexity of recovering the mmap? That's exactly the "sticky point" I refer to above, you'd need to solve that problem. MST would probably still argue that we don't need to disable all those interfaces, a userspace driver can already do things like disable mmio space and then attempt to read from the mmio space of the device. You said we should not depend on the user to protect the device from being accessed during the reset period. So maybe the problem can be simplified to non-device specific interfaces, like config space access plus ioctls. I don't understand what you mean. Sincerely Zhoujie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi Alex, Hi Alex, on kernel side, I think if we don't trust the user behaviors, we should disable the access of vfio-pci interface once vfio-pci driver got the error_detected, we should disable all access to vfio fd regardless whether the vfio-pci was assigned to a VM, we also can return a EAGAIN error if user try to access it during the reset period until the host reset finished. on qemu side, when we got a error_detect, we pass through the aer error to guest directly, ignore all access to vfio-pci during this time, when qemu need to do a hot reset, we can retry to get the info from the get info ioctl until we got the info that vfio-pci has been reset finished, then do the hot_reset ioctl if need, the kernel should ensure the ioctl become accessible after host reset completed. That sounds pretty thorough, the sticky point there is always disabling the device mmaps w/o a revoke interface. Do we invalidate the pfn range and setup a fault handler that blocks on access? I don't think we have a whole lot of options, either block or sigbus, but having such a mechanism might allow us to easily put a device in a "dead" state where the user can't touch it, which could be useful for other purposes too. QEMU would also need to timeout after some number of reset attempts and assume the device is not coming back. Plus we'd need a device flag to indicate this behavior. Thanks, Alex In vfio I have some questions. 1. How can I disable the access by mmap? We can disable all access to vfio fd by returning a EAGAIN error if user try to access it during the reset period until the host reset finished. But about the bar region which is maped by vfio_pci_mmap. How can I disable it in vfio driver? Even there is a way to do it, how about the complexity to recovery the mmap? In qemu I have following proposals. 1. Setup a fault handler that blocks on access of bar region. So the data transmission will be blocked. 2. Disable vfio_pci_write_config, but keep vfio_pci_read_config enabled. 
The VM can get the error information by reading configuration space. But operations that write the configuration space will be ignored. 3. Get VFIO device information instead of receiving resume notification. When I tested the non-fatal error, I found that sometimes QEMU receives the resume notification earlier than the error notification. The notification receiving time between different eventfds is not in the order of sending time. Sincerely Zhoujie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi, Alex I was really hoping to hear your opinion, or at least some further discussion of pros and cons rather than simply parroting back my idea. I understand. My current thinking is that a resume notifier to userspace is poorly defined, it's not clear what the user can and cannot do between an error notification and the resume notification. Yes, do nothing between that time is better. One approach to solve that might be that the kernel internally handles the resume notifications. Maybe that means blocking the ioctl (interruptible timeout) until the internal resume occurs, or maybe that means returning -EAGAIN. I don't think it is a good idea. The kernel give the error and resume notifications, it's enough. It's up to user to how to use them. Probably implementations of each need to be worked through to determine which is better. We don't want to add complexity to the kernel simply to make things easier for userspace, but we also don't want a poorly specified interface that is difficult for userspace to use correctly. Thanks, In qemu, the aer recovery process: 1. Detect support for resume notification If host vfio driver does not support for resume notification, directly fail to boot up VM as with aer enabled. 2. Immediately notify the VM on error detected. 3. Disable the device. Unmap the config space and bar region. 4. Delay the guest directed bus reset. 5. Wait for resume notification. If we don't get the resume notification from the host after some timeout, we would abort the guest directed bus reset altogether and unplug of the device to prevent it from further interacting with the VM. 6. After get the resume notification reset bus and enable the device. I think we only make sure the disabled device will not interact with the VM. Sincerely Zhou jie Alex . -- 周潔 Dept 1 No. 6 Wenzhu Road, Nanjing, 210012, China TEL:+86+25-86630566-8557 FUJITSU INTERNAL:7998-8557 E-Mail:zhoujie2...@cn.fujitsu.com
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
ping On 2016/6/12 10:38, Zhou Jie wrote: Hi, Alex It seems like we have a number of questions open in the thread with MST from the previous version, particularly whether we should actually drop the resume notifier and block the reset in the kernel. The concern being that it's not very well specified what we can and cannot do between the error interrupt and the resume interrupt. We'd probably need some other indicate of whether the host has this capability, perhaps a flag in vfio_device_info. Appreciate your opinions there. Thanks, Alex Have you had a conclusion with MST? How about the process like following? 1. Detect support for VFIO reset blocking. Maybe use vfio_device_info. 2. Immediately notify the VM on error detected. 3. Invoke ioctl(VFIO_DEVICE_PCI_HOT_RESET). If kernel is doing reset, then block in ioctl. Can I ignore the resume notifier? else Shall ioctl block until after receiving resume notification? Sincerely Zhou Jie -- 周潔 Dept 1 No. 6 Wenzhu Road, Nanjing, 210012, China TEL:+86+25-86630566-8557 FUJITSU INTERNAL:7998-8557 E-Mail:zhoujie2...@cn.fujitsu.com
Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking
Hi, Alex On 2016/6/9 23:39, Alexander Duyck wrote: On Thu, Jun 9, 2016 at 3:14 AM, Zhou Jie wrote: TO Alex TO Michael In your solution you add a emulate PCI bridge to act as a bridge between direct assigned devices and the host bridge. Do you mean put all direct assigned devices to one emulate PCI bridge? If yes, this maybe bring some problems. We are writing a patchset to support aer feature in qemu. When assigning a vfio device with AER enabled, we must check whether the device supports a host bus reset (ie. hot reset) as this may be used by the guest OS in order to recover the device from an AER error. QEMU must therefore have the ability to perform a physical host bus reset using the existing vfio APIs in response to a virtual bus reset in the VM. A physical bus reset affects all of the devices on the host bus. Therefore all physical devices affected by a bus reset must be configured on the same virtual bus in the VM. And no devices unaffected by the bus reset, be configured on the same virtual bus. http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg02989.html Sincerely, Zhou Jie That makes sense, but I don't think you have to worry much about this at this point at least on my side as this was mostly just theory and I haven't had a chance to put any of it into practice as of yet. My idea has been evolving on this for a while. One thought I had is that we may want to have something like an emulated IOMMU and if possible we would want to split it up over multiple domains just so we can be certain that the virtual interfaces and the physical ones existed in separate domains. In regards to your concerns perhaps what we could do is put each assigned device into its own domain to prevent them from affecting each other. To that end we could probably break things up so that each device effectively lives in its own PCIe slot in the emulated system. 
Then when we start a migration of the guest the assigned device domains would then have to be tracked for unmap and sync calls when the direction is from the device. I will keep your concerns in mind in the future when I get some time to look at exploring this solution further. - Alex I am thinking about the practice of migration of passthrough device. In your solution, you use a vendor specific configuration space to negotiate with guest. If you put each assigned device into its own domain, how can qemu negotiate with guest? Add the vendor specific configuration space to every pci bus which is assigned a passthrough device? Sincerely Zhou Jie
Re: [Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
Hi, Alex It seems like we have a number of questions open in the thread with MST from the previous version, particularly whether we should actually drop the resume notifier and block the reset in the kernel. The concern being that it's not very well specified what we can and cannot do between the error interrupt and the resume interrupt. We'd probably need some other indication of whether the host has this capability, perhaps a flag in vfio_device_info. Appreciate your opinions there. Thanks, Alex Have you had a conclusion with MST? How about a process like the following? 1. Detect support for VFIO reset blocking. Maybe use vfio_device_info. 2. Immediately notify the VM on error detected. 3. Invoke ioctl(VFIO_DEVICE_PCI_HOT_RESET). If kernel is doing reset, then block in ioctl. Can I ignore the resume notifier? else Shall ioctl block until after receiving resume notification? Sincerely Zhou Jie
Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking
TO Alex TO Michael In your solution you add a emulate PCI bridge to act as a bridge between direct assigned devices and the host bridge. Do you mean put all direct assigned devices to one emulate PCI bridge? If yes, this maybe bring some problems. We are writing a patchset to support aer feature in qemu. When assigning a vfio device with AER enabled, we must check whether the device supports a host bus reset (ie. hot reset) as this may be used by the guest OS in order to recover the device from an AER error. QEMU must therefore have the ability to perform a physical host bus reset using the existing vfio APIs in response to a virtual bus reset in the VM. A physical bus reset affects all of the devices on the host bus. Therefore all physical devices affected by a bus reset must be configured on the same virtual bus in the VM. And no devices unaffected by the bus reset, be configured on the same virtual bus. http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg02989.html Sincerely, Zhou Jie On 2016/6/7 0:04, Alex Duyck wrote: On Mon, Jun 6, 2016 at 2:18 AM, Zhou Jie wrote: Hi Alex, On 2016/1/6 0:18, Alexander Duyck wrote: On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin wrote: On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote: The two mechanisms referenced above would likely require coordination with QEMU and as such are open to discussion. I haven't attempted to address them as I am not sure there is a consensus as of yet. My personal preference would be to add a vendor-specific configuration block to the emulated pci-bridge interfaces created by QEMU that would allow us to essentially extend shpc to support guest live migration with pass-through devices. shpc? That is kind of what I was thinking. We basically need some mechanism to allow for the host to ask the device to quiesce. It has been proposed to possibly even look at something like an ACPI interface since I know ACPI is used by QEMU to manage hot-plug in the standard case. 
- Alex Start by using hot-unplug for this! Really use your patch guest side, and write host side to allow starting migration with the device, but defer completing it. Yeah, I'm fully on board with this idea, though I'm not really working on this right now since last I knew the folks on this thread from Intel were working on it. My patches were mostly meant to be a nudge in this direction so that we could get away from the driver specific code. I have seen your email about live migration. I conclude the idea you proposed as following. 1. Extend swiotlb to allow for a page dirtying functionality. 2. Use pci express capability to implement of a PCI bridge to act as a bridge between direct assigned devices and the host bridge. 3. Using APCI event or extend shpc driver to support device pause. Is it right? Will you implement the patchs for live migration? That is pretty much the heart of the proposal I had. I submitted an RFC as a proof-of-concept for item 1 in the hopes that someone else might try tackling items 2 and 3 but I haven't seen any updates since then. The trick is to find a way to make it so that item 1 doesn't slow down standard SWIOTLB when you are not migrating a VM. If nothing else we would probably just need to add a static key that we could default to false unless there is a PCI bridge indicating we are starting a migration. I haven't had time to really work on this though. In addition I am not that familiar with QEMU and the internals of live migration so pieces 2 and 3 would take me some additional time to work on. - Alex .
Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking
Hi Alex, On 2016/1/6 0:18, Alexander Duyck wrote: On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin wrote: On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote: The two mechanisms referenced above would likely require coordination with QEMU and as such are open to discussion. I haven't attempted to address them as I am not sure there is a consensus as of yet. My personal preference would be to add a vendor-specific configuration block to the emulated pci-bridge interfaces created by QEMU that would allow us to essentially extend shpc to support guest live migration with pass-through devices. shpc? That is kind of what I was thinking. We basically need some mechanism to allow for the host to ask the device to quiesce. It has been proposed to possibly even look at something like an ACPI interface since I know ACPI is used by QEMU to manage hot-plug in the standard case. - Alex Start by using hot-unplug for this! Really use your patch guest side, and write host side to allow starting migration with the device, but defer completing it. Yeah, I'm fully on board with this idea, though I'm not really working on this right now since last I knew the folks on this thread from Intel were working on it. My patches were mostly meant to be a nudge in this direction so that we could get away from the driver specific code. I have seen your email about live migration. I conclude the idea you proposed as following. 1. Extend swiotlb to allow for a page dirtying functionality. 2. Use pci express capability to implement of a PCI bridge to act as a bridge between direct assigned devices and the host bridge. 3. Using APCI event or extend shpc driver to support device pause. Is it right? Will you implement the patchs for live migration? 
Sincerely, Zhou Jie So 1.- host tells guest to start tracking memory writes 2.- guest acks 3.- migration starts 4.- most memory is migrated 5.- host tells guest to eject device 6.- guest acks 7.- stop vm and migrate rest of state Sounds about right. The only way this differs from what I see as the final solution for this is that instead of fully ejecting the device in step 5 the driver would instead pause the device and give the host something like 10 seconds to stop the VM and resume with the same device connected if it is available. We would probably also need to look at a solution that would force the device to be ejected or abort prior to starting the migration if it doesn't give us the ack in step 2. It will already be a win since hot unplug after migration starts and most memory has been migrated is better than hot unplug before migration starts. Right. Generally the longer the VF can be maintained as a part of the guest the longer the network performance is improved versus using a purely virtual interface. Then measure downtime and profile. Then we can look at ways to quiesce device faster which really means step 5 is replaced with "host tells guest to quiesce device and dirty (or just unmap!) all memory mapped for write by device". Step 5 will be the spot where we really need to start modifying drivers. Specifically we probably need to go through and clean-up things so that we can reduce as many of the delays in the driver suspend/resume path as possible. I suspect there is quite a bit that can be done there that would probably also improve boot and shutdown times since those are also impacted by the devices. - Alex .
[Qemu-devel] [PATCH v8 11/12] vfio: register aer resume notification handler for aer resume
From: Chen Fan For supporting aer recovery, host and guest would run the same aer recovery code, that would do the secondary bus reset if the error is fatal, the aer recovery process: 1. error_detected 2. reset_link (if fatal) 3. slot_reset/mmio_enabled 4. resume It indicates that host will do secondary bus reset to reset the physical devices under bus in step 2, that would cause devices in D3 status in a short time. But in qemu, we register an error detected handler, that would be invoked as host broadcasts the error-detected event in step 1, in order to avoid guest do reset_link when host do reset_link simultaneously. it may cause fatal error. we introduce a resmue notifier to assure host reset completely. In qemu, the aer recovery process: 1. Detect support for resume notification If host vfio driver does not support for resume notification, directly fail to boot up VM as with aer enabled. 2. Immediately notify the VM on error detected. 3. Delay the guest directed bus reset. 4. Wait for resume notification. If we don't get the resume notification from the host after some timeout, we would abort the guest directed bus reset altogether and unplug of the device to prevent it from further interacting with the VM. 5. After get the resume notification reset bus. Signed-off-by: Chen Fan Signed-off-by: Zhou Jie --- v7->v8 *Add a comment for why VFIO_RESET_TIMEOUT value is 1000. *change vfio_resume_cap to pci_aer_has_resume. *change vfio_resume_notifier_handler to vfio_aer_resume_notifier_handler. *change reset_timer to pci_aer_reset_blocked_timer. *Remove error_report for not supporting resume notification in vfio_populate_device function. *All error and resume tracking is done with atomic bitmap on function 0. *Remove stalling any access to the device until resume is signaled. Because guest OS need read configure space to know the reason of error. And it should up to guest OS to decide to stop access BAR region. *Still use timer to delay reset. 
Because vfio_aer_resume_notifier_handler cann't be invoked when vfio_pci_reset is blocked. hw/vfio/pci.c | 223 - hw/vfio/pci.h | 5 + linux-headers/linux/vfio.h | 1 + 3 files changed, 228 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 6877a3d..77d86d8 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -35,6 +35,12 @@ #include "trace.h" #define MSIX_CAP_LENGTH 12 +/* + * Timeout for waiting resume notification, it is 1 second. + * The resume notificaton will be sent after host aer error recovery. + * For hardware bus reset 1 second will be enough. + */ +#define VFIO_RESET_TIMEOUT 1000 static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); @@ -1937,6 +1943,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) VFIOGroup *group; int ret, i, devfn, range_limit; +if (!vdev->pci_aer_has_resume) { +error_setg(errp, "vfio: Cannot enable AER for device %s," + " host vfio driver does not support for" + " resume notification", + vdev->vbasedev.name); +return; +} + ret = vfio_get_hot_reset_info(vdev, &info); if (ret) { error_setg(errp, "vfio: Cannot enable AER for device %s," @@ -2594,6 +2608,17 @@ static int vfio_populate_device(VFIOPCIDevice *vdev) vbasedev->name); } +irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX; + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +if (ret) { +/* This can fail for an old kernel or legacy PCI dev */ +trace_vfio_populate_device_get_irq_info_failure(); +ret = 0; +} else if (irq_info.count == 1) { +vdev->pci_aer_has_resume = true; +} + error: return ret; } @@ -2606,6 +2631,72 @@ static void vfio_put_device(VFIOPCIDevice *vdev) vfio_put_base_device(&vdev->vbasedev); } +static void vfio_aer_error_set_signaled(VFIOPCIDevice *vdev) +{ +PCIDevice *dev = &vdev->pdev; +PCIDevice *dev_0 = pci_get_function_0(dev); +VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0); +unsigned int func = 
PCI_FUNC(dev->devfn); + +bitmap_set_atomic(vdev_0->pci_aer_error_signaled_bitmap, func, 1); +} + +static void vfio_aer_resume_set_signaled(VFIOPCIDevice *vdev) +{ +PCIDevice *dev = &vdev->pdev; +PCIDevice *dev_0 = pci_get_function_0(dev); +VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0); +unsigned int func = PCI_FUNC(dev->devfn); + +bitmap_set_atomic(vdev_0->pci_aer_resume_signaled_bitmap, func, 1); +} + +static void vfio_aer_error_clear_all_signaled(VFIOPCIDevice *vdev) +{ +PCIDevice *d
Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume
Hi, Alex >> Do we even need a timer? What if we simply spin? >> >> for (i = 0; i < 1000; i++) { >> if (vdev->pci_aer_resume_signaled) { >> break; >> } >> g_usleep(1000); >> } >> >> if (i == 1000) { >> /* We failed */ >> } else { >> /* Proceed with reset */ >> } >> >> Does QEMU have enough concurrency to do this? Thanks,On 2016/5/25 In my test, it can not work. vfio_aer_resume_notifier_handler is invoked after vfio_pci_reset. It is useless to wait in vfio_pci_reset. Sincerely, Zhou Jie 14:23, Zhou Jie wrote: Hi, Alex 3. Stall any access to the device until resume is signaled. The code below doesn't actually do this, mmaps are disabled, but that just means any device access will use the slow path through QEMU. That path is still fully enabled and so is config space access. I will modify the code and find other way to stall any access to the device. I find that to stall any access to the device, I need modify the following function. 1. vfio_region_read and vfio_region_write For stalling any access to the device bar region. 2. vfio_vga_read and vfio_vga_write For stalling any access to the vga device region. 3. vfio_pci_read_config and vfio_pci_write_config For stalling any access to the device config space. What will happen if I don't stall any access to the device? Sincerely, Zhou Jie
Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume
Hi, Alex 3. Stall any access to the device until resume is signaled. The code below doesn't actually do this, mmaps are disabled, but that just means any device access will use the slow path through QEMU. That path is still fully enabled and so is config space access. I will modify the code and find another way to stall any access to the device. I find that to stall any access to the device, I need to modify the following functions. 1. vfio_region_read and vfio_region_write For stalling any access to the device bar region. 2. vfio_vga_read and vfio_vga_write For stalling any access to the vga device region. 3. vfio_pci_read_config and vfio_pci_write_config For stalling any access to the device config space. What will happen if I don't stall any access to the device? Sincerely, Zhou Jie
Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
So I needn't disable access to the device between being notified that the error occurred and being notified to resume. This will make code simpler. Am I right? On 2016/5/24 18:49, Michael S. Tsirkin wrote: On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote: I think that means that if we want to switch from a simple halt-on-error to a mechanism for the guest to handle recovery, we need to disable access to the device between being notified that the error occurred and being notified to resume. But this isn't what happens on bare metal. Errors are reported asynchronously and host might access the device meanwhile. These accesses might or might not trigger more errors, but fundamentally this should not matter too much as device is going to be reset.
Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume
On 2016/5/19 10:18, Alex Williamson wrote: On Thu, 19 May 2016 09:49:00 +0800 Zhou Jie wrote: On 2016/5/19 2:26, Alex Williamson wrote: On Wed, 18 May 2016 11:31:09 +0800 Zhou Jie wrote: From: Chen Fan For supporting aer recovery, host and guest would run the same aer recovery code, that would do the secondary bus reset if the error is fatal, the aer recovery process: 1. error_detected 2. reset_link (if fatal) 3. slot_reset/mmio_enabled 4. resume It indicates that host will do secondary bus reset to reset the physical devices under bus in step 2, that would cause devices in D3 status in a short time. But in qemu, we register an error detected handler, that would be invoked as host broadcasts the error-detected event in step 1, in order to avoid guest do reset_link when host do reset_link simultaneously. it may cause fatal error. we introduce a resmue notifier to assure host reset completely. In qemu, the aer recovery process: 1. Detect support for resume notification If host vfio driver does not support for resume notification, directly fail to boot up VM as with aer enabled. 2. Immediately notify the VM on error detected. 3. Stall any access to the device until resume is signaled. The code below doesn't actually do this, mmaps are disabled, but that just means any device access will use the slow path through QEMU. That path is still fully enabled and so is config space access. I will modify the code and find other way to stall any access to the device. 4. Delay the guest directed bus reset. It's delayed, but not the way I expected. The guest goes on executing and then we do the reset at some point later? More comments below... 5. Wait for resume notification. If we don't get the resume notification from the host after some timeout, we would abort the guest directed bus reset altogether and unplug of the device to prevent it from further interacting with the VM. 6. After get the resume notification reset bus. 
Signed-off-by: Chen Fan Signed-off-by: Zhou Jie --- hw/vfio/pci.c | 182 - hw/vfio/pci.h | 5 ++ linux-headers/linux/vfio.h | 1 + 3 files changed, 186 insertions(+), 2 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 6877a3d..39a9a9f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -35,6 +35,7 @@ #include "trace.h" #define MSIX_CAP_LENGTH 12 +#define VFIO_RESET_TIMEOUT 1000 It deserves at least a comment as to why this value was chosen. OK. I will add a comment here. static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); @@ -1937,6 +1938,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) VFIOGroup *group; int ret, i, devfn, range_limit; +if (!vdev->vfio_resume_cap) { +error_setg(errp, "vfio: Cannot enable AER for device %s," + " host vfio driver does not support for" + " resume notification", + vdev->vbasedev.name); +return; +} + ret = vfio_get_hot_reset_info(vdev, &info); if (ret) { error_setg(errp, "vfio: Cannot enable AER for device %s," @@ -2594,6 +2603,23 @@ static int vfio_populate_device(VFIOPCIDevice *vdev) vbasedev->name); } +irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX; + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +if (ret) { +/* This can fail for an old kernel or legacy PCI dev */ +trace_vfio_populate_device_get_irq_info_failure(); +ret = 0; +} else if (irq_info.count == 1) { +vdev->vfio_resume_cap = true; "vfio" in the name is redundant, ERR_IRQ uses pci_aer, so a logical name might be pci_aer_has_resume. OK. +} else { +error_report("vfio: %s " + "Could not enable error recovery for the device," + " because host vfio driver does not support for" + " resume notification", + vbasedev->name); +} This error_report makes sense for ERR_IRQ because halt-on-AER is setup transparently, but I don't think it makes sense here. If the user has specified to enable AER then it should either work or they should get an error message. 
If they have not specified to enable AER, why does the user care if there's an inconsistency here? OK. I will delete the error report at here. + error: return ret; } @@ -2661,6 +2687,17 @@ static void vfio_err_notifier_handler(void *opaque) msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN : PCI_ERR_ROOT_CMD_NONFATAL_EN; +if (isfatal) { +PCIDevice *dev_0 = pci_get
Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume
On 2016/5/19 10:18, Alex Williamson wrote: On Thu, 19 May 2016 09:49:00 +0800 Zhou Jie wrote: On 2016/5/19 2:26, Alex Williamson wrote: On Wed, 18 May 2016 11:31:09 +0800 Zhou Jie wrote: From: Chen Fan For supporting aer recovery, host and guest would run the same aer recovery code, that would do the secondary bus reset if the error is fatal, the aer recovery process: 1. error_detected 2. reset_link (if fatal) 3. slot_reset/mmio_enabled 4. resume It indicates that host will do secondary bus reset to reset the physical devices under bus in step 2, that would cause devices in D3 status in a short time. But in qemu, we register an error detected handler, that would be invoked as host broadcasts the error-detected event in step 1, in order to avoid guest do reset_link when host do reset_link simultaneously. it may cause fatal error. we introduce a resmue notifier to assure host reset completely. In qemu, the aer recovery process: 1. Detect support for resume notification If host vfio driver does not support for resume notification, directly fail to boot up VM as with aer enabled. 2. Immediately notify the VM on error detected. 3. Stall any access to the device until resume is signaled. The code below doesn't actually do this, mmaps are disabled, but that just means any device access will use the slow path through QEMU. That path is still fully enabled and so is config space access. I will modify the code and find other way to stall any access to the device. 4. Delay the guest directed bus reset. It's delayed, but not the way I expected. The guest goes on executing and then we do the reset at some point later? More comments below... 5. Wait for resume notification. If we don't get the resume notification from the host after some timeout, we would abort the guest directed bus reset altogether and unplug of the device to prevent it from further interacting with the VM. 6. After get the resume notification reset bus. 
Signed-off-by: Chen Fan Signed-off-by: Zhou Jie --- hw/vfio/pci.c | 182 - hw/vfio/pci.h | 5 ++ linux-headers/linux/vfio.h | 1 + 3 files changed, 186 insertions(+), 2 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 6877a3d..39a9a9f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -35,6 +35,7 @@ #include "trace.h" #define MSIX_CAP_LENGTH 12 +#define VFIO_RESET_TIMEOUT 1000 It deserves at least a comment as to why this value was chosen. OK. I will add a comment here. static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); @@ -1937,6 +1938,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) VFIOGroup *group; int ret, i, devfn, range_limit; +if (!vdev->vfio_resume_cap) { +error_setg(errp, "vfio: Cannot enable AER for device %s," + " host vfio driver does not support for" + " resume notification", + vdev->vbasedev.name); +return; +} + ret = vfio_get_hot_reset_info(vdev, &info); if (ret) { error_setg(errp, "vfio: Cannot enable AER for device %s," @@ -2594,6 +2603,23 @@ static int vfio_populate_device(VFIOPCIDevice *vdev) vbasedev->name); } +irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX; + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +if (ret) { +/* This can fail for an old kernel or legacy PCI dev */ +trace_vfio_populate_device_get_irq_info_failure(); +ret = 0; +} else if (irq_info.count == 1) { +vdev->vfio_resume_cap = true; "vfio" in the name is redundant, ERR_IRQ uses pci_aer, so a logical name might be pci_aer_has_resume. OK. +} else { +error_report("vfio: %s " + "Could not enable error recovery for the device," + " because host vfio driver does not support for" + " resume notification", + vbasedev->name); +} This error_report makes sense for ERR_IRQ because halt-on-AER is setup transparently, but I don't think it makes sense here. If the user has specified to enable AER then it should either work or they should get an error message. 
If they have not specified to enable AER, why does the user care if there's an inconsistency here? OK. I will delete the error report at here. + error: return ret; } @@ -2661,6 +2687,17 @@ static void vfio_err_notifier_handler(void *opaque) msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN : PCI_ERR_ROOT_CMD_NONFATAL_EN; +if (isfatal) { +PCIDevice *dev_0 = pci_get
Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume
On 2016/5/19 2:26, Alex Williamson wrote: On Wed, 18 May 2016 11:31:09 +0800 Zhou Jie wrote: From: Chen Fan For supporting aer recovery, host and guest would run the same aer recovery code, that would do the secondary bus reset if the error is fatal, the aer recovery process: 1. error_detected 2. reset_link (if fatal) 3. slot_reset/mmio_enabled 4. resume It indicates that host will do secondary bus reset to reset the physical devices under bus in step 2, that would cause devices in D3 status in a short time. But in qemu, we register an error detected handler, that would be invoked as host broadcasts the error-detected event in step 1, in order to avoid guest do reset_link when host do reset_link simultaneously. it may cause fatal error. we introduce a resmue notifier to assure host reset completely. In qemu, the aer recovery process: 1. Detect support for resume notification If host vfio driver does not support for resume notification, directly fail to boot up VM as with aer enabled. 2. Immediately notify the VM on error detected. 3. Stall any access to the device until resume is signaled. The code below doesn't actually do this, mmaps are disabled, but that just means any device access will use the slow path through QEMU. That path is still fully enabled and so is config space access. I will modify the code and find other way to stall any access to the device. 4. Delay the guest directed bus reset. It's delayed, but not the way I expected. The guest goes on executing and then we do the reset at some point later? More comments below... 5. Wait for resume notification. If we don't get the resume notification from the host after some timeout, we would abort the guest directed bus reset altogether and unplug of the device to prevent it from further interacting with the VM. 6. After get the resume notification reset bus. 
Signed-off-by: Chen Fan Signed-off-by: Zhou Jie --- hw/vfio/pci.c | 182 - hw/vfio/pci.h | 5 ++ linux-headers/linux/vfio.h | 1 + 3 files changed, 186 insertions(+), 2 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 6877a3d..39a9a9f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -35,6 +35,7 @@ #include "trace.h" #define MSIX_CAP_LENGTH 12 +#define VFIO_RESET_TIMEOUT 1000 It deserves at least a comment as to why this value was chosen. OK. I will add a comment here. static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); @@ -1937,6 +1938,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) VFIOGroup *group; int ret, i, devfn, range_limit; +if (!vdev->vfio_resume_cap) { +error_setg(errp, "vfio: Cannot enable AER for device %s," + " host vfio driver does not support for" + " resume notification", + vdev->vbasedev.name); +return; +} + ret = vfio_get_hot_reset_info(vdev, &info); if (ret) { error_setg(errp, "vfio: Cannot enable AER for device %s," @@ -2594,6 +2603,23 @@ static int vfio_populate_device(VFIOPCIDevice *vdev) vbasedev->name); } +irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX; + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +if (ret) { +/* This can fail for an old kernel or legacy PCI dev */ +trace_vfio_populate_device_get_irq_info_failure(); +ret = 0; +} else if (irq_info.count == 1) { +vdev->vfio_resume_cap = true; "vfio" in the name is redundant, ERR_IRQ uses pci_aer, so a logical name might be pci_aer_has_resume. OK. +} else { +error_report("vfio: %s " + "Could not enable error recovery for the device," + " because host vfio driver does not support for" + " resume notification", + vbasedev->name); +} This error_report makes sense for ERR_IRQ because halt-on-AER is setup transparently, but I don't think it makes sense here. If the user has specified to enable AER then it should either work or they should get an error message. 
If they have not specified to enable AER, why does the user care if there's an inconsistency here? OK. I will delete the error report at here. + error: return ret; } @@ -2661,6 +2687,17 @@ static void vfio_err_notifier_handler(void *opaque) msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN : PCI_ERR_ROOT_CMD_NONFATAL_EN; +if (isfatal) { +PCIDevice *dev_0 = pci_get_function_0(dev); +vdev->vfio_resume_wait = true; +vdev->vfio_resume_arri
[Qemu-devel] [PATCH 10/12] vfio-pci: pass the aer error to guest
From: Chen Fan when the vfio device encounters an uncorrectable error in host, the vfio_pci driver will signal the eventfd registered by this vfio device, resulting in the qemu eventfd handler getting invoked. this patch is to pass the error to guest and let the guest driver recover from the error. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 60 +-- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 47fd407..6877a3d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2609,18 +2609,66 @@ static void vfio_put_device(VFIOPCIDevice *vdev) static void vfio_err_notifier_handler(void *opaque) { VFIOPCIDevice *vdev = opaque; +PCIDevice *dev = &vdev->pdev; +Error *local_err = NULL; +PCIEAERMsg msg = { +.severity = 0, +.source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn, +}; if (!event_notifier_test_and_clear(&vdev->err_notifier)) { return; } + +if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { +goto stop; +} + +/* + * in case the real hardware configuration has been changed, + * here we should recheck the bus reset capability. + */ +vfio_check_hot_bus_reset(vdev, &local_err); +if (local_err) { +error_report_err(local_err); +goto stop; +} + +/* + * we should read the error details from the real hardware + * configuration spaces, here we only need to do is signaling + * to guest an uncorrectable error has occurred. + */ +if (dev->exp.aer_cap) { +uint8_t *aer_cap = dev->config + dev->exp.aer_cap; +uint32_t uncor_status; +bool isfatal; + +uncor_status = vfio_pci_read_config(dev, + dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4); + +/* + * if the error is not emitted by this device, we can + * just ignore it. + */ +if (!(uncor_status & ~0UL)) { +return; +} + +isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER); + +msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN : + PCI_ERR_ROOT_CMD_NONFATAL_EN; + +pcie_aer_msg(dev, &msg); +return; +} + +stop: /* - * TBD. 
Retrieve the error details and decide what action - * needs to be taken. One of the actions could be to pass - * the error to the guest and have the guest driver recover - * from the error. This requires that PCIe capabilities be - * exposed to the guest. For now, we just terminate the - * guest to contain the error. + * If the aer capability is not exposed to the guest. we just + * terminate the guest to contain the error. */ error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name); -- 1.8.3.1
[Qemu-devel] [PATCH 09/12] vfio: vote the function 0 to do host bus reset when aer occurred
From: Chen Fan Due to all devices assigned to VM on the same way as host if enable aer, so we can easily do the hot reset by selecting the function #0 to do the hot reset. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 14 ++ hw/vfio/pci.h | 1 + 2 files changed, 15 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 109b11b..47fd407 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1948,6 +1948,8 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) /* List all affected devices by bus reset */ devices = &info->devices[0]; +vdev->single_depend_dev = (info->count == 1); + /* Verify that we have all the groups required */ for (i = 0; i < info->count; i++) { PCIHostDeviceAddress host; @@ -3058,6 +3060,18 @@ static void vfio_pci_reset(DeviceState *dev) trace_vfio_pci_reset(vdev->vbasedev.name); +if (vdev->features & VFIO_FEATURE_ENABLE_AER) { +PCIDevice *br = pci_bridge_get_device(pdev->bus); + +if ((pci_get_word(br->config + PCI_BRIDGE_CONTROL) & + PCI_BRIDGE_CTL_BUS_RESET)) { +if (pci_get_function_0(pdev) == pdev) { +vfio_pci_hot_reset(vdev, vdev->single_depend_dev); +} +return; +} +} + vfio_pci_pre_reset(vdev); if (vdev->resetfn && !vdev->resetfn(vdev)) { diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index db7c6d5..9fb0206 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -143,6 +143,7 @@ typedef struct VFIOPCIDevice { bool no_kvm_intx; bool no_kvm_msi; bool no_kvm_msix; +bool single_depend_dev; } VFIOPCIDevice; uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); -- 1.8.3.1
[Qemu-devel] [PATCH 04/12] vfio: add aer support for vfio device
From: Chen Fan Calling pcie_aer_init to initilize aer related registers for vfio device, then reload physical related registers to expose device capability. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 83 +-- hw/vfio/pci.h | 3 +++ 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f697853..0516d94 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1877,6 +1877,66 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static int vfio_setup_aer(VFIOPCIDevice *vdev, uint8_t cap_ver, + int pos, uint16_t size) +{ +PCIDevice *pdev = &vdev->pdev; +PCIDevice *dev_iter; +uint8_t type; +uint32_t errcap; + +if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { +pcie_add_capability(pdev, PCI_EXT_CAP_ID_ERR, +cap_ver, pos, size); +return 0; +} + +dev_iter = pci_bridge_get_device(pdev->bus); +if (!dev_iter) { +goto error; +} + +while (dev_iter) { +if (!pci_is_express(dev_iter)) { +goto error; +} + +type = pcie_cap_get_type(dev_iter); +if ((type != PCI_EXP_TYPE_ROOT_PORT && + type != PCI_EXP_TYPE_UPSTREAM && + type != PCI_EXP_TYPE_DOWNSTREAM)) { +goto error; +} + +if (!dev_iter->exp.aer_cap) { +goto error; +} + +dev_iter = pci_bridge_get_device(dev_iter->bus); +} + +errcap = vfio_pci_read_config(pdev, pos + PCI_ERR_CAP, 4); +/* + * The ability to record multiple headers is depending on + * the state of the Multiple Header Recording Capable bit and + * enabled by the Multiple Header Recording Enable bit. 
+ */ +if ((errcap & PCI_ERR_CAP_MHRC) && +(errcap & PCI_ERR_CAP_MHRE)) { +pdev->exp.aer_log.log_max = PCIE_AER_LOG_MAX_DEFAULT; +} else { +pdev->exp.aer_log.log_max = 0; +} + +pcie_cap_deverr_init(pdev); +return pcie_aer_init(pdev, pos, size); + +error: +error_report("vfio: Unable to enable AER for device %s, parent bus " + "does not support AER signaling", vdev->vbasedev.name); +return -1; +} + static int vfio_add_ext_cap(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; @@ -1884,6 +1944,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) uint16_t cap_id, next, size; uint8_t cap_ver; uint8_t *config; +int ret = 0; /* * pcie_add_capability always inserts the new capability at the tail @@ -1907,7 +1968,19 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) */ size = vfio_ext_cap_max_size(config, next); -pcie_add_capability(pdev, cap_id, cap_ver, next, size); +switch (cap_id) { +case PCI_EXT_CAP_ID_ERR: +ret = vfio_setup_aer(vdev, cap_ver, next, size); +break; +default: +pcie_add_capability(pdev, cap_id, cap_ver, next, size); +break; +} + +if (ret) { +goto out; +} + pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0)); /* Use emulated next pointer to allow dropping extended caps */ @@ -1915,8 +1988,9 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) PCI_EXT_CAP_NEXT_MASK); } +out: g_free(config); -return 0; +return ret; } static int vfio_add_capabilities(VFIOPCIDevice *vdev) @@ -2673,6 +2747,11 @@ static int vfio_initfn(PCIDevice *pdev) goto out_teardown; } +if ((vdev->features & VFIO_FEATURE_ENABLE_AER) && +!pdev->exp.aer_cap) { +goto out_teardown; +} + /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 3976f68..7b3924e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -15,6 +15,7 @@ #include "qemu-common.h" #include "exec/memory.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bridge.h" #include 
"hw/vfio/vfio-common.h" #include "qemu/event_notifier.h" #include "qemu/queue.h" @@ -128,6 +129,8 @@ typedef struct VFIOPCIDevice { #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) #define VFIO_FEATURE_ENABLE_REQ_BIT 1 #define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT) +#define VFIO_FEATURE_ENABLE_AER_BIT 2 +#define VFIO_FEATURE_ENABLE_AER (1 << VFIO_FEATURE_ENABLE_AER_BIT) int32_t bootindex; uint8_t pm_cap; bool has_vga; -- 1.8.3.1
[Qemu-devel] [PATCH 12/12] vfio: add 'aer' property to expose aercap
From: Chen Fan add 'aer' property to let the user decide whether to expose the aer capability. By default we should disable the aer feature, because it needs configuration restrictions. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 39a9a9f..bf8fc95 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3369,6 +3369,8 @@ static Property vfio_pci_dev_properties[] = { sub_vendor_id, PCI_ANY_ID), DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, sub_device_id, PCI_ANY_ID), +DEFINE_PROP_BIT("aer", VFIOPCIDevice, features, +VFIO_FEATURE_ENABLE_AER_BIT, false), /* * TODO - support passed fds... is this necessary? * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), -- 1.8.3.1
[Qemu-devel] [PATCH 05/12] vfio: refine function vfio_pci_host_match
From: Chen Fan Signed-off-by: Chen Fan --- hw/vfio/pci.c | 21 + 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0516d94..5b23a86 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2060,14 +2060,27 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_intx_enable(vdev); } +static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr) +{ +if (strlen(name) != 12 || +sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain, + &addr->bus, &addr->slot, &addr->function) != 4) { +return -EINVAL; +} + +return 0; +} + static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) { -char tmp[13]; +PCIHostDeviceAddress tmp; -sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain, -addr->bus, addr->slot, addr->function); +if (vfio_pci_name_to_addr(name, &tmp)) { +return false; +} -return (strcmp(tmp, name) == 0); +return (tmp.domain == addr->domain && tmp.bus == addr->bus && +tmp.slot == addr->slot && tmp.function == addr->function); } static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) -- 1.8.3.1
[Qemu-devel] [PATCH 02/12] vfio: squeeze out vfio_pci_do_hot_reset for support bus reset
From: Chen Fan Squeeze out vfio_pci_do_hot_reset to do host bus reset when AER recovery. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 75 +++ 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index cf40f9e..1ad47ef 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1746,6 +1746,48 @@ error: return ret; } +static int vfio_pci_do_hot_reset(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info *info) +{ +VFIOGroup *group; +struct vfio_pci_hot_reset *reset; +int32_t *fds; +int ret, i, count; +struct vfio_pci_dependent_device *devices; + +/* Determine how many group fds need to be passed */ +count = 0; +devices = &info->devices[0]; +QLIST_FOREACH(group, &vfio_group_list, next) { +for (i = 0; i < info->count; i++) { +if (group->groupid == devices[i].group_id) { +count++; +break; +} +} +} + +reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); +reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); +fds = &reset->group_fds[0]; + +/* Fill in group fds */ +QLIST_FOREACH(group, &vfio_group_list, next) { +for (i = 0; i < info->count; i++) { +if (group->groupid == devices[i].group_id) { +fds[reset->count++] = group->fd; +break; +} +} +} + +/* Bus reset! */ +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); +g_free(reset); + +return ret; +} + static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) { PCIDevice *pdev = &vdev->pdev; @@ -1889,9 +1931,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) VFIOGroup *group; struct vfio_pci_hot_reset_info *info = NULL; struct vfio_pci_dependent_device *devices; -struct vfio_pci_hot_reset *reset; -int32_t *fds; -int ret, i, count; +int ret, i; bool multi = false; trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); @@ -1969,34 +2009,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) goto out_single; } -/* Determine how many group fds need to be passed */ -count = 0; -QLIST_FOREACH(group, &vfio_group_list, next) { -for (i = 0; i < info->count; i++) { -if (group->groupid == devices[i].group_id) { -count++; -break; -} -} -} - -reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); -reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); -fds = &reset->group_fds[0]; - -/* Fill in group fds */ -QLIST_FOREACH(group, &vfio_group_list, next) { -for (i = 0; i < info->count; i++) { -if (group->groupid == devices[i].group_id) { -fds[reset->count++] = group->fd; -break; -} -} -} - -/* Bus reset! */ -ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); -g_free(reset); +ret = vfio_pci_do_hot_reset(vdev, info); trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, ret ? "%m" : "Success"); -- 1.8.3.1
[Qemu-devel] [PATCH 03/12] vfio: add pcie extended capability support
From: Chen Fan For vfio pcie device, we could expose the extended capability on PCIE bus. due to add a new pcie capability at the tail of the chain, in order to avoid config space overwritten, we introduce a copy config for parsing extended caps. and rebuild the pcie extended config space. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 72 ++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 1ad47ef..f697853 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1528,6 +1528,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos) return next - pos; } + +static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos) +{ +uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE; + +for (tmp = PCI_CONFIG_SPACE_SIZE; tmp; +tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) { +if (tmp > pos && tmp < next) { +next = tmp; +} +} + +return next - pos; +} + static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask) { pci_set_word(buf, (pci_get_word(buf) & ~mask) | val); @@ -1862,16 +1877,71 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static int vfio_add_ext_cap(VFIOPCIDevice *vdev) +{ +PCIDevice *pdev = &vdev->pdev; +uint32_t header; +uint16_t cap_id, next, size; +uint8_t cap_ver; +uint8_t *config; + +/* + * pcie_add_capability always inserts the new capability at the tail + * of the chain. Therefore to end up with a chain that matches the + * physical device, we cache the config space to avoid overwriting + * the original config space when we parse the extended capabilities. 
+ */ +config = g_memdup(pdev->config, vdev->config_size); + +for (next = PCI_CONFIG_SPACE_SIZE; next; + next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) { +header = pci_get_long(config + next); +cap_id = PCI_EXT_CAP_ID(header); +cap_ver = PCI_EXT_CAP_VER(header); + +/* + * If it becomes important to configure extended capabilities to their + * actual size, use this as the default when it's something we don't + * recognize. Since QEMU doesn't actually handle many of the config + * accesses, exact size doesn't seem worthwhile. + */ +size = vfio_ext_cap_max_size(config, next); + +pcie_add_capability(pdev, cap_id, cap_ver, next, size); +pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0)); + +/* Use emulated next pointer to allow dropping extended caps */ +pci_long_test_and_set_mask(vdev->emulated_config_bits + next, + PCI_EXT_CAP_NEXT_MASK); +} + +g_free(config); +return 0; +} + static int vfio_add_capabilities(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; +int ret; if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) || !pdev->config[PCI_CAPABILITY_LIST]) { return 0; /* Nothing to add */ } -return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); +ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); +if (ret) { +return ret; +} + +/* on PCI bus, it doesn't make sense to expose extended capabilities. */ +if (!pci_is_express(pdev) || +!pci_bus_is_express(pdev->bus) || +!pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) { +return 0; +} + +return vfio_add_ext_cap(vdev); } static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) -- 1.8.3.1
[Qemu-devel] [PATCH 06/12] vfio: add check host bus reset is support or not
From: Chen Fan When assigning a vfio device with AER enabled, we must check whether the device supports a host bus reset (ie. hot reset) as this may be used by the guest OS in order to recover the device from an AER error. QEMU must therefore have the ability to perform a physical host bus reset using the existing vfio APIs in response to a virtual bus reset in the VM. A physical bus reset affects all of the devices on the host bus, therefore we place a few simplifying configuration restriction on the VM: - All physical devices affected by a bus reset must be assigned to the VM with AER enabled on each and be configured on the same virtual bus in the VM. - No devices unaffected by the bus reset, be they physical, emulated, or paravirtual may be configured on the same virtual bus as a device supporting AER signaling through vfio. In other words users wishing to enable AER on a multifunction device need to assign all functions of the device to the same virtual bus and enable AER support for each device. The easiest way to accomplish this is to identity map the physical functions to virtual functions with multifunction enabled on the virtual device. 
Signed-off-by: Chen Fan --- hw/vfio/pci.c | 279 +- hw/vfio/pci.h | 1 + 2 files changed, 257 insertions(+), 23 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 5b23a86..832f25e 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include #include #include @@ -1716,6 +1717,42 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) } } +static int vfio_pci_name_to_addr(const char *name, PCIHostDeviceAddress *addr) +{ +if (strlen(name) != 12 || +sscanf(name, "%04x:%02x:%02x.%1x", &addr->domain, + &addr->bus, &addr->slot, &addr->function) != 4) { +return -EINVAL; +} + +return 0; +} + +static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) +{ +PCIHostDeviceAddress tmp; + +if (vfio_pci_name_to_addr(name, &tmp)) { +return false; +} + +return (tmp.domain == addr->domain && tmp.bus == addr->bus && +tmp.slot == addr->slot && tmp.function == addr->function); +} + +static bool vfio_pci_host_match_slot(PCIHostDeviceAddress *addr, + const char *name) +{ +PCIHostDeviceAddress tmp; + +if (vfio_pci_name_to_addr(name, &tmp)) { +return false; +} + +return (tmp.domain == addr->domain && tmp.bus == addr->bus && +tmp.slot == addr->slot); +} + /* * return negative with errno, return 0 on success. * if success, the point of ret_info fill with the affected device reset info. 
@@ -1877,6 +1914,200 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static int vfio_device_range_limit(PCIBus *bus) +{ +PCIDevice *br; + +br = pci_bridge_get_device(bus); +if (!br || +!pci_is_express(br) || +!(br->exp.exp_cap) || +pcie_cap_is_arifwd_enabled(br)) { +return 255; +} + +return 8; +} + +static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) +{ +PCIBus *bus = vdev->pdev.bus; +struct vfio_pci_hot_reset_info *info = NULL; +struct vfio_pci_dependent_device *devices; +VFIOGroup *group; +int ret, i, devfn, range_limit; + +ret = vfio_get_hot_reset_info(vdev, &info); +if (ret) { +error_setg(errp, "vfio: Cannot enable AER for device %s," + " device does not support hot reset.", + vdev->vbasedev.name); +return; +} + +/* List all affected devices by bus reset */ +devices = &info->devices[0]; + +/* Verify that we have all the groups required */ +for (i = 0; i < info->count; i++) { +PCIHostDeviceAddress host; +VFIOPCIDevice *tmp; +VFIODevice *vbasedev_iter; +bool found = false; + +host.domain = devices[i].segment; +host.bus = devices[i].bus; +host.slot = PCI_SLOT(devices[i].devfn); +host.function = PCI_FUNC(devices[i].devfn); + +/* Skip the current device */ +if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { +continue; +} + +/* Ensure we own the group of the affected device */ +QLIST_FOREACH(group, &vfio_group_list, next) { +if (group->groupid == devices[i].group_id) { +break; +} +} + +if (!group) { +error_setg(errp, "vfio: Cannot enable AER for device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); +goto out; +} + +/* Ensure affected devices for reset on the same bus */ +QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { +
[Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume
From: Chen Fan For supporting aer recovery, host and guest would run the same aer recovery code, that would do the secondary bus reset if the error is fatal, the aer recovery process: 1. error_detected 2. reset_link (if fatal) 3. slot_reset/mmio_enabled 4. resume It indicates that host will do secondary bus reset to reset the physical devices under bus in step 2, that would cause devices in D3 status in a short time. But in qemu, we register an error detected handler, that would be invoked as host broadcasts the error-detected event in step 1; in order to avoid the guest doing reset_link when the host does reset_link simultaneously, which may cause a fatal error, we introduce a resume notifier to assure the host reset is complete. In qemu, the aer recovery process: 1. Detect support for resume notification If the host vfio driver does not support resume notification, directly fail to boot up the VM with aer enabled. 2. Immediately notify the VM on error detected. 3. Stall any access to the device until resume is signaled. 4. Delay the guest directed bus reset. 5. Wait for resume notification. If we don't get the resume notification from the host after some timeout, we would abort the guest directed bus reset altogether and unplug the device to prevent it from further interacting with the VM. 6. After getting the resume notification, reset the bus.
Signed-off-by: Chen Fan Signed-off-by: Zhou Jie --- hw/vfio/pci.c | 182 - hw/vfio/pci.h | 5 ++ linux-headers/linux/vfio.h | 1 + 3 files changed, 186 insertions(+), 2 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 6877a3d..39a9a9f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -35,6 +35,7 @@ #include "trace.h" #define MSIX_CAP_LENGTH 12 +#define VFIO_RESET_TIMEOUT 1000 static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); @@ -1937,6 +1938,14 @@ static void vfio_check_hot_bus_reset(VFIOPCIDevice *vdev, Error **errp) VFIOGroup *group; int ret, i, devfn, range_limit; +if (!vdev->vfio_resume_cap) { +error_setg(errp, "vfio: Cannot enable AER for device %s," + " host vfio driver does not support for" + " resume notification", + vdev->vbasedev.name); +return; +} + ret = vfio_get_hot_reset_info(vdev, &info); if (ret) { error_setg(errp, "vfio: Cannot enable AER for device %s," @@ -2594,6 +2603,23 @@ static int vfio_populate_device(VFIOPCIDevice *vdev) vbasedev->name); } +irq_info.index = VFIO_PCI_RESUME_IRQ_INDEX; + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); +if (ret) { +/* This can fail for an old kernel or legacy PCI dev */ +trace_vfio_populate_device_get_irq_info_failure(); +ret = 0; +} else if (irq_info.count == 1) { +vdev->vfio_resume_cap = true; +} else { +error_report("vfio: %s " + "Could not enable error recovery for the device," + " because host vfio driver does not support for" + " resume notification", + vbasedev->name); +} + error: return ret; } @@ -2661,6 +2687,17 @@ static void vfio_err_notifier_handler(void *opaque) msg.severity = isfatal ? 
PCI_ERR_ROOT_CMD_FATAL_EN : PCI_ERR_ROOT_CMD_NONFATAL_EN; +if (isfatal) { +PCIDevice *dev_0 = pci_get_function_0(dev); +vdev->vfio_resume_wait = true; +vdev->vfio_resume_arrived = false; +vfio_mmap_set_enabled(vdev, false); +if (dev_0 != dev) { +VFIOPCIDevice *vdev_0 = DO_UPCAST(VFIOPCIDevice, pdev, dev_0); +vdev_0->vfio_resume_wait = true; +vdev_0->vfio_resume_arrived = false; +} +} pcie_aer_msg(dev, &msg); return; } @@ -2756,6 +2793,96 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) event_notifier_cleanup(&vdev->err_notifier); } +static void vfio_resume_notifier_handler(void *opaque) +{ +VFIOPCIDevice *vdev = opaque; + +if (!event_notifier_test_and_clear(&vdev->resume_notifier)) { +return; +} + +vdev->vfio_resume_arrived = true; +if (vdev->reset_timer != NULL) { +timer_mod(vdev->reset_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL)); +} +} + +static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev) +{ +int ret; +int argsz; +struct vfio_irq_set *irq_set; +int32_t *pfd; + +if (!(vdev->features & VFIO_FEATURE_ENABLE_AER) || +!vdev-
[Qemu-devel] [PATCH 01/12] vfio: extract vfio_get_hot_reset_info as a single function
From: Chen Fan The function is used to get affected devices by bus reset. So here extract it, and can used for aer soon. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 66 +++ 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d091d8c..cf40f9e 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1701,6 +1701,51 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) } } +/* + * return negative with errno, return 0 on success. + * if success, the point of ret_info fill with the affected device reset info. + * + */ +static int vfio_get_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **ret_info) +{ +struct vfio_pci_hot_reset_info *info; +int ret, count; + +*ret_info = NULL; + +info = g_malloc0(sizeof(*info)); +info->argsz = sizeof(*info); + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); +if (ret && errno != ENOSPC) { +ret = -errno; +goto error; +} + +count = info->count; + +info = g_realloc(info, sizeof(*info) + + (count * sizeof(struct vfio_pci_dependent_device))); +info->argsz = sizeof(*info) + + (count * sizeof(struct vfio_pci_dependent_device)); + +ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); +if (ret) { +ret = -errno; +error_report("vfio: hot reset info failed: %m"); +goto error; +} + +*ret_info = info; +info = NULL; + +return 0; +error: +g_free(info); +return ret; +} + static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) { PCIDevice *pdev = &vdev->pdev; @@ -1842,7 +1887,7 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) { VFIOGroup *group; -struct vfio_pci_hot_reset_info *info; +struct vfio_pci_hot_reset_info *info = NULL; struct vfio_pci_dependent_device *devices; struct vfio_pci_hot_reset *reset; int32_t *fds; @@ -1854,12 +1899,8 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) vfio_pci_pre_reset(vdev); 
vdev->vbasedev.needs_reset = false; -info = g_malloc0(sizeof(*info)); -info->argsz = sizeof(*info); - -ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); -if (ret && errno != ENOSPC) { -ret = -errno; +ret = vfio_get_hot_reset_info(vdev, &info); +if (ret) { if (!vdev->has_pm_reset) { error_report("vfio: Cannot reset device %s, " "no available reset mechanism.", vdev->vbasedev.name); @@ -1867,18 +1908,7 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) goto out_single; } -count = info->count; -info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); -info->argsz = sizeof(*info) + (count * sizeof(*devices)); devices = &info->devices[0]; - -ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); -if (ret) { -ret = -errno; -error_report("vfio: hot reset info failed: %m"); -goto out_single; -} - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); /* Verify that we have all the groups required */ -- 1.8.3.1
[Qemu-devel] [PATCH 08/12] vfio: add check aer functionality for hotplug device
From: Chen Fan when function 0 is hot-added, we can check the vfio device whether support hot bus reset. Signed-off-by: Chen Fan --- hw/vfio/pci.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 832f25e..109b11b 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3087,6 +3087,19 @@ post_reset: vfio_pci_post_reset(vdev); } +static void vfio_pci_is_valid(PCIDevice *dev, Error **errp) +{ +VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, dev); +Error *local_err = NULL; + +if (vdev->features & VFIO_FEATURE_ENABLE_AER) { +vfio_check_hot_bus_reset(vdev, &local_err); +if (local_err) { +error_propagate(errp, local_err); +} +} +} + static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); @@ -3141,6 +3154,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->init = vfio_initfn; pdc->exit = vfio_exitfn; +pdc->is_valid_func = vfio_pci_is_valid; pdc->config_read = vfio_pci_read_config; pdc->config_write = vfio_pci_write_config; pdc->is_express = 1; /* We might be */ -- 1.8.3.1
[Qemu-devel] [PATCH v7 00/12] vfio-pci: pass the aer error to guest
From: Chen Fan v6-v7: 1. Stall any access to the device until resume is signaled. Do guest directed bus reset after receiving the resume notification. If we don't get the resume notification after some timeout, unplug the device. 2. fix the patches 11/12 as per Alex's suggestion. v5-v6: 1. register resume handler in both qemu and kernel to ensure the reset is in order. 2. fix the patches 6/12, 7/12 as per Alex's and MST's suggestions. v4-v5: 1. add back the common function 0 hotplug code in pci core. 2. fix a sporadic device stuck on D3 problem when doing aer recovery. 3. fix patches 5/12 ~ 9/12 as per Alex's suggestion. v3-v4: 1. rebase patchset to fit latest master branch. 2. modifying patches 5/10 ~ 8/10 as per Alex's suggestion(Thanks). v2-v3: 1. fix patch 4/9, 5/9 as per Alex's suggestion. 2. patches 5/9 ~ 8/9 are made to force limiting that all vfio functions are combined in the same way as on the host. v1-v2: 1. limit all devices on same bus in guest are on same bus in host in patch For now, for vfio pci passthrough devices, when qemu receives an error from a host aer report, it just terminates the guest. But usually users want to know what error occurred. So these patches add aer capability support for vfio devices, and pass the error to the guest. Then let the guest driver recover from the error. notes: this series of patches enables aer support for single/multi-function; for multi-function, require all the functions of the slot assigned to the VM and on the same slot.
Chen Fan (12): vfio: extract vfio_get_hot_reset_info as a single function vfio: squeeze out vfio_pci_do_hot_reset for support bus reset vfio: add pcie extended capability support vfio: add aer support for vfio device vfio: refine function vfio_pci_host_match vfio: add check host bus reset is support or not pci: add a pci_function_is_valid callback to check function if valid vfio: add check aer functionality for hotplug device vfio: vote the function 0 to do host bus reset when aer occurred vfio-pci: pass the aer error to guest vfio: register aer resume notification handler for aer resume vfio: add 'aer' property to expose aercap hw/pci/pci.c | 32 ++ hw/vfio/pci.c | 826 + hw/vfio/pci.h | 10 + include/hw/pci/pci.h | 1 + linux-headers/linux/vfio.h | 1 + 5 files changed, 804 insertions(+), 66 deletions(-) -- 1.8.3.1
[Qemu-devel] [PATCH 07/12] pci: add a pci_function_is_valid callback to check function if valid
From: Chen Fan PCI hotplug requires that function 0 is added last to close the slot. Since vfio supporting AER, we require that the VM bus contains the same set of devices as the host bus to support AER, we can perform an AER validation test whenever a function 0 in the VM is hot-added. Signed-off-by: Chen Fan --- hw/pci/pci.c | 32 include/hw/pci/pci.h | 1 + 2 files changed, 33 insertions(+) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index bb605ef..acd48eb 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1841,6 +1841,22 @@ PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn) return bus->devices[devfn]; } +static void pci_function_is_valid(PCIBus *bus, PCIDevice *d, void *opaque) +{ +PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(d); +Error **errp = opaque; + +if (*errp) { +return; +} + +if (!pc->is_valid_func) { +return; +} + +pc->is_valid_func(d, errp); +} + static void pci_qdev_realize(DeviceState *qdev, Error **errp) { PCIDevice *pci_dev = (PCIDevice *)qdev; @@ -1883,6 +1899,22 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) pci_qdev_unrealize(DEVICE(pci_dev), NULL); return; } + +/* + * If the function number is 0, indicate the closure of the slot. + * then we get the chance to check all functions on same device + * if valid. 
+ */ +if (DEVICE(pci_dev)->hotplugged && +pci_get_function_0(pci_dev) == pci_dev) { +pci_for_each_device(bus, pci_bus_num(bus), +pci_function_is_valid, &local_err); +if (local_err) { +error_propagate(errp, local_err); +pci_qdev_unrealize(DEVICE(pci_dev), NULL); +return; +} +} } static void pci_default_realize(PCIDevice *dev, Error **errp) diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index ef6ba51..848fa7b 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -197,6 +197,7 @@ typedef struct PCIDeviceClass { void (*realize)(PCIDevice *dev, Error **errp); int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */ +void (*is_valid_func)(PCIDevice *dev, Error **errp); PCIUnregisterFunc *exit; PCIConfigReadFunc *config_read; PCIConfigWriteFunc *config_write; -- 1.8.3.1
Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume
Hi, Alex What do you think about the following solution? 1. Detect support for resume notification. If host vfio driver does not have resume notifier flags, Directly fail to boot up VM as with aer enabled. 2. Immediately notify the VM on error detected. 3. Stall any access to the device until resume is signaled. Disable mmaps, drop writes, return -1 for reads. 4. Delay the guest directed bus reset. Don't reset bus in vfio_pci_reset function. 5. Wait for resume notification. If we don't get the resume notification from the host after some timeout, we would abort the guest directed bus reset altogether and make the device disappear, Initiating an unplug of the device to prevent it from further interacting with the VM. 6. After get the resume notification. Reset bus. It the second bus reset. Because the host did bus reset already. But as you said we shouldn't necessarily design the API that strictly around the current behavior of the Linux AER handler. Sincerely, Zhou Jie On 2016/5/7 0:39, Alex Williamson wrote: On Fri, 6 May 2016 09:38:41 +0800 Chen Fan wrote: On 04/26/2016 10:48 PM, Alex Williamson wrote: On Tue, 26 Apr 2016 11:39:02 +0800 Chen Fan wrote: On 04/14/2016 09:02 AM, Chen Fan wrote: On 04/12/2016 05:38 AM, Alex Williamson wrote: On Tue, 5 Apr 2016 19:42:02 +0800 Cao jin wrote: From: Chen Fan for supporting aer recovery, host and guest would run the same aer recovery code, that would do the secondary bus reset if the error is fatal, the aer recovery process: 1. error_detected 2. reset_link (if fatal) 3. slot_reset/mmio_enabled 4. resume it indicates that host will do secondary bus reset to reset the physical devices under bus in step 2, that would cause devices in D3 status in a short time. but in qemu, we register an error detected handler, that would be invoked as host broadcasts the error-detected event in step 1, in order to avoid guest do reset_link when host do reset_link simultaneously. it may cause fatal error. 
we introduce a resume notifier to assure the host reset completes. Then do guest aer injection. Why is it safe to continue running the VM between the error detected notification and the resume notification? We're just pushing back the point at which we inject the AER into the guest, potentially negating any benefit by allowing the VM to consume bad data. Shouldn't we instead be immediately notifying the VM on error detected, but stalling any access to the device until resume is signaled? How do we know that resume will ever be signaled? We have both the problem that we may be running on an older kernel that won't support a resume notification and the problem that seeing a resume notification depends on the host being able to successfully complete a link reset after fatal error. We can detect support for resume notification, but we still need a strategy for never receiving it. Thanks, That makes sense, but I haven't come up with a good idea. Do you have any idea, Alex? I don't know that there are any good solutions here. We need to respond to the current error notifier interrupt and not regress from our support there. I think that means that if we want to switch from a simple halt-on-error to a mechanism for the guest to handle recovery, we need to disable access to the device between being notified that the error occurred and being notified to resume. We can do that by disabling mmaps to the device and preventing access via the slow path handlers. I don't know what the best solution is for preventing access, do we block and pause the VM or do we drop writes and return -1 for reads, that's something that needs to be determined. We also need to inject the AER into the VM at the point we're notified of an error because the VM needs to know as soon as possible to stop using the device or trusting any data from it.
The next coordination point would be something like the resume notifier that you've added and there are numerous questions around the interaction of that with the guest handling. Clearly we can't do a guest directed bus reset until we get the resume notifier, so do we block that execution path in QEMU until the resume notification is received? What happens if we don't get that notification? Is there any way that we can rely on the host having done a bus reset to the point where we don't need to act on the guest directed reset? These are all things that need to be figured out. Thanks, Maybe we can simply pause the vcpu running and avoid the VM to access the device. and add two flags in VFIO_DEVICE_GET_INFO to query whether the vfio pci driver has a resume notifier, if it does not have resume notifier flags, we can directly fail to boot up VM as with aer enabled. We can already tell if a resume interrupt is supported betw
[Qemu-devel] [PATCH v4] block: always compile-check debug prints
Files with conditional debug statements should ensure that the printf is always compiled. This prevents bitrot of the format string of the debug statement. And switch debug output to stderr. Signed-off-by: Zhou Jie Reviewed-by: Eric Blake --- v3 -> v4: * Alter commit message to add "And switch debug output to stderr". block/curl.c | 10 -- block/sheepdog.c | 13 - 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/block/curl.c b/block/curl.c index 5a8f8b6..da9f5e8 100644 --- a/block/curl.c +++ b/block/curl.c @@ -36,10 +36,16 @@ // #define DEBUG_VERBOSE #ifdef DEBUG_CURL -#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#define DEBUG_CURL_PRINT 1 #else -#define DPRINTF(fmt, ...) do { } while (0) +#define DEBUG_CURL_PRINT 0 #endif +#define DPRINTF(fmt, ...)\ +do { \ +if (DEBUG_CURL_PRINT) { \ +fprintf(stderr, fmt, ## __VA_ARGS__);\ +}\ +} while (0) #if LIBCURL_VERSION_NUM >= 0x071000 /* The multi interface timer callback was introduced in 7.16.0 */ diff --git a/block/sheepdog.c b/block/sheepdog.c index 33e0a33..9023686 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode) #undef DPRINTF #ifdef DEBUG_SDOG -#define DPRINTF(fmt, args...) \ -do {\ -fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ -} while (0) +#define DEBUG_SDOG_PRINT 1 #else -#define DPRINTF(fmt, args...) +#define DEBUG_SDOG_PRINT 0 #endif +#define DPRINTF(fmt, args...) \ +do {\ +if (DEBUG_SDOG_PRINT) { \ +fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \ +} \ +} while (0) typedef struct SheepdogAIOCB SheepdogAIOCB; -- 2.5.5
[Qemu-devel] [PATCH v3] block: always compile-check debug prints
Files with conditional debug statements should ensure that the printf is always compiled. This prevents bitrot of the format string of the debug statement. Signed-off-by: Zhou Jie Reviewed-by: Eric Blake --- v1 -> v2: * Keep the user-visible witness as defined/undefined, and create a secondary witness as the actual conditional. * Switch debug output to stderr. v2 -> v3: * Alter commit message to add "Reviewed-by ...". block/curl.c | 10 -- block/sheepdog.c | 13 - 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/block/curl.c b/block/curl.c index 5a8f8b6..da9f5e8 100644 --- a/block/curl.c +++ b/block/curl.c @@ -36,10 +36,16 @@ // #define DEBUG_VERBOSE #ifdef DEBUG_CURL -#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#define DEBUG_CURL_PRINT 1 #else -#define DPRINTF(fmt, ...) do { } while (0) +#define DEBUG_CURL_PRINT 0 #endif +#define DPRINTF(fmt, ...)\ +do { \ +if (DEBUG_CURL_PRINT) { \ +fprintf(stderr, fmt, ## __VA_ARGS__);\ +}\ +} while (0) #if LIBCURL_VERSION_NUM >= 0x071000 /* The multi interface timer callback was introduced in 7.16.0 */ diff --git a/block/sheepdog.c b/block/sheepdog.c index 33e0a33..9023686 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode) #undef DPRINTF #ifdef DEBUG_SDOG -#define DPRINTF(fmt, args...) \ -do {\ -fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ -} while (0) +#define DEBUG_SDOG_PRINT 1 #else -#define DPRINTF(fmt, args...) +#define DEBUG_SDOG_PRINT 0 #endif +#define DPRINTF(fmt, args...) \ +do {\ +if (DEBUG_SDOG_PRINT) { \ +fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \ +} \ +} while (0) typedef struct SheepdogAIOCB SheepdogAIOCB; -- 2.5.5
[Qemu-devel] [PATCH v2] block: always compile-check debug prints
Files with conditional debug statements should ensure that the printf is always compiled. This prevents bitrot of the format string of the debug statement. Signed-off-by: Zhou Jie --- block/curl.c | 10 -- block/sheepdog.c | 13 - 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/block/curl.c b/block/curl.c index 5a8f8b6..da9f5e8 100644 --- a/block/curl.c +++ b/block/curl.c @@ -36,10 +36,16 @@ // #define DEBUG_VERBOSE #ifdef DEBUG_CURL -#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#define DEBUG_CURL_PRINT 1 #else -#define DPRINTF(fmt, ...) do { } while (0) +#define DEBUG_CURL_PRINT 0 #endif +#define DPRINTF(fmt, ...)\ +do { \ +if (DEBUG_CURL_PRINT) { \ +fprintf(stderr, fmt, ## __VA_ARGS__);\ +}\ +} while (0) #if LIBCURL_VERSION_NUM >= 0x071000 /* The multi interface timer callback was introduced in 7.16.0 */ diff --git a/block/sheepdog.c b/block/sheepdog.c index 33e0a33..9023686 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -294,13 +294,16 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode) #undef DPRINTF #ifdef DEBUG_SDOG -#define DPRINTF(fmt, args...) \ -do {\ -fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ -} while (0) +#define DEBUG_SDOG_PRINT 1 #else -#define DPRINTF(fmt, args...) +#define DEBUG_SDOG_PRINT 0 #endif +#define DPRINTF(fmt, args...) \ +do {\ +if (DEBUG_SDOG_PRINT) { \ +fprintf(stderr, "%s %d: " fmt, __func__, __LINE__, ##args); \ +} \ +} while (0) typedef struct SheepdogAIOCB SheepdogAIOCB; -- 2.5.5
[Qemu-devel] [PATCH] block: always compile-check debug prints
Files with conditional debug statements should ensure that the printf is always compiled. This prevents bitrot of the format string of the debug statement. Signed-off-by: Zhou Jie --- block/curl.c | 13 +++-- block/sheepdog.c | 13 ++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/block/curl.c b/block/curl.c index 5a8f8b6..6655108 100644 --- a/block/curl.c +++ b/block/curl.c @@ -32,14 +32,15 @@ #include #include "qemu/cutils.h" -// #define DEBUG_CURL +#define DEBUG_CURL 0 // #define DEBUG_VERBOSE -#ifdef DEBUG_CURL -#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) do { } while (0) -#endif +#define DPRINTF(fmt, ...)\ +do { \ +if (DEBUG_CURL) {\ +printf(fmt, ## __VA_ARGS__); \ +}\ +} while (0) #if LIBCURL_VERSION_NUM >= 0x071000 /* The multi interface timer callback was introduced in 7.16.0 */ diff --git a/block/sheepdog.c b/block/sheepdog.c index 33e0a33..47cca74 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -293,14 +293,13 @@ static inline size_t count_data_objs(const struct SheepdogInode *inode) } #undef DPRINTF -#ifdef DEBUG_SDOG -#define DPRINTF(fmt, args...) \ -do {\ -fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ +#define DEBUG_SDOG 0 +#define DPRINTF(fmt, args...) \ +do {\ +if (DEBUG_SDOG) { \ +fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ +} \ } while (0) -#else -#define DPRINTF(fmt, args...) -#endif typedef struct SheepdogAIOCB SheepdogAIOCB; -- 2.5.5
Re: [Qemu-devel] [PATCH v2] hw/net/opencores_eth: Allocating Large sized arrays to heap
On 2016/4/27 10:46, Max Filippov wrote: Hi Zhou, On Wed, Apr 27, 2016 at 10:18:56AM +0800, Zhou Jie wrote: When I committed another patch named "hw/net/virtio-net: Allocating Large sized arrays to heap", Christian Borntraeger said that 16k is usually perfectly fine for a userspace stack and doing allocations in a hot path might actually hurt performance. Although the size is 65536 bytes here, I think open_eth_start_xmit is in a hot path. So, it is OK, if you think that this patch should not be applied. With Linux as guest OS we shouldn't see any allocations as it doesn't send huge packets, so I think this patch is fine. I can take it through the xtensa tree if you don't have other plans. OK, Thanks Sincerely, Zhou Jie
Re: [Qemu-devel] [PATCH v2] hw/net/opencores_eth: Allocating Large sized arrays to heap
Hi Max, When I committed another patch which named as "hw/net/virtio-net: Allocating Large sized arrays to heap" . Christian Borntraeger said that 16k is usually perfectly fine for a userspace stack and doing allocations in a hot path might actually hurt performance. Although the size is 65536 bytes here, I think open_eth_start_xmit is in a hot path. So, it is OK, if you think that this patch should not be applied. Sincerely, Zhou Jie On 2016/4/27 10:07, Zhou Jie wrote: open_eth_start_xmit has a huge stack usage of 65536 bytes approx. Moving large arrays to heap to reduce stack usage. Signed-off-by: Zhou Jie --- hw/net/opencores_eth.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/hw/net/opencores_eth.c b/hw/net/opencores_eth.c index c6094fb..fa0a4e7 100644 --- a/hw/net/opencores_eth.c +++ b/hw/net/opencores_eth.c @@ -483,7 +483,8 @@ static NetClientInfo net_open_eth_info = { static void open_eth_start_xmit(OpenEthState *s, desc *tx) { -uint8_t buf[65536]; +uint8_t *buf = NULL; +uint8_t buffer[0x600]; unsigned len = GET_FIELD(tx->len_flags, TXD_LEN); unsigned tx_len = len; @@ -498,6 +499,11 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx) trace_open_eth_start_xmit(tx->buf_ptr, len, tx_len); +if (tx_len > 0x600) { +buf = g_new(uint8_t, tx_len); +} else { +buf = buffer; +} if (len > tx_len) { len = tx_len; } @@ -506,6 +512,9 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx) memset(buf + len, 0, tx_len - len); } qemu_send_packet(qemu_get_queue(s->nic), buf, tx_len); +if (tx_len > 0x600) { +g_free(buf); +} if (tx->len_flags & TXD_WR) { s->tx_desc = 0; -- 周潔 Dept 1 No. 6 Wenzhu Road, Nanjing, 210012, China TEL:+86+25-86630566-8557 FUJITSU INTERNAL:7998-8557 E-Mail:zhoujie2...@cn.fujitsu.com
[Qemu-devel] [PATCH v2] hw/net/opencores_eth: Allocating Large sized arrays to heap
open_eth_start_xmit has a huge stack usage of 65536 bytes approx. Moving large arrays to heap to reduce stack usage. Signed-off-by: Zhou Jie --- hw/net/opencores_eth.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/hw/net/opencores_eth.c b/hw/net/opencores_eth.c index c6094fb..fa0a4e7 100644 --- a/hw/net/opencores_eth.c +++ b/hw/net/opencores_eth.c @@ -483,7 +483,8 @@ static NetClientInfo net_open_eth_info = { static void open_eth_start_xmit(OpenEthState *s, desc *tx) { -uint8_t buf[65536]; +uint8_t *buf = NULL; +uint8_t buffer[0x600]; unsigned len = GET_FIELD(tx->len_flags, TXD_LEN); unsigned tx_len = len; @@ -498,6 +499,11 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx) trace_open_eth_start_xmit(tx->buf_ptr, len, tx_len); +if (tx_len > 0x600) { +buf = g_new(uint8_t, tx_len); +} else { +buf = buffer; +} if (len > tx_len) { len = tx_len; } @@ -506,6 +512,9 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx) memset(buf + len, 0, tx_len - len); } qemu_send_packet(qemu_get_queue(s->nic), buf, tx_len); +if (tx_len > 0x600) { +g_free(buf); +} if (tx->len_flags & TXD_WR) { s->tx_desc = 0; -- 2.5.5
Re: [Qemu-devel] [PATCH] hw/net/opencores_eth: Allocating Large sized arrays to heap
On 2016/4/26 12:12, Max Filippov wrote: Hi Zhou, On Tue, Apr 26, 2016 at 6:35 AM, Zhou Jie wrote: open_eth_start_xmit has a huge stack usage of 65536 bytes approx. Moving large arrays to heap to reduce stack usage. It's an exception, not the rule when full 65536 byte long buffer might be needed. Can we do a little better change and not allocate and free this buffer every time unconditionally, but instead make buf smaller (1536 bytes, maximal frame length when HUGEN bit is not set in MODER) and only do allocation when that's not enough? Thank you for your suggestion. I will modify this patch. Sincerely, Zhou Jie -- 周潔 Dept 1 No. 6 Wenzhu Road, Nanjing, 210012, China TEL:+86+25-86630566-8557 FUJITSU INTERNAL:7998-8557 E-Mail:zhoujie2...@cn.fujitsu.com
Re: [Qemu-devel] [PATCH] hw/net/virtio-net: Allocating Large sized arrays to heap
On 2016/4/26 20:42, Michael S. Tsirkin wrote: On Tue, Apr 26, 2016 at 04:05:24PM +0800, Zhou Jie wrote: virtio_net_flush_tx has a huge stack usage of 16392 bytes approx. Moving large arrays to heap to reduce stack usage. Signed-off-by: Zhou Jie I don't think it's appropriate for trivial. Also - what's the point, exactly? I think functions should not have very large stack frames. For 64bit machine it will be 32k. But according what Christian Borntraeger said, it doesn't matter. So, considerate that virtio_net_flush_tx is in a hot path, I agree to not change this. Sincerely, Zhou Jie --- hw/net/virtio-net.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 5798f87..cab7bbc 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1213,6 +1213,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) VirtIONet *n = q->n; VirtIODevice *vdev = VIRTIO_DEVICE(n); VirtQueueElement *elem; +struct iovec *sg = NULL, *sg2 = NULL; int32_t num_packets = 0; int queue_index = vq2q(virtio_get_queue_index(q->tx_vq)); if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { @@ -1224,10 +1225,12 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) return num_packets; } +sg = g_new(struct iovec, VIRTQUEUE_MAX_SIZE); +sg2 = g_new(struct iovec, VIRTQUEUE_MAX_SIZE + 1); for (;;) { ssize_t ret; unsigned int out_num; -struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg; +struct iovec *out_sg; struct virtio_net_hdr_mrg_rxbuf mhdr; elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement)); @@ -1252,7 +1255,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) virtio_net_hdr_swap(vdev, (void *) &mhdr); sg2[0].iov_base = &mhdr; sg2[0].iov_len = n->guest_hdr_len; -out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1, +out_num = iov_copy(&sg2[1], VIRTQUEUE_MAX_SIZE, out_sg, out_num, n->guest_hdr_len, -1); if (out_num == VIRTQUEUE_MAX_SIZE) { @@ -1269,10 +1272,10 @@ static int32_t 
virtio_net_flush_tx(VirtIONetQueue *q) */ assert(n->host_hdr_len <= n->guest_hdr_len); if (n->host_hdr_len != n->guest_hdr_len) { -unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), +unsigned sg_num = iov_copy(sg, VIRTQUEUE_MAX_SIZE, out_sg, out_num, 0, n->host_hdr_len); -sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num, +sg_num += iov_copy(sg + sg_num, VIRTQUEUE_MAX_SIZE - sg_num, out_sg, out_num, n->guest_hdr_len, -1); out_num = sg_num; @@ -1284,6 +1287,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) if (ret == 0) { virtio_queue_set_notification(q->tx_vq, 0); q->async_tx.elem = elem; +g_free(sg); +g_free(sg2); return -EBUSY; } @@ -1296,6 +1301,8 @@ drop: break; } } +g_free(sg); +g_free(sg2); return num_packets; } -- 2.5.5 . -- 周潔 Dept 1 No. 6 Wenzhu Road, Nanjing, 210012, China TEL:+86+25-86630566-8557 FUJITSU INTERNAL:7998-8557 E-Mail:zhoujie2...@cn.fujitsu.com
Re: [Qemu-devel] [PATCH] hw/net/virtio-net: Allocating Large sized arrays to heap
On 2016/4/26 16:49, Christian Borntraeger wrote: On 04/26/2016 10:05 AM, Zhou Jie wrote: virtio_net_flush_tx has a huge stack usage of 16392 bytes approx. Moving large arrays to heap to reduce stack usage. Signed-off-by: Zhou Jie --- hw/net/virtio-net.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 5798f87..cab7bbc 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1213,6 +1213,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) VirtIONet *n = q->n; VirtIODevice *vdev = VIRTIO_DEVICE(n); VirtQueueElement *elem; +struct iovec *sg = NULL, *sg2 = NULL; int32_t num_packets = 0; int queue_index = vq2q(virtio_get_queue_index(q->tx_vq)); if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { @@ -1224,10 +1225,12 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) return num_packets; } +sg = g_new(struct iovec, VIRTQUEUE_MAX_SIZE); +sg2 = g_new(struct iovec, VIRTQUEUE_MAX_SIZE + 1); As I said in another mail, 16k is usually perfectly fine for a userspace stack and doing allocations in a hot path might actually hurt performance. Unless we have a real problem (e.g. very long call stack on a small thread stack) I would prefer to not change this. Do you have seen a real problem due to this? No, I don't. I think functions should not have very large stack frames. For 64bit machine it will be 32k. But according what you said, it doesn't matter. So, considerate that virtio_net_flush_tx is in a hot path, I agree to not change this. 
Sincerely, Zhou Jie for (;;) { ssize_t ret; unsigned int out_num; -struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg; +struct iovec *out_sg; struct virtio_net_hdr_mrg_rxbuf mhdr; elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement)); @@ -1252,7 +1255,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) virtio_net_hdr_swap(vdev, (void *) &mhdr); sg2[0].iov_base = &mhdr; sg2[0].iov_len = n->guest_hdr_len; -out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1, +out_num = iov_copy(&sg2[1], VIRTQUEUE_MAX_SIZE, out_sg, out_num, n->guest_hdr_len, -1); if (out_num == VIRTQUEUE_MAX_SIZE) { @@ -1269,10 +1272,10 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) */ assert(n->host_hdr_len <= n->guest_hdr_len); if (n->host_hdr_len != n->guest_hdr_len) { -unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), +unsigned sg_num = iov_copy(sg, VIRTQUEUE_MAX_SIZE, out_sg, out_num, 0, n->host_hdr_len); -sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num, +sg_num += iov_copy(sg + sg_num, VIRTQUEUE_MAX_SIZE - sg_num, out_sg, out_num, n->guest_hdr_len, -1); out_num = sg_num; @@ -1284,6 +1287,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) if (ret == 0) { virtio_queue_set_notification(q->tx_vq, 0); q->async_tx.elem = elem; +g_free(sg); +g_free(sg2); return -EBUSY; } @@ -1296,6 +1301,8 @@ drop: break; } } +g_free(sg); +g_free(sg2); return num_packets; } . -- 周潔 Dept 1 No. 6 Wenzhu Road, Nanjing, 210012, China TEL:+86+25-86630566-8557 FUJITSU INTERNAL:7998-8557 E-Mail:zhoujie2...@cn.fujitsu.com
Re: [Qemu-devel] [PATCH] net/tap: Allocating Large sized arrays to heap
On 2016/4/26 15:45, Christian Borntraeger wrote: On 04/26/2016 03:26 AM, Zhou Jie wrote: net_init_tap has a huge stack usage of 8192 bytes approx. Moving large arrays to heap to reduce stack usage. I am wondering. Why is 8k a problem for a user space program? For 64bit machine it will be 16k. Please note that malloc/new like allocations are much more expensive than stack allocation in terms of performance. This does not matter here, but in your other patch that deals with the xmit function, I would not be surprised if that actually harms performance. OK. I will note it. Sincerely, Zhou Jie Christian Signed-off-by: Zhou Jie --- net/tap.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tap.c b/net/tap.c index 740e8a2..49817c7 100644 --- a/net/tap.c +++ b/net/tap.c @@ -769,8 +769,8 @@ int net_init_tap(const NetClientOptions *opts, const char *name, return -1; } } else if (tap->has_fds) { -char *fds[MAX_TAP_QUEUES]; -char *vhost_fds[MAX_TAP_QUEUES]; +char **fds = g_new(char *, MAX_TAP_QUEUES); +char **vhost_fds = g_new(char *, MAX_TAP_QUEUES); int nfds, nvhosts; if (tap->has_ifname || tap->has_script || tap->has_downscript || @@ -818,6 +818,8 @@ int net_init_tap(const NetClientOptions *opts, const char *name, return -1; } } +g_free(fds); +g_free(vhost_fds); } else if (tap->has_helper) { if (tap->has_ifname || tap->has_script || tap->has_downscript || tap->has_vnet_hdr || tap->has_queues || tap->has_vhostfds) { -- 周潔 Dept 1 No. 6 Wenzhu Road, Nanjing, 210012, China TEL:+86+25-86630566-8557 FUJITSU INTERNAL:7998-8557 E-Mail:zhoujie2...@cn.fujitsu.com
[Qemu-devel] [PATCH] hw/net/virtio-net: Allocating Large sized arrays to heap
virtio_net_flush_tx has a huge stack usage of 16392 bytes approx. Moving large arrays to heap to reduce stack usage. Signed-off-by: Zhou Jie --- hw/net/virtio-net.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 5798f87..cab7bbc 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1213,6 +1213,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) VirtIONet *n = q->n; VirtIODevice *vdev = VIRTIO_DEVICE(n); VirtQueueElement *elem; +struct iovec *sg = NULL, *sg2 = NULL; int32_t num_packets = 0; int queue_index = vq2q(virtio_get_queue_index(q->tx_vq)); if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { @@ -1224,10 +1225,12 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) return num_packets; } +sg = g_new(struct iovec, VIRTQUEUE_MAX_SIZE); +sg2 = g_new(struct iovec, VIRTQUEUE_MAX_SIZE + 1); for (;;) { ssize_t ret; unsigned int out_num; -struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg; +struct iovec *out_sg; struct virtio_net_hdr_mrg_rxbuf mhdr; elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement)); @@ -1252,7 +1255,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) virtio_net_hdr_swap(vdev, (void *) &mhdr); sg2[0].iov_base = &mhdr; sg2[0].iov_len = n->guest_hdr_len; -out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1, +out_num = iov_copy(&sg2[1], VIRTQUEUE_MAX_SIZE, out_sg, out_num, n->guest_hdr_len, -1); if (out_num == VIRTQUEUE_MAX_SIZE) { @@ -1269,10 +1272,10 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) */ assert(n->host_hdr_len <= n->guest_hdr_len); if (n->host_hdr_len != n->guest_hdr_len) { -unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), +unsigned sg_num = iov_copy(sg, VIRTQUEUE_MAX_SIZE, out_sg, out_num, 0, n->host_hdr_len); -sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num, +sg_num += iov_copy(sg + sg_num, VIRTQUEUE_MAX_SIZE - sg_num, out_sg, out_num, n->guest_hdr_len, -1); out_num = sg_num; @@ -1284,6 +1287,8 @@ 
static int32_t virtio_net_flush_tx(VirtIONetQueue *q) if (ret == 0) { virtio_queue_set_notification(q->tx_vq, 0); q->async_tx.elem = elem; +g_free(sg); +g_free(sg2); return -EBUSY; } @@ -1296,6 +1301,8 @@ drop: break; } } +g_free(sg); +g_free(sg2); return num_packets; } -- 2.5.5
[Qemu-devel] [PATCH] hw/arm/nseries: Allocate large arrays on the heap
n8x0_init has a huge stack usage of approximately 65536 bytes. Move the large array to the heap to reduce stack usage. Signed-off-by: Zhou Jie --- hw/arm/nseries.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/arm/nseries.c b/hw/arm/nseries.c index 5382505..c7068c0 100644 --- a/hw/arm/nseries.c +++ b/hw/arm/nseries.c @@ -1364,7 +1364,7 @@ static void n8x0_init(MachineState *machine, if (option_rom[0].name && (machine->boot_order[0] == 'n' || !machine->kernel_filename)) { -uint8_t nolo_tags[0x10000]; +uint8_t *nolo_tags = g_new(uint8_t, 0x10000); /* No, wait, better start at the ROM. */ s->mpu->cpu->env.regs[15] = OMAP2_Q2_BASE + 0x40; @@ -1383,6 +1383,7 @@ static void n8x0_init(MachineState *machine, n800_setup_nolo_tags(nolo_tags); cpu_physical_memory_write(OMAP2_SRAM_BASE, nolo_tags, 0x10000); +g_free(nolo_tags); } } -- 2.5.5
[Qemu-devel] [PATCH] hw/net/opencores_eth: Allocate large arrays on the heap
open_eth_start_xmit has a huge stack usage of 65536 bytes approx. Moving large arrays to heap to reduce stack usage. Signed-off-by: Zhou Jie --- hw/net/opencores_eth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/net/opencores_eth.c b/hw/net/opencores_eth.c index c6094fb..ba23922 100644 --- a/hw/net/opencores_eth.c +++ b/hw/net/opencores_eth.c @@ -483,7 +483,7 @@ static NetClientInfo net_open_eth_info = { static void open_eth_start_xmit(OpenEthState *s, desc *tx) { -uint8_t buf[65536]; +uint8_t *buf = NULL; unsigned len = GET_FIELD(tx->len_flags, TXD_LEN); unsigned tx_len = len; @@ -498,6 +498,7 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx) trace_open_eth_start_xmit(tx->buf_ptr, len, tx_len); +buf = g_new(uint8_t, tx_len); if (len > tx_len) { len = tx_len; } @@ -506,6 +507,7 @@ static void open_eth_start_xmit(OpenEthState *s, desc *tx) memset(buf + len, 0, tx_len - len); } qemu_send_packet(qemu_get_queue(s->nic), buf, tx_len); +g_free(buf); if (tx->len_flags & TXD_WR) { s->tx_desc = 0; -- 2.5.5
[Qemu-devel] [PATCH] net/tap: Allocate large arrays on the heap
net_init_tap has a huge stack usage of 8192 bytes approx. Moving large arrays to heap to reduce stack usage. Signed-off-by: Zhou Jie --- net/tap.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tap.c b/net/tap.c index 740e8a2..49817c7 100644 --- a/net/tap.c +++ b/net/tap.c @@ -769,8 +769,8 @@ int net_init_tap(const NetClientOptions *opts, const char *name, return -1; } } else if (tap->has_fds) { -char *fds[MAX_TAP_QUEUES]; -char *vhost_fds[MAX_TAP_QUEUES]; +char **fds = g_new(char *, MAX_TAP_QUEUES); +char **vhost_fds = g_new(char *, MAX_TAP_QUEUES); int nfds, nvhosts; if (tap->has_ifname || tap->has_script || tap->has_downscript || @@ -818,6 +818,8 @@ int net_init_tap(const NetClientOptions *opts, const char *name, return -1; } } +g_free(fds); +g_free(vhost_fds); } else if (tap->has_helper) { if (tap->has_ifname || tap->has_script || tap->has_downscript || tap->has_vnet_hdr || tap->has_queues || tap->has_vhostfds) { -- 2.5.5
[Qemu-devel] [PATCH] Add check for negative return value from get_image_size()
This patch adds check for negative return value from get_image_size(), where it is missing. It avoids unnecessary two function calls. Signed-off-by: Zhou Jie --- hw/ppc/spapr.c | 4 1 file changed, 4 insertions(+) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index feaab08..ccea633 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1842,6 +1842,10 @@ static void ppc_spapr_init(MachineState *machine) exit(1); } spapr->rtas_size = get_image_size(filename); +if (spapr->rtas_size < 0) { +error_report("Could not get size of LPAR rtas '%s'", filename); +exit(1); +} spapr->rtas_blob = g_malloc(spapr->rtas_size); if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) { error_report("Could not load LPAR rtas '%s'", filename); -- 2.5.5