Re: [Qemu-devel] [PATCH v3] vfio : add aer process
ping On 2016/8/15 10:53, Zhou Jie wrote: ping On 2016/8/2 11:57, Zhou Jie wrote: During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- v2-v3: 1. Call init_completion() in vfio_pci_probe. 2. Call reinit_completion() in vfio_pci_aer_err_detected. 3. Remove unnecessary brackets. v1-v2: 1. Add aer process to vfio driver. drivers/vfio/pci/vfio_pci.c | 48 + drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index d624a52..4c246a1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; +if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || +cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { +int ret; +ret = wait_for_completion_interruptible( +&vdev->aer_error_completion); +if (ret) +return ret; +} + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; +info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; +if (vdev->aer_error_in_progress) +info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: +if (vdev->aer_error_in_progress && iswrite) { +int ret; +ret = wait_for_completion_interruptible( +&vdev->aer_error_completion); +if (ret) +return ret; +} return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1228,6 +1248,7 @@ static 
int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); +init_completion(&vdev->aer_error_completion); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { @@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); +vdev->aer_error_in_progress = true; +reinit_completion(&vdev->aer_error_completion); +vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | +VFIO_IRQ_SET_ACTION_TRIGGER, +vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ +struct vfio_pci_device *vdev; +struct vfio_device *device; + +device = vfio_device_get_from_dev(&pdev->dev); +if (device == NULL) +return; + +vdev = vfio_device_data(device); +if (vdev == NULL) { +vfio_device_put(device); +return; +} + +vdev->aer_error_in_progress = false; +complete_all(&vdev->aer_error_completion); + +vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, +.resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 2128de8..7430d92 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -91,6 +91,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; +boolaer_error_in_progress; +struct completionaer_error_completion; struct pci_saved_state*pci_saved_state; intrefcnt; struct eventfd_ctx*err_trigger; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 255a211..59b9cf6 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ 
-198,6 +198,8 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI(1 << 1)/* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */ #define VFIO_DEVICE_FLAG
Re: [PATCH v3] vfio : add aer process
ping On 2016/8/2 11:57, Zhou Jie wrote: During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- v2-v3: 1. Call init_completion() in vfio_pci_probe. 2. Call reinit_completion() in vfio_pci_aer_err_detected. 3. Remove unnecessary brackets. v1-v2: 1. Add aer process to vfio driver. drivers/vfio/pci/vfio_pci.c | 48 + drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index d624a52..4c246a1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) + return ret; + } + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; + if (vdev->aer_error_in_progress) + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: + if (vdev->aer_error_in_progress && iswrite) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) + return ret; + } return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct 
pci_dev *pdev, const struct pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); + init_completion(&vdev->aer_error_completion); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { @@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); + vdev->aer_error_in_progress = true; + reinit_completion(&vdev->aer_error_completion); + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return; + } + + vdev->aer_error_in_progress = false; + complete_all(&vdev->aer_error_completion); + + vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, + .resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 2128de8..7430d92 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -91,6 +91,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; + boolaer_error_in_progress; + struct completion aer_error_completion; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; diff --git
[PATCH v3] vfio : add aer process
During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- v2-v3: 1. Call init_completion() in vfio_pci_probe. 2. Call reinit_completion() in vfio_pci_aer_err_detected. 3. Remove unnecessary brackets. v1-v2: 1. Add aer process to vfio driver. drivers/vfio/pci/vfio_pci.c | 48 + drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 52 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index d624a52..4c246a1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -648,6 +648,15 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) + return ret; + } + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -664,6 +673,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; + if (vdev->aer_error_in_progress) + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1070,6 +1083,13 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: + if (vdev->aer_error_in_progress && iswrite) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) + return ret; + } return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1228,6 +1248,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct 
pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); + init_completion(&vdev->aer_error_completion); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { @@ -1300,6 +1321,11 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); + vdev->aer_error_in_progress = true; + reinit_completion(&vdev->aer_error_completion); + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1310,8 +1336,30 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return; + } + + vdev->aer_error_in_progress = false; + complete_all(&vdev->aer_error_completion); + + vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, + .resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 2128de8..7430d92 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -91,6 +91,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; + boolaer_error_in_progress; + struct completion aer_error_completion; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; diff --git a/include/uapi/linux/vfio.h b/includ
Re: [Qemu-devel] [PATCH v2 2/2] vfio : add aer process
Hi, Alex Clearly this has only been tested for a single instance of an AER error event and resume per device. Are the things you're intending to block actually blocked for subsequent events? Note how complete_all() fills the done field to let all current and future waiters go through and nowhere is there a call to reinit_completion() to drain that path. Thanks, Alex Do you mean this condition? For device 1: error1 occurs error1 resumes error2 occurs error2 resumes error3 occurs error3 resumes In the current code, I do complete_all() when error1 resumes. And this will unblock the device when error2 and error3 are still being processed. So walk me through how this works. On vfio_pci_open() we call init_completion(), which sets aer_error_completion.done equal to zero (BTW, a user can open the device file descriptor multiple times, so there's already a bug here). I will call init_completion() in vfio_pci_probe. Let's assume that an error occurs and the user stalls a single access on wait_for_completion_interruptible(). The bulk of this function happens here: static inline long __sched do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { DECLARE_WAITQUEUE(wait, current); __add_wait_queue_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = action(timeout); spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); if (!x->done) return timeout; } x->done--; return timeout ?: 1; } So it waits within that do{}while loop for a completion, interruption, or timeout. 
Then we have: void complete_all(struct completion *x) { unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; __wake_up_locked(&x->wait, TASK_NORMAL, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } So aer_error_completion.done gets incremented to let a couple billion completion waiters through... Show me how another call to wait_for_completion_interruptible() will ever block again within our lifetime when the actual wait of do_wait_for_common() is only entered when 'done' count is equal to zero. This seems to be why reinit_completion() exists, but it's not used here. Thanks, Alex I will call reinit_completion() in vfio_pci_aer_err_detected when an aer error is detected. Thank you very much. Sincerely ZhouJie
Re: [Qemu-devel] [PATCH v2 2/2] vfio : add aer process
Hi, Alex On 2016/7/30 1:12, Alex Williamson wrote: On Tue, 19 Jul 2016 15:32:43 +0800 Zhou Jie wrote: From: Chen Fan During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- drivers/vfio/pci/vfio_pci.c | 30 ++ drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 34 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d12b03..dd96b60 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data) return -ENODEV; mutex_lock(&driver_lock); + init_completion(&vdev->aer_error_completion); if (!vdev->refcnt) { ret = vfio_pci_enable(vdev); @@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) { + return ret; + } No brackets necessary. 
+ } + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; + if (vdev->aer_error_in_progress) + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: + if (vdev->aer_error_in_progress && iswrite) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) { + return ret; + } + } return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1226,6 +1249,10 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); + vdev->aer_error_in_progress = true; + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev) } mutex_lock(&vdev->igate); + + vdev->aer_error_in_progress = false; + complete_all(&vdev->aer_error_completion); if (vdev->resume_trigger) eventfd_signal(vdev->resume_trigger, 1); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 80d4ddd..2f151f5 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -84,6 +84,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; + boolaer_error_in_progress; + struct completion aer_error_completion; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 34ab138..276ce50 100644 --- a/include/uapi/linux/vfio.h +++ 
b/include/uapi/linux/vfio.h @@ -198,6 +198,8 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_AERPROCESS (1 << 4) /* support aer error progress */ +#define VFIO_DEVICE_FLAGS_INAERPROCESS (1 << 5)/* status in aer error progress */ __u32 num_regions;/* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; Clearly this has only been tested for a single instance of an AER error event and
Re: [PATCH v2 1/2] vfio : resume notifier
Hi, Alex On 2016/7/30 1:11, Alex Williamson wrote: On Tue, 19 Jul 2016 15:52:45 +0800 Zhou Jie wrote: From: Chen Fan An empty commit log is unacceptable for all but the most trivial patches. There's also no sign-off on this patch. Sorry, I should have noted it. I also don't know why we need this since you previously found that for QEMU, ordering of error versus resume notifications is not guaranteed, which is why I thought we went with a status flag within the struct vfio_device_info. I'm not adding an interrupt to the user that has no users. This was not part of the most recent discussion we had about this, so I'm lost as to why this patch exists. Thanks, Alex I will remove the resume interrupt. Sincerely ZhouJie
Re: [Qemu-devel] [PATCH v2 0/2] vfio: add aer process
ping On 2016/7/19 16:13, Zhou Jie wrote: From: Chen Fan v1-v2: 1. Add aer process to vfio driver. Chen Fan (2): vfio : add aer process vfio : resume notifier drivers/vfio/pci/vfio_pci.c | 58 - drivers/vfio/pci/vfio_pci_intrs.c | 18 drivers/vfio/pci/vfio_pci_private.h | 3 ++ include/uapi/linux/vfio.h | 3 ++ 4 files changed, 81 insertions(+), 1 deletion(-)
[PATCH v2 0/2] vfio: add aer process
From: Chen Fan v1-v2: 1. Add aer process to vfio driver. Chen Fan (2): vfio : add aer process vfio : resume notifier drivers/vfio/pci/vfio_pci.c | 58 - drivers/vfio/pci/vfio_pci_intrs.c | 18 drivers/vfio/pci/vfio_pci_private.h | 3 ++ include/uapi/linux/vfio.h | 3 ++ 4 files changed, 81 insertions(+), 1 deletion(-) -- 1.8.3.1
[PATCH v2 1/2] vfio : resume notifier
From: Chen Fan --- drivers/vfio/pci/vfio_pci.c | 28 +++- drivers/vfio/pci/vfio_pci_intrs.c | 18 ++ drivers/vfio/pci/vfio_pci_private.h | 1 + include/uapi/linux/vfio.h | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 188b1ff..2d12b03 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -363,7 +363,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX || + irq_type == VFIO_PCI_RESUME_IRQ_INDEX) { if (pci_is_pcie(vdev->pdev)) return 1; } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { @@ -731,6 +732,7 @@ static long vfio_pci_ioctl(void *device_data, case VFIO_PCI_REQ_IRQ_INDEX: break; case VFIO_PCI_ERR_IRQ_INDEX: + case VFIO_PCI_RESUME_IRQ_INDEX: if (pci_is_pcie(vdev->pdev)) break; /* pass thru to return error */ @@ -1234,8 +1236,32 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return; + } + + mutex_lock(&vdev->igate); + if (vdev->resume_trigger) + eventfd_signal(vdev->resume_trigger, 1); + + mutex_unlock(&vdev->igate); + vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, + .resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 15ecfc9..3a01a62 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -617,6 +617,16 
@@ static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); } +static int vfio_pci_set_resume_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_RESUME_IRQ_INDEX) + return -EINVAL; + + return vfio_pci_set_ctx_trigger_single(&vdev->resume_trigger, flags, data); +} + static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) @@ -676,6 +686,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; + case VFIO_PCI_RESUME_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (pci_is_pcie(vdev->pdev)) + func = vfio_pci_set_resume_trigger; + break; + } + break; } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 016c14a..80d4ddd 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -88,6 +88,7 @@ struct vfio_pci_device { int refcnt; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *resume_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 255a211..34ab138 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -433,6 +433,7 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, + VFIO_PCI_RESUME_IRQ_INDEX, VFIO_PCI_NUM_IRQS }; -- 1.8.3.1
[PATCH v2 0/2] vfio: add aer process
From: Chen Fan v1-v2: 1. Add aer process to vfio driver. Chen Fan (1): vfio : add aer process root (1): vfio : resume notifier drivers/vfio/pci/vfio_pci.c | 58 - drivers/vfio/pci/vfio_pci_intrs.c | 18 drivers/vfio/pci/vfio_pci_private.h | 3 ++ include/uapi/linux/vfio.h | 3 ++ 4 files changed, 81 insertions(+), 1 deletion(-) -- 1.8.3.1
[PATCH v2 2/2] vfio : add aer process
From: Chen Fan During aer err occurs and resume do following to protect device from being accessed. 1. Make config space read only. 2. Disable INTx/MSI Interrupt. 3. Do nothing for bar regions. Signed-off-by: Zhou Jie --- drivers/vfio/pci/vfio_pci.c | 30 ++ drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/uapi/linux/vfio.h | 2 ++ 3 files changed, 34 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d12b03..dd96b60 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data) return -ENODEV; mutex_lock(&driver_lock); + init_completion(&vdev->aer_error_completion); if (!vdev->refcnt) { ret = vfio_pci_enable(vdev); @@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data, struct vfio_pci_device *vdev = device_data; unsigned long minsz; + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS || + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) { + return ret; + } + } + if (cmd == VFIO_DEVICE_GET_INFO) { struct vfio_device_info info; @@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS; + if (vdev->aer_error_in_progress) + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: + if (vdev->aer_error_in_progress && iswrite) { + int ret; + ret = wait_for_completion_interruptible( + &vdev->aer_error_completion); + if (ret) { + return ret; + } + } return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); case VFIO_PCI_ROM_REGION_INDEX: @@ -1226,6 +1249,10 @@ static pci_ers_result_t 
vfio_pci_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); + vdev->aer_error_in_progress = true; + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); if (vdev->err_trigger) eventfd_signal(vdev->err_trigger, 1); @@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev) } mutex_lock(&vdev->igate); + + vdev->aer_error_in_progress = false; + complete_all(&vdev->aer_error_completion); if (vdev->resume_trigger) eventfd_signal(vdev->resume_trigger, 1); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 80d4ddd..2f151f5 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -84,6 +84,8 @@ struct vfio_pci_device { boolhas_vga; boolneeds_reset; boolnointx; + boolaer_error_in_progress; + struct completion aer_error_completion; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 34ab138..276ce50 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -198,6 +198,8 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI (1 << 1)/* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_AERPROCESS (1 << 4) /* support aer error progress */ +#define VFIO_DEVICE_FLAGS_INAERPROCESS (1 << 5)/* status in aer error progress */ __u32 num_regions;/* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; -- 1.8.3.1
[PATCH v2 1/2] vfio : resume notifier
From: root --- drivers/vfio/pci/vfio_pci.c | 28 +++- drivers/vfio/pci/vfio_pci_intrs.c | 18 ++ drivers/vfio/pci/vfio_pci_private.h | 1 + include/uapi/linux/vfio.h | 1 + 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 188b1ff..2d12b03 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -363,7 +363,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX || + irq_type == VFIO_PCI_RESUME_IRQ_INDEX) { if (pci_is_pcie(vdev->pdev)) return 1; } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { @@ -731,6 +732,7 @@ static long vfio_pci_ioctl(void *device_data, case VFIO_PCI_REQ_IRQ_INDEX: break; case VFIO_PCI_ERR_IRQ_INDEX: + case VFIO_PCI_RESUME_IRQ_INDEX: if (pci_is_pcie(vdev->pdev)) break; /* pass thru to return error */ @@ -1234,8 +1236,32 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } +static void vfio_pci_aer_resume(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return; + } + + mutex_lock(&vdev->igate); + if (vdev->resume_trigger) + eventfd_signal(vdev->resume_trigger, 1); + + mutex_unlock(&vdev->igate); + vfio_device_put(device); +} + static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, + .resume = vfio_pci_aer_resume, }; static struct pci_driver vfio_pci_driver = { diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 15ecfc9..3a01a62 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -617,6 +617,16 @@ 
static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); } +static int vfio_pci_set_resume_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_RESUME_IRQ_INDEX) + return -EINVAL; + + return vfio_pci_set_ctx_trigger_single(&vdev->resume_trigger, flags, data); +} + static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, unsigned index, unsigned start, unsigned count, uint32_t flags, void *data) @@ -676,6 +686,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, break; } break; + case VFIO_PCI_RESUME_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (pci_is_pcie(vdev->pdev)) + func = vfio_pci_set_resume_trigger; + break; + } + break; } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 016c14a..80d4ddd 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -88,6 +88,7 @@ struct vfio_pci_device { int refcnt; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *resume_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 255a211..34ab138 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -433,6 +433,7 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, + VFIO_PCI_RESUME_IRQ_INDEX, VFIO_PCI_NUM_IRQS }; -- 1.8.3.1
Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking
Hi, Alex On 2016/6/9 23:39, Alexander Duyck wrote: On Thu, Jun 9, 2016 at 3:14 AM, Zhou Jie wrote: TO Alex TO Michael In your solution you add an emulated PCI bridge to act as a bridge between direct assigned devices and the host bridge. Do you mean to put all direct assigned devices on one emulated PCI bridge? If yes, this may bring some problems. We are writing a patchset to support the AER feature in QEMU. When assigning a vfio device with AER enabled, we must check whether the device supports a host bus reset (i.e. hot reset) as this may be used by the guest OS in order to recover the device from an AER error. QEMU must therefore have the ability to perform a physical host bus reset using the existing vfio APIs in response to a virtual bus reset in the VM. A physical bus reset affects all of the devices on the host bus. Therefore all physical devices affected by a bus reset must be configured on the same virtual bus in the VM, and no devices unaffected by the bus reset may be configured on the same virtual bus. http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg02989.html Sincerely, Zhou Jie That makes sense, but I don't think you have to worry much about this at this point at least on my side as this was mostly just theory and I haven't had a chance to put any of it into practice as of yet. My idea has been evolving on this for a while. One thought I had is that we may want to have something like an emulated IOMMU and if possible we would want to split it up over multiple domains just so we can be certain that the virtual interfaces and the physical ones existed in separate domains. In regards to your concerns perhaps what we could do is put each assigned device into its own domain to prevent them from affecting each other. To that end we could probably break things up so that each device effectively lives in its own PCIe slot in the emulated system. 
Then when we start a migration of the guest the assigned device domains would then have to be tracked for unmap and sync calls when the direction is from the device. I will keep your concerns in mind in the future when I get some time to look at exploring this solution further. - Alex I am thinking about how migration of a passthrough device would work in practice. In your solution, you use a vendor-specific configuration space to negotiate with the guest. If you put each assigned device into its own domain, how can QEMU negotiate with the guest? Would you add the vendor-specific configuration space to every PCI bus to which a passthrough device is assigned? Sincerely Zhou Jie
Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking
TO Alex TO Michael In your solution you add an emulated PCI bridge to act as a bridge between direct assigned devices and the host bridge. Do you mean to put all direct assigned devices on one emulated PCI bridge? If yes, this may bring some problems. We are writing a patchset to support the AER feature in QEMU. When assigning a vfio device with AER enabled, we must check whether the device supports a host bus reset (i.e. hot reset) as this may be used by the guest OS in order to recover the device from an AER error. QEMU must therefore have the ability to perform a physical host bus reset using the existing vfio APIs in response to a virtual bus reset in the VM. A physical bus reset affects all of the devices on the host bus. Therefore all physical devices affected by a bus reset must be configured on the same virtual bus in the VM, and no devices unaffected by the bus reset may be configured on the same virtual bus. http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg02989.html Sincerely, Zhou Jie On 2016/6/7 0:04, Alex Duyck wrote: On Mon, Jun 6, 2016 at 2:18 AM, Zhou Jie wrote: Hi Alex, On 2016/1/6 0:18, Alexander Duyck wrote: On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin wrote: On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote: The two mechanisms referenced above would likely require coordination with QEMU and as such are open to discussion. I haven't attempted to address them as I am not sure there is a consensus as of yet. My personal preference would be to add a vendor-specific configuration block to the emulated pci-bridge interfaces created by QEMU that would allow us to essentially extend shpc to support guest live migration with pass-through devices. shpc? That is kind of what I was thinking. We basically need some mechanism to allow for the host to ask the device to quiesce. It has been proposed to possibly even look at something like an ACPI interface since I know ACPI is used by QEMU to manage hot-plug in the standard case. 
- Alex Start by using hot-unplug for this! Really use your patch guest side, and write host side to allow starting migration with the device, but defer completing it. Yeah, I'm fully on board with this idea, though I'm not really working on this right now since last I knew the folks on this thread from Intel were working on it. My patches were mostly meant to be a nudge in this direction so that we could get away from the driver specific code. I have seen your email about live migration. I summarize the idea you proposed as follows. 1. Extend swiotlb to allow for page-dirtying functionality. 2. Use the PCI Express capability to implement a PCI bridge that acts as a bridge between direct assigned devices and the host bridge. 3. Use an ACPI event or extend the shpc driver to support device pause. Is that right? Will you implement the patches for live migration? That is pretty much the heart of the proposal I had. I submitted an RFC as a proof-of-concept for item 1 in the hopes that someone else might try tackling items 2 and 3 but I haven't seen any updates since then. The trick is to find a way to make it so that item 1 doesn't slow down standard SWIOTLB when you are not migrating a VM. If nothing else we would probably just need to add a static key that we could default to false unless there is a PCI bridge indicating we are starting a migration. I haven't had time to really work on this though. In addition I am not that familiar with QEMU and the internals of live migration so pieces 2 and 3 would take me some additional time to work on. - Alex .
Re: Re: [Qemu-devel] [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking
Hi Alex, On 2016/1/6 0:18, Alexander Duyck wrote: On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin wrote: On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote: The two mechanisms referenced above would likely require coordination with QEMU and as such are open to discussion. I haven't attempted to address them as I am not sure there is a consensus as of yet. My personal preference would be to add a vendor-specific configuration block to the emulated pci-bridge interfaces created by QEMU that would allow us to essentially extend shpc to support guest live migration with pass-through devices. shpc? That is kind of what I was thinking. We basically need some mechanism to allow for the host to ask the device to quiesce. It has been proposed to possibly even look at something like an ACPI interface since I know ACPI is used by QEMU to manage hot-plug in the standard case. - Alex Start by using hot-unplug for this! Really use your patch guest side, and write host side to allow starting migration with the device, but defer completing it. Yeah, I'm fully on board with this idea, though I'm not really working on this right now since last I knew the folks on this thread from Intel were working on it. My patches were mostly meant to be a nudge in this direction so that we could get away from the driver specific code. I have seen your email about live migration. I summarize the idea you proposed as follows. 1. Extend swiotlb to allow for page-dirtying functionality. 2. Use the PCI Express capability to implement a PCI bridge that acts as a bridge between direct assigned devices and the host bridge. 3. Use an ACPI event or extend the shpc driver to support device pause. Is that right? Will you implement the patches for live migration? 
Sincerely, Zhou Jie So 1.- host tells guest to start tracking memory writes 2.- guest acks 3.- migration starts 4.- most memory is migrated 5.- host tells guest to eject device 6.- guest acks 7.- stop vm and migrate rest of state Sounds about right. The only way this differs from what I see as the final solution for this is that instead of fully ejecting the device in step 5 the driver would instead pause the device and give the host something like 10 seconds to stop the VM and resume with the same device connected if it is available. We would probably also need to look at a solution that would force the device to be ejected or abort prior to starting the migration if it doesn't give us the ack in step 2. It will already be a win since hot unplug after migration starts and most memory has been migrated is better than hot unplug before migration starts. Right. Generally the longer the VF can be maintained as a part of the guest the longer the network performance is improved versus using a purely virtual interface. Then measure downtime and profile. Then we can look at ways to quiesce device faster which really means step 5 is replaced with "host tells guest to quiesce device and dirty (or just unmap!) all memory mapped for write by device". Step 5 will be the spot where we really need to start modifying drivers. Specifically we probably need to go through and clean-up things so that we can reduce as many of the delays in the driver suspend/resume path as possible. I suspect there is quite a bit that can be done there that would probably also improve boot and shutdown times since those are also impacted by the devices. - Alex .