Re: [PATCH V5 19/38] vfio-pci: preserve INTx
On 7/2/2025 11:23 AM, Cédric Le Goater wrote:
On 6/10/25 17:39, Steve Sistare wrote:
Preserve vfio INTx state across cpr-transfer. Preserve VFIOINTx fields as
follows:
pin : Recover this from the vfio config in kernel space
interrupt : Preserve its eventfd descriptor across exec.
unmask : Ditto
route.irq : This could perhaps be recovered in vfio_pci_post_load by
calling pci_device_route_intx_to_irq(pin), whose implementation reads
config space for a bridge device such as ich9. However, there is no
guarantee that the bridge vmstate is read before vfio vmstate. Rather
than fiddling with MigrationPriority for vmstate handlers, explicitly
save route.irq in vfio vmstate.
pending : save in vfio vmstate.
mmap_timeout, mmap_timer : Re-initialize
bool kvm_accel : Re-initialize
In vfio_realize, defer calling vfio_intx_enable until the vmstate
is available, in vfio_pci_post_load. Modify vfio_intx_enable and
vfio_intx_enable_kvm to skip vfio initialization, but still perform
kvm initialization.
Signed-off-by: Steve Sistare
---
hw/vfio/cpr.c | 27 ++-
hw/vfio/pci.c | 32
2 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index e467373..fca 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -139,7 +139,11 @@ static int vfio_cpr_pci_post_load(void *opaque, int
version_id)
vfio_cpr_claim_vectors(vdev, nr_vectors, false);
} else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
- g_assert_not_reached(); /* completed in a subsequent patch */
+ Error *local_err = NULL;
+ if (!vfio_pci_intx_enable(vdev, &local_err)) {
+ error_report_err(local_err);
+ return -1;
+ }
}
return 0;
@@ -152,6 +156,26 @@ static bool pci_msix_present(void *opaque, int version_id)
return msix_present(pdev);
}
+static const VMStateDescription vfio_intx_vmstate = {
+ .name = "vfio-cpr-intx",
+ .version_id = 0,
+ .minimum_version_id = 0,
+ .fields = (VMStateField[]) {
+ VMSTATE_BOOL(pending, VFIOINTx),
+ VMSTATE_UINT32(route.mode, VFIOINTx),
+ VMSTATE_INT32(route.irq, VFIOINTx),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+#define VMSTATE_VFIO_INTX(_field, _state) { \
+ .name = (stringify(_field)), \
+ .size = sizeof(VFIOINTx), \
+ .vmsd = &vfio_intx_vmstate, \
+ .flags = VMS_STRUCT, \
+ .offset = vmstate_offset_value(_state, _field, VFIOINTx), \
+}
+
const VMStateDescription vfio_cpr_pci_vmstate = {
.name = "vfio-cpr-pci",
.version_id = 0,
@@ -162,6 +186,7 @@ const VMStateDescription vfio_cpr_pci_vmstate = {
.fields = (VMStateField[]) {
VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present),
+ VMSTATE_VFIO_INTX(intx, VFIOPCIDevice),
VMSTATE_END_OF_LIST()
}
};
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b3dbb84..b52c488 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -161,12 +161,17 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev,
Error **errp)
return true;
}
+ if (cpr_is_incoming()) {
+ goto skip_state;
+ }
+
/* Get to a known interrupt state */
qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
vdev->intx.pending = false;
pci_irq_deassert(&vdev->pdev);
+skip_state:
/* Get an eventfd for resample/unmask */
if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0,
errp)) {
goto fail;
@@ -180,6 +185,10 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev,
Error **errp)
goto fail_irqfd;
}
+ if (cpr_is_incoming()) {
+ goto skip_irq;
+ }
+
if (!vfio_device_irq_set_signaling(&vdev->vbasedev,
VFIO_PCI_INTX_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_UNMASK,
event_notifier_get_fd(&vdev->intx.unmask),
@@ -190,6 +199,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error
**errp)
/* Let'em rip */
vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+skip_irq:
vdev->intx.kvm_accel = true;
Looking closer at the code, I think it would be clearer to introduce a
vfio_cpr_intx_enable_kvm() routine and duplicate some of the code
of vfio_intx_enable_kvm().
OK:
static bool vfio_cpr_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
{
#ifdef CONFIG_KVM
if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
vdev->intx.route.mode != PCI_INTX_ENABLED ||
!kvm_resamplefds_enabled()) {
return true;
}
if (!vfio_notifier_init(vdev, &vdev->intx.
Re: [PATCH V5 19/38] vfio-pci: preserve INTx
On 6/10/25 17:39, Steve Sistare wrote:
Preserve vfio INTx state across cpr-transfer. Preserve VFIOINTx fields as
follows:
pin : Recover this from the vfio config in kernel space
interrupt : Preserve its eventfd descriptor across exec.
unmask : Ditto
route.irq : This could perhaps be recovered in vfio_pci_post_load by
calling pci_device_route_intx_to_irq(pin), whose implementation reads
config space for a bridge device such as ich9. However, there is no
guarantee that the bridge vmstate is read before vfio vmstate. Rather
than fiddling with MigrationPriority for vmstate handlers, explicitly
save route.irq in vfio vmstate.
pending : save in vfio vmstate.
mmap_timeout, mmap_timer : Re-initialize
bool kvm_accel : Re-initialize
In vfio_realize, defer calling vfio_intx_enable until the vmstate
is available, in vfio_pci_post_load. Modify vfio_intx_enable and
vfio_intx_enable_kvm to skip vfio initialization, but still perform
kvm initialization.
Signed-off-by: Steve Sistare
---
hw/vfio/cpr.c | 27 ++-
hw/vfio/pci.c | 32
2 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index e467373..fca 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -139,7 +139,11 @@ static int vfio_cpr_pci_post_load(void *opaque, int
version_id)
vfio_cpr_claim_vectors(vdev, nr_vectors, false);
} else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
-g_assert_not_reached(); /* completed in a subsequent patch */
+Error *local_err = NULL;
+if (!vfio_pci_intx_enable(vdev, &local_err)) {
+error_report_err(local_err);
+return -1;
+}
}
return 0;
@@ -152,6 +156,26 @@ static bool pci_msix_present(void *opaque, int version_id)
return msix_present(pdev);
}
+static const VMStateDescription vfio_intx_vmstate = {
+.name = "vfio-cpr-intx",
+.version_id = 0,
+.minimum_version_id = 0,
+.fields = (VMStateField[]) {
+VMSTATE_BOOL(pending, VFIOINTx),
+VMSTATE_UINT32(route.mode, VFIOINTx),
+VMSTATE_INT32(route.irq, VFIOINTx),
+VMSTATE_END_OF_LIST()
+}
+};
+
+#define VMSTATE_VFIO_INTX(_field, _state) { \
+.name = (stringify(_field)), \
+.size = sizeof(VFIOINTx), \
+.vmsd = &vfio_intx_vmstate, \
+.flags = VMS_STRUCT, \
+.offset = vmstate_offset_value(_state, _field, VFIOINTx), \
+}
+
const VMStateDescription vfio_cpr_pci_vmstate = {
.name = "vfio-cpr-pci",
.version_id = 0,
@@ -162,6 +186,7 @@ const VMStateDescription vfio_cpr_pci_vmstate = {
.fields = (VMStateField[]) {
VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present),
+VMSTATE_VFIO_INTX(intx, VFIOPCIDevice),
VMSTATE_END_OF_LIST()
}
};
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b3dbb84..b52c488 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -161,12 +161,17 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev,
Error **errp)
return true;
}
+if (cpr_is_incoming()) {
+goto skip_state;
+}
+
/* Get to a known interrupt state */
qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
vdev->intx.pending = false;
pci_irq_deassert(&vdev->pdev);
+skip_state:
/* Get an eventfd for resample/unmask */
if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0,
errp)) {
goto fail;
@@ -180,6 +185,10 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev,
Error **errp)
goto fail_irqfd;
}
+if (cpr_is_incoming()) {
+goto skip_irq;
+}
+
if (!vfio_device_irq_set_signaling(&vdev->vbasedev,
VFIO_PCI_INTX_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_UNMASK,
event_notifier_get_fd(&vdev->intx.unmask),
@@ -190,6 +199,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error
**errp)
/* Let'em rip */
vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+skip_irq:
vdev->intx.kvm_accel = true;
Looking closer at the code, I think it would be clearer to introduce a
vfio_cpr_intx_enable_kvm() routine and duplicate some of the code
of vfio_intx_enable_kvm().
trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
@@ -305,7 +315,13 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error
**errp)
return true;
}
-vfio_disable_interrupts(vdev);
+/*
+ * Do not alter interrupt state during vfio_realize and cpr load.
+ * The incoming state is cleared thereafter.
[PATCH V5 19/38] vfio-pci: preserve INTx
Preserve vfio INTx state across cpr-transfer. Preserve VFIOINTx fields as
follows:
pin : Recover this from the vfio config in kernel space
interrupt : Preserve its eventfd descriptor across exec.
unmask : Ditto
route.irq : This could perhaps be recovered in vfio_pci_post_load by
calling pci_device_route_intx_to_irq(pin), whose implementation reads
config space for a bridge device such as ich9. However, there is no
guarantee that the bridge vmstate is read before vfio vmstate. Rather
than fiddling with MigrationPriority for vmstate handlers, explicitly
save route.irq in vfio vmstate.
pending : save in vfio vmstate.
mmap_timeout, mmap_timer : Re-initialize
bool kvm_accel : Re-initialize
In vfio_realize, defer calling vfio_intx_enable until the vmstate
is available, in vfio_pci_post_load. Modify vfio_intx_enable and
vfio_intx_enable_kvm to skip vfio initialization, but still perform
kvm initialization.
Signed-off-by: Steve Sistare
---
hw/vfio/cpr.c | 27 ++-
hw/vfio/pci.c | 32
2 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index e467373..fca 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -139,7 +139,11 @@ static int vfio_cpr_pci_post_load(void *opaque, int
version_id)
vfio_cpr_claim_vectors(vdev, nr_vectors, false);
} else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
-g_assert_not_reached(); /* completed in a subsequent patch */
+Error *local_err = NULL;
+if (!vfio_pci_intx_enable(vdev, &local_err)) {
+error_report_err(local_err);
+return -1;
+}
}
return 0;
@@ -152,6 +156,26 @@ static bool pci_msix_present(void *opaque, int version_id)
return msix_present(pdev);
}
+static const VMStateDescription vfio_intx_vmstate = {
+.name = "vfio-cpr-intx",
+.version_id = 0,
+.minimum_version_id = 0,
+.fields = (VMStateField[]) {
+VMSTATE_BOOL(pending, VFIOINTx),
+VMSTATE_UINT32(route.mode, VFIOINTx),
+VMSTATE_INT32(route.irq, VFIOINTx),
+VMSTATE_END_OF_LIST()
+}
+};
+
+#define VMSTATE_VFIO_INTX(_field, _state) { \
+.name = (stringify(_field)), \
+.size = sizeof(VFIOINTx), \
+.vmsd = &vfio_intx_vmstate, \
+.flags = VMS_STRUCT, \
+.offset = vmstate_offset_value(_state, _field, VFIOINTx), \
+}
+
const VMStateDescription vfio_cpr_pci_vmstate = {
.name = "vfio-cpr-pci",
.version_id = 0,
@@ -162,6 +186,7 @@ const VMStateDescription vfio_cpr_pci_vmstate = {
.fields = (VMStateField[]) {
VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present),
+VMSTATE_VFIO_INTX(intx, VFIOPCIDevice),
VMSTATE_END_OF_LIST()
}
};
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b3dbb84..b52c488 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -161,12 +161,17 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev,
Error **errp)
return true;
}
+if (cpr_is_incoming()) {
+goto skip_state;
+}
+
/* Get to a known interrupt state */
qemu_set_fd_handler(irq_fd, NULL, NULL, vdev);
vfio_device_irq_mask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
vdev->intx.pending = false;
pci_irq_deassert(&vdev->pdev);
+skip_state:
/* Get an eventfd for resample/unmask */
if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp))
{
goto fail;
@@ -180,6 +185,10 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev,
Error **errp)
goto fail_irqfd;
}
+if (cpr_is_incoming()) {
+goto skip_irq;
+}
+
if (!vfio_device_irq_set_signaling(&vdev->vbasedev,
VFIO_PCI_INTX_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_UNMASK,
event_notifier_get_fd(&vdev->intx.unmask),
@@ -190,6 +199,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error
**errp)
/* Let'em rip */
vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
+skip_irq:
vdev->intx.kvm_accel = true;
trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
@@ -305,7 +315,13 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error
**errp)
return true;
}
-vfio_disable_interrupts(vdev);
+/*
+ * Do not alter interrupt state during vfio_realize and cpr load.
+ * The incoming state is cleared thereafter.
+ */
+if (!cpr_is_incoming()) {
+vfio_disable_interrupts(vdev);
+}
vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
pci_config_set_interrupt_pin(vdev->pdev.config, pin);
@@ -328,8 +344,10 @@ static bool vfio_intx_enable(VFIOPCIDev
