Re: [PATCH V3 22/42] vfio-pci: preserve MSI
On 5/28/25 19:44, Steven Sistare wrote:
Hi Cedric,
Do you have any comments on this before I send V4?
Ditto for patch "vfio-pci: preserve INTx".
In both, I made the changes you requested in V2.
And I will change all "reused" tests to cpr_is_incoming as we
discussed elsewhere.
I saw. Thanks for the changes.
You mentioned these possibly conflict with vfio-user, but it would
help to get your stylistic and correctness comments on these from a
cpr-only point of view before I send the next version.
OK. I will try to do that on v4 after the PR (with part 1) is sent.
And as I mentioned, I propose to block CPR when vfio-user is used,
at least initially, so you can ignore vfio-user in the cpr load paths below.
Thanks,
C.
- Steve
On 5/12/2025 11:32 AM, Steve Sistare wrote:
Save the MSI message area as part of vfio-pci vmstate, and preserve the
interrupt and notifier eventfd's. migrate_incoming loads the MSI data,
then the vfio-pci post_load handler finds the eventfds in CPR state,
rebuilds vector data structures, and attaches the interrupts to the new
KVM instance.
Signed-off-by: Steve Sistare
---
hw/vfio/cpr.c | 91 ++
hw/vfio/pci.c | 40 ++--
include/hw/vfio/vfio-cpr.h | 8
3 files changed, 136 insertions(+), 3 deletions(-)
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index 6ea8e9f..be132fa 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -9,6 +9,8 @@
#include "hw/vfio/vfio-device.h"
#include "hw/vfio/vfio-cpr.h"
#include "hw/vfio/pci.h"
+#include "hw/pci/msix.h"
+#include "hw/pci/msi.h"
#include "migration/cpr.h"
#include "qapi/error.h"
#include "system/runstate.h"
@@ -40,6 +42,69 @@ void vfio_cpr_unregister_container(VFIOContainerBase
*bcontainer)
migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
}
+#define STRDUP_VECTOR_FD_NAME(vdev, name) \
+ g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name))
+
+void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr,
+ int fd)
+{
+ g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+ cpr_save_fd(fdname, nr, fd);
+}
+
+int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
+{
+ g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+ return cpr_find_fd(fdname, nr);
+}
+
+void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
+{
+ g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+ cpr_delete_fd(fdname, nr);
+}
+
+static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors,
+ bool msix)
+{
+ int i, fd;
+ bool pending = false;
+ PCIDevice *pdev = &vdev->pdev;
+
+ vdev->nr_vectors = nr_vectors;
+ vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
+ vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
+
+ vfio_prepare_kvm_msi_virq_batch(vdev);
+
+ for (i = 0; i < nr_vectors; i++) {
+ VFIOMSIVector *vector = &vdev->msi_vectors[i];
+
+ fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i);
+ if (fd >= 0) {
+ vfio_vector_init(vdev, i);
+ qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+ }
+
+ if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) {
+ vfio_add_kvm_msi_virq(vdev, vector, i, msix);
+ } else {
+ vdev->msi_vectors[i].virq = -1;
+ }
+
+ if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
+ set_bit(i, vdev->msix->pending);
+ pending = true;
+ }
+ }
+
+ vfio_commit_kvm_msi_virq_batch(vdev);
+
+ if (msix) {
+ memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
+ }
+}
+
/*
* The kernel may change non-emulated config bits. Exclude them from the
* changed-bits check in get_pci_config_device.
@@ -58,13 +123,39 @@ static int vfio_cpr_pci_pre_load(void *opaque)
return 0;
}
+static int vfio_cpr_pci_post_load(void *opaque, int version_id)
+{
+ VFIOPCIDevice *vdev = opaque;
+ PCIDevice *pdev = &vdev->pdev;
+ int nr_vectors;
+
+ if (msix_enabled(pdev)) {
+ msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
+ vfio_msix_vector_release, NULL);
+ nr_vectors = vdev->msix->entries;
+ vfio_cpr_claim_vectors(vdev, nr_vectors, true);
+
+ } else if (msi_enabled(pdev)) {
+ nr_vectors = msi_nr_vectors_allocated(pdev);
+ vfio_cpr_claim_vectors(vdev, nr_vectors, false);
+
+ } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
+ g_assert_not_reached(); /* completed in a subsequent patch */
+ }
+
+ return 0;
+}
+
const VMStateDescription vfio_cpr_pci_vmstate = {
.name = "vfio-cpr-pci",
.version_id = 0,
.minimum_version_id = 0,
.pre_load = vfio_cpr_pci_pre_load,
+ .post_load = vfio_cpr_pci_post_l
Re: [PATCH V3 22/42] vfio-pci: preserve MSI
Hi Cedric,
Do you have any comments on this before I send V4?
Ditto for patch "vfio-pci: preserve INTx".
In both, I made the changes you requested in V2.
And I will change all "reused" tests to cpr_is_incoming as we
discussed elsewhere.
You mentioned these possibly conflict with vfio-user, but it would
help to get your stylistic and correctness comments on these from a
cpr-only point of view before I send the next version.
And as I mentioned, I propose to block CPR when vfio-user is used,
at least initially, so you can ignore vfio-user in the cpr load paths below.
- Steve
On 5/12/2025 11:32 AM, Steve Sistare wrote:
Save the MSI message area as part of vfio-pci vmstate, and preserve the
interrupt and notifier eventfd's. migrate_incoming loads the MSI data,
then the vfio-pci post_load handler finds the eventfds in CPR state,
rebuilds vector data structures, and attaches the interrupts to the new
KVM instance.
Signed-off-by: Steve Sistare
---
hw/vfio/cpr.c | 91 ++
hw/vfio/pci.c | 40 ++--
include/hw/vfio/vfio-cpr.h | 8
3 files changed, 136 insertions(+), 3 deletions(-)
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index 6ea8e9f..be132fa 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -9,6 +9,8 @@
#include "hw/vfio/vfio-device.h"
#include "hw/vfio/vfio-cpr.h"
#include "hw/vfio/pci.h"
+#include "hw/pci/msix.h"
+#include "hw/pci/msi.h"
#include "migration/cpr.h"
#include "qapi/error.h"
#include "system/runstate.h"
@@ -40,6 +42,69 @@ void vfio_cpr_unregister_container(VFIOContainerBase
*bcontainer)
migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
}
+#define STRDUP_VECTOR_FD_NAME(vdev, name) \
+g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name))
+
+void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr,
+ int fd)
+{
+g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+cpr_save_fd(fdname, nr, fd);
+}
+
+int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
+{
+g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+return cpr_find_fd(fdname, nr);
+}
+
+void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
+{
+g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+cpr_delete_fd(fdname, nr);
+}
+
+static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors,
+ bool msix)
+{
+int i, fd;
+bool pending = false;
+PCIDevice *pdev = &vdev->pdev;
+
+vdev->nr_vectors = nr_vectors;
+vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
+vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
+
+vfio_prepare_kvm_msi_virq_batch(vdev);
+
+for (i = 0; i < nr_vectors; i++) {
+VFIOMSIVector *vector = &vdev->msi_vectors[i];
+
+fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i);
+if (fd >= 0) {
+vfio_vector_init(vdev, i);
+qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+}
+
+if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) {
+vfio_add_kvm_msi_virq(vdev, vector, i, msix);
+} else {
+vdev->msi_vectors[i].virq = -1;
+}
+
+if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
+set_bit(i, vdev->msix->pending);
+pending = true;
+}
+}
+
+vfio_commit_kvm_msi_virq_batch(vdev);
+
+if (msix) {
+memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
+}
+}
+
/*
* The kernel may change non-emulated config bits. Exclude them from the
* changed-bits check in get_pci_config_device.
@@ -58,13 +123,39 @@ static int vfio_cpr_pci_pre_load(void *opaque)
return 0;
}
+static int vfio_cpr_pci_post_load(void *opaque, int version_id)
+{
+VFIOPCIDevice *vdev = opaque;
+PCIDevice *pdev = &vdev->pdev;
+int nr_vectors;
+
+if (msix_enabled(pdev)) {
+msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
+ vfio_msix_vector_release, NULL);
+nr_vectors = vdev->msix->entries;
+vfio_cpr_claim_vectors(vdev, nr_vectors, true);
+
+} else if (msi_enabled(pdev)) {
+nr_vectors = msi_nr_vectors_allocated(pdev);
+vfio_cpr_claim_vectors(vdev, nr_vectors, false);
+
+} else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
+g_assert_not_reached(); /* completed in a subsequent patch */
+}
+
+return 0;
+}
+
const VMStateDescription vfio_cpr_pci_vmstate = {
.name = "vfio-cpr-pci",
.version_id = 0,
.minimum_version_id = 0,
.pre_load = vfio_cpr_pci_pre_load,
+.post_load = vfio_cpr_pci_post_load,
.needed = cpr_needed_for_reuse,
.fields = (VMStateField[]) {
+VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
+VMSTATE_MSIX_TEST(pd
